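# -*- coding: utf-8 -*-
# (encoding declaration: this file contains non-ASCII literals and comments)
# Source page: User:R. Hillgentleman/kwokYue.py
#
# //zh-yue.wikipedia.org/w/index.php?title=User:R._Hillgentleman/kwokYue.py&diff=cur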
#kwokYue7.py
#demo: http://zh-yue.wikipedia.org/w/index.php?title=Wikipedia:%E6%B2%99%E7%9B%92&diff=110555&oldid=110553
#Important: text enclosed in book-title marks 《》 or quotation marks ＂＂, 「」, "", “” must be left unconverted.
 
 
import re
import sys

import wikipedia
# ---------------------------------------------------------------------------
# kwokYue: rewrite three written-Chinese function words on one wiki page into
# their Cantonese forms (是 -> 係, 不 -> 唔, 的 -> 嘅).
#
# Text is processed one run of Han characters at a time.  A run that touches
# a quotation mark or book-title mark on either side is left alone, and fixed
# expressions such as 自以為是, 是非, 不過, 目的 or 的確 are protected by the
# look-arounds attached to each rule.
# ---------------------------------------------------------------------------

# Page rewritten by the bot (the sandbox page of the configured site).
PAGE_TITLE = u'wikipedia:沙盒'

# Edit summary.  It names the replacements instead of quoting the regex: in a
# non-raw u'' literal the \uXXXX sequences of a quoted pattern are expanded
# into stray characters.
EDIT_SUMMARY = u'kwokYue7.py 機械人改字：是 -> 係 ; 不 -> 唔 ; 的 -> 嘅'

# Body of a character class matching one Han ideograph: CJK Extension A,
# CJK Unified Ideographs and CJK Compatibility Ideographs.
_HAN = u'\u3400-\u4DB5\u4E00-\u9FA5\uF900-\uFA2D'
if sys.maxunicode > 0xFFFF:
    # Extension B and the Compatibility Supplement lie beyond U+FFFF and need
    # the eight-digit \U escape: "\u20000" is U+2000 followed by the digit 0,
    # which turns "0-\u2A6D" into a range covering ASCII letters and digits.
    # Narrow (UCS-2) builds store these code points as surrogate pairs, which
    # cannot serve as range endpoints, so they are added on wide builds only.
    _HAN += u'\U00020000-\U0002A6D6\U0002F800-\U0002FA1D'

# Marks that protect the adjacent run from conversion.  The straight quotes
# (" and ＂) open and close with the same glyph, so they appear on both sides;
# this errs on the side of leaving text unconverted.
_OPENERS = u'《「“"＂'
_CLOSERS = u'》」”"＂'

# One maximal run of Han characters that is neither preceded by an opening
# mark nor followed by a closing mark.  Putting the Han class itself into both
# look-arounds pins the match to the real run boundaries; without that the
# engine could start or stop in the middle of a run and slip past the quote
# guards.  Each look-behind is a single character class, i.e. fixed width, as
# the re module requires.
_UNQUOTED_RUN = re.compile(
    u'(?<![%s%s])[%s]+(?![%s%s])' % (_HAN, _OPENERS, _HAN, _HAN, _CLOSERS),
    re.U)

# (pattern, replacement) pairs applied in this order inside every run.  The
# look-arounds list the neighbouring characters that mark a fixed expression
# in which the word must be kept.
_RULES = [
    (re.compile(u'(?<![為國於])是(?![但非為故])', re.U), u'係'),
    (re.compile(u'不(?![過屈惜朽治測])', re.U), u'唔'),
    (re.compile(u'(?<![目中麗])的(?![確而])', re.U), u'嘅'),
]


def _convert_run(match):
    """Return the matched Han run with every rule in _RULES applied."""
    run = match.group(0)
    for pattern, replacement in _RULES:
        run = pattern.sub(replacement, run)
    return run


def convert_text(text):
    """Return *text* with 是/不/的 converted to 係/唔/嘅.

    Only the three target characters are replaced; all surrounding text is
    kept.  Han runs that directly follow an opening mark or directly precede
    a closing mark (《》, 「」, “”, "", ＂＂) are returned unchanged.
    """
    return _UNQUOTED_RUN.sub(_convert_run, text)


def main():
    """Fetch PAGE_TITLE, convert its text and save it if anything changed."""
    site = wikipedia.getSite()
    page = wikipedia.Page(site, PAGE_TITLE)
    oldtext = page.get()
    text = convert_text(oldtext)
    if text != oldtext:  # nothing to save when no rule matched
        page.put(text, EDIT_SUMMARY)


if __name__ == '__main__':
    try:
        main()
    finally:
        # Always call stopme(), even when get()/put() raised.
        wikipedia.stopme()
 

####################################################################################################
# Superseded replaceExcept() calls, kept for reference only; they were later
# converted into a rule table processed by a for-loop.
#newtext = wikipedia.replaceExcept(newtext, u'是', u'係',[u'是但',u'是非',u'自以為是',u'國是'], caseInsensitive=True)
#newtext = wikipedia.replaceExcept(newtext, u'不', u'唔',[], caseInsensitive=True)      
#newtext = wikipedia.replaceExcept(newtext, u'的', u'嘅',[u'目的',u'的士',u'一矢中的',u'的確',u'的而且確'], caseInsensitive=True)
#newtext = wikipedia.replaceExcept(newtext, u'們', u'哋',[], caseInsensitive=True)
#newtext = wikipedia.replaceExcept(newtext, u'他', u'佢',[u'其他',u'他山',], caseInsensitive=True)
#newtext = wikipedia.replaceExcept(newtext, u'她', u'佢',[], caseInsensitive=True)
#newtext = wikipedia.replaceExcept(newtext, u'它', u'佢',[], caseInsensitive=True)
#newtext = wikipedia.replaceExcept(newtext, u'您', u'你',[], caseInsensitive=True)
#newtext = wikipedia.replaceExcept(newtext, u'在', u'喺',[u'現在',u'存在',u'所在',], caseInsensitive=True)
#newtext = wikipedia.replaceExcept(newtext, u'今天', u'今日',[], caseInsensitive=True)
#newtext = wikipedia.replaceExcept(newtext, u'明天', u'聽日',[], caseInsensitive=True)
#########################################################################################
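# ''list'' is a list of three-element lists: [oldtext, newtext, exceptions],
# where oldtext and newtext are unicode strings and exceptions is a list of strings.
# Idea considered: ur'[^(自以為|國)]是[^(但|非)]'. Note that [...] is a character
# class, so this excludes single characters (including '(' and '|'), not whole
# words; excluding words needs look-arounds such as (?<!...) and (?!...).
# Use the sub method: r = regex object, r1 = replacement, s = string -> r.sub(r1, s, count=0)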
"""
list=[
 [ur'是', ur'係',[ur'(?<!是)但',u'是非',u'自以為是',u'國是']],
 [u'不', u'唔',[]],
 [u'的', u'嘅',[u'目的',u'的士',u'一矢中的',u'的確',u'的而且確',u'麗的']],
 [u'美麗的',u'好靚嘅',[]],
 [u'們', u'哋',[]],
 [u'他', u'佢',[u'其他',u'他山',u'維他']],
 [u'她', u'佢',[]],
 [u'它', u'佢',[]],
 [u'您', u'你',[]],
 [u'在於',u'喺',[u'現在',u'存在',u'所在',u'此在'] ],
 [u'在', u'喺',[u'現在',u'存在',u'所在',u'在乎',u'此在']],
 [u'今天',u'今日',[]],
 [u'明天',u'聽日',[]],
 [u'衛',u'衞',[]],
 [u'了',u'咗',[u'了解',u'了結',u'了了',u'未了',u'不了']],
 [u'也',u'都',[]],
 [u'這',u'呢',[]],
 [u'此',u'呢',[]],
 [u'牠',u'佢',[]],
 [u'沒',u'無',[u'沉沒',u'覆沒',u'沒頂',u'沒入',u'沒落']],
 [u'没',u'無',[u'沉没',u'覆没',u'没頂',u'没入',u'没落']],
 [u'及',u'同',[u'埃及',u'及時',u'及第']],
 [u'與',u'同',[u'參與',u'與會',u'與共']],
 [u'和',u'同',[u'共和',u'和平',u'大和',u'昭和',u'和坤',u'和氣',u'和顏',u'和聲',u'和弦',u'和絃',u'調和',u'和樂',u'和好',u'協和',u'唱和',u'一和',u'和諧']],
 [u'並',u'重',[u'並且',u'並蒂',u'並重']],
 [u'吃',u'食',[]],
 [u'些',u'啲',[]],
 [u'看',u'睇',[u'看更',u'看守',u'看家',u'看門']],
 [u'那裏',u'嗰度',[]],
 [u'那兒',u'嗰度',[]],
 [u'哪裏',u'邊度',[]],
 [u'那',u'嗰',[u'那麼']],
 [u'這裏',u'呢度',[]],
 [u'這兒',u'呢度',[]],
 [u'從',u'由',[u'從人',u'隨從',u'服從',u'順從',u'從心']],
 [u'逝世',u'死',[]],
 [u'般',u'樣',[]],
 [u'這樣',u'咁樣',[]],
 [u'衛',u'衞',[]],
 [u'說',u'講',[u'小說',u'說文',u'說書',u'解說',u'說明']],
 [u'像',u'似',[u'成像',u'影像',u'圖像',u'印像']],
] 
k=[]
for i,j,k in list:
  print i,j,k
  ######################################################################
  # need to write from scratch the regex 
  # text = wikipedia.replaceExcept(text, i,j,k, caseInsensitive=True)
  ##################################################################### 
page.put(newtext,u'機械人轉字：國語,[[user:hillgentleman1]]')
 
wikipedia.output(text)
wikipedia.stopme()
 
""" 
 
#################################################################################### 
"""
some crap
import wikipedia #importing the wikipedia.py module
site=wikipedia.getSite()  # setting the site, from configuration
pg = wikipedia.Page(site, 'template:copyvio') #creating the Page object in question
x=pg.contributingUsers()  #getting the contributing users
"""