利用百度构造拼音词表[python]

做中文查询纠错,需要一个拼音到中文词汇的对应表。网上找不到现成的,只好利用百度(baidu)来构建一个。

原理:

难点是多音字。输入一个词,根据拼音汉字表,可以得到其所有可能的发音,比如“银行”的发音可能是“yinhang”,也可能是“yinxing”,哪一个才是对的呢?

发送 GET 请求到 baidu.com:

"http://www.baidu.com/s?bs=zhongguo&f=8&wd=yinhang"

从搜索结果可以得到,

您要找的是不是:银行,

百度一下,找到相关网页约170,000篇,用时0.070秒

这样就可以知道“yinhang”是“银行”正确的发音。

步骤

1.利用拼音汉字表,构造汉字->拼音hash表。

2.对词表的每一个词汇,得到所有的拼音候选。如果只有一个候选,它就是结果。

如果有多个发音,到步骤3。

3.发送 GET 请求到 baidu.com,"http://www.baidu.com/s?bs=zhongguo&f=8&wd=*****"(***** 为候选拼音)

4.分析返回结果,得到正确的发音。

注意,由于发送 GET 请求到 baidu 太慢,这里采用 200 个线程并行处理。

下面是python代码

#!/usr/bin/python

import htmllib
import httplib
import itertools
import logging
import re
import StringIO
import sys
import threading
import urllib
import urlparse
from httplib import InvalidURL
from os.path import join, getsize
from urllib2 import Request, HTTPError, URLError, urlopen

import chardet

# Verbosity switch; nothing in the visible code reads it.
verbose=False

# Encoding normalisation
def transcode(s):
    """Return *s* as UTF-8 encoded bytes on a best-effort basis.

    Text input is encoded straight to UTF-8.  Byte input is passed to
    ``chardet``; when the detected encoding is a GB variant the bytes are
    decoded as GB18030 and re-encoded as UTF-8.  In every other case,
    including a failed detection or conversion, *s* is returned unchanged.

    Args:
        s: A text or byte string.

    Returns:
        A UTF-8 byte string, or *s* itself when no conversion applies.
    """
    # ``type(u"")`` is the text type; handle it first so that text never has
    # to go through byte-level encoding detection.
    if isinstance(s, type(u"")):
        return s.encode("utf-8")
    try:
        detected = chardet.detect(s)["encoding"] or ""
        # Compare case-insensitively: the detector's spelling of an encoding
        # name is not something this code should depend on.
        if detected.lower() in ("gb2312", "gbk", "gb18030"):
            return s.decode("gb18030").encode("utf-8")
    except (UnicodeError, TypeError, ValueError):
        # Deliberate best effort: undecodable or unsupported input is
        # handed back as-is rather than aborting the caller.
        pass
    return s

# Lookup table keyed by a single character; generate_pinyin_canadiate treats
# each value as the sequence of candidate pinyin readings for that character.
pinyin_map = dict()


# Build the character -> pinyin table.  The source article leaves the
# implementation out because the format of the pinyin table varies.
def generate_pinyin_map():
    """Populate the module-level ``pinyin_map``.

    This is a placeholder: the loading code is omitted in the source
    because it depends on the format of the pinyin table being used.
    Until it is filled in, ``pinyin_map`` stays empty.
    """
    # TODO: parse the pinyin table and fill ``pinyin_map``.

def generate_synonym_worker(input_file, out_file, idx):
    """Process one shard of the word list.

    Reads ``<input_file>_<idx>`` as UTF-8, one entry per line, and passes
    the first whitespace-separated field of every non-blank line, together
    with the open ``<out_file>_<idx>`` handle, to
    ``generate_pinyin_canadiate``.

    Args:
        input_file: Base path of the input word list.
        out_file: Base path of the output file.
        idx: Shard number appended to both base paths.
    """
    shard_in = input_file + "_" + str(idx)
    shard_out = out_file + "_" + str(idx)
    with open(shard_in, 'rb') as f:
        with open(shard_out, 'w+') as of:
            for raw_line in f:
                try:
                    # split() drops the line terminator and any run of
                    # whitespace, so a final line without a newline keeps
                    # its last character.
                    words = raw_line.decode('utf-8').split()
                    if not words:
                        continue
                    generate_pinyin_canadiate(words[0], of)
                except Exception:
                    # One bad line (undecodable text, failed request, ...)
                    # must not stop the rest of the shard, but it should
                    # leave a trace instead of vanishing silently.
                    logging.exception(
                        "failed to process %r from %s", raw_line, shard_in)

# Number of worker threads, and therefore of input/output shard files.

num_worker=200

# map
def split(input_file, workers=None):
    """Distribute the lines of *input_file* round-robin over shard files.

    Shard ``k`` is written to ``<input_file>_<k>``.  Line number ``i``
    (counting from 1) goes to shard ``i % workers``, so the first line
    lands in shard 1 and shard 0 receives every ``workers``-th line.

    Args:
        input_file: Path of the file to split.
        workers: Number of shards to create; defaults to the module-level
            ``num_worker``.
    """
    if workers is None:
        workers = num_worker
    # Bytes are copied verbatim, so no decoding is needed here.
    with open(input_file, 'rb') as source:
        shards = []
        try:
            for idx in range(workers):
                shards.append(open(input_file + "_" + str(idx), 'wb'))
            for line_no, line in enumerate(source, 1):
                shards[line_no % workers].write(line)
        finally:
            # Close whatever was opened, even if a write failed part-way.
            for shard in shards:
                shard.close()

# reduce
def merge(out_file, workers=None):
    """Concatenate the shard outputs into *out_file*.

    Shards ``<out_file>_0`` .. ``<out_file>_<workers-1>`` are appended in
    numeric order; *out_file* is truncated first.

    Args:
        out_file: Path of the merged output file.
        workers: Number of shards to read; defaults to the module-level
            ``num_worker``.
    """
    if workers is None:
        workers = num_worker
    # The with-blocks guarantee both the output and every shard are closed.
    with open(out_file, 'wb') as outf:
        for idx in range(workers):
            with open(out_file + "_" + str(idx), 'rb') as shard:
                for line in shard:
                    outf.write(line)

# Build the pinyin vocabulary
def generate_synonym(input_file, out_file):
    """Run the whole pipeline: split, process shards in threads, merge.

    Args:
        input_file: Path of the input word list.
        out_file: Path of the merged output file.
    """
    split(input_file)
    thread_pool = [
        threading.Thread(target=generate_synonym_worker,
                         args=(input_file, out_file, i))
        for i in range(num_worker)
    ]
    # Start threads one by one.
    for th in thread_pool:
        th.start()
    # Wait for every worker before merging their shard outputs.
    for th in thread_pool:
        th.join()
    merge(out_file)

# Enumerate every pinyin combination
def listlist_enumerate(listlist, count):
    """Return the Cartesian product of the first *count* entries of *listlist*.

    Each combination is a list holding one item from each of
    ``listlist[0] .. listlist[count - 1]``; the last position varies
    fastest.  Entries may be any iterable, so a plain string contributes
    its characters.

    Args:
        listlist: Sequence of iterables to combine.
        count: How many leading entries of *listlist* to use.

    Returns:
        A list of lists; empty when ``count <= 0`` or any used entry is
        empty.

    Raises:
        IndexError: If *count* exceeds ``len(listlist)``.
    """
    if count <= 0:
        return []
    # Index explicitly (rather than slicing) so that asking for more
    # entries than exist still fails loudly.
    pools = [listlist[i] for i in range(count)]
    return [list(combo) for combo in itertools.product(*pools)]

# Control characters 0x00-0x1f, as a text pattern and as raw bytes.  Built
# once at import time instead of on every call.
_CONTROL_CHARS = re.compile(r'[\x00-\x1f]')
_CONTROL_BYTES = bytes(bytearray(range(32)))


def makeutf8(s):
    """Strip control characters from *s* and return it as UTF-8 bytes.

    Every character in the range 0x00-0x1f (including tab, CR and LF) is
    removed.  Text input is then encoded as UTF-8; byte input is returned
    as bytes without any re-encoding.

    Args:
        s: A text or byte string.

    Returns:
        A byte string free of control characters.
    """
    if isinstance(s, type(u"")):
        return _CONTROL_CHARS.sub(u"", s).encode('utf-8')
    # Byte input: delete the same 32 code points without decoding.
    return s.translate(None, _CONTROL_BYTES)

# Captures the value of ``wd`` in the first suggestion link of a result
# page.  The group is non-greedy so it stops at the nearest ``&f=12&oq=``
# instead of swallowing everything up to the last one on the page.
_SUGGESTION_LINK = re.compile(r'<a href="s\?wd=(.*?)&f=12&oq=', re.I | re.S)


def generate_pinyin_canadiate(chinese_word, fd):
    """Find which pinyin spelling of *chinese_word* Baidu maps back to it.

    Every combination of per-character readings from ``pinyin_map`` is
    sent to Baidu as a query.  Whenever the result page carries a
    suggestion link, a line ``<candidate> <suggestion>`` is written to
    *fd*; the search stops at the first candidate whose suggestion equals
    *chinese_word*.

    Args:
        chinese_word: The word to look up, as text.
        fd: Writable file object receiving one line per suggestion found.

    Returns:
        The matching pinyin candidate, or ``None`` if no candidate's
        suggestion equals *chinese_word*.

    Raises:
        IOError: If a request to Baidu or a write to *fd* fails.
    """
    # A character without readings stands for itself in the combinations.
    choices = [pinyin_map.get(uchar) or uchar for uchar in chinese_word]
    for parts in listlist_enumerate(choices, len(choices)):
        candidate = ''.join(parts).lower()
        try:
            query = urllib.urlencode({'wd': candidate})
        except UnicodeError:
            # The candidate still contains a character that cannot be put
            # in the query string; an empty query would be pointless.
            logging.debug("skipping unencodable candidate %r", candidate)
            continue
        url = 'http://www.baidu.com/s?%s' % query
        response = urllib.urlopen(url)
        try:
            search_results = transcode(response.read())
        finally:
            response.close()
        logging.debug("queried %s for %r", url, candidate)

        found = _SUGGESTION_LINK.search(search_results)
        if found is None:
            continue
        try:
            # The link value is percent-encoded GB text.
            suggestion = urllib.unquote(found.group(1)).decode('gb18030')
        except UnicodeError:
            logging.debug("undecodable suggestion for %r", candidate)
            continue

        # Encode both halves before joining so text and UTF-8 bytes are
        # never mixed.  NOTE(review): the single-space separator is assumed;
        # confirm it matches what consumers of the output expect.
        fd.write(makeutf8(candidate) + " " + makeutf8(suggestion) + "\n")
        fd.flush()
        if suggestion == chinese_word:
            logging.info("one hit: %r %r", candidate, chinese_word)
            return candidate
    return None

def run(input_file, out_file):
    """Build the pinyin table, then generate the pinyin vocabulary.

    Args:
        input_file: Path of the input word list.
        out_file: Path of the output file.
    """
    generate_pinyin_map()
    generate_synonym(input_file, out_file)

# Script entry point: expects the input word list and the output file.
if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) < 2:
        # sys.exit with a message prints it to stderr and exits non-zero,
        # so a missing argument is visible to calling scripts.
        sys.exit("Usage: ./synonym.py <input_wordlist> <output_file>")
    run(args[0], args[1])