利用百度构造拼音词表[python]
做中文查询纠错,需要一个拼音到中文词汇的对应表。网上找不到现成的,只好利用baidu来构建一个。
原理:
难点是多音字。输入一个词,根据拼音汉字表,可以得到其所有发音,比如“银行”的发音可能是“yinhang”或“yinxing”,哪一个才是对的呢?
发送GET请求到baidu.com:
"http://www.baidu.com/s?bs=zhongguo&f=8&wd=yinhang"
从搜索结果可以得到,
您要找的是不是:银行,
百度一下,找到相关网页约170,000篇,用时0.070秒
这样就可以知道“yinhang”是“银行”正确的发音。
步骤
1.利用拼音汉字表,构造汉字->拼音hash表。
2.对词表的每一个词汇,得到所有的拼音候选。如果只有一个,它就是结果;
如果有多个发音,到步骤3。
3.发送GET请求到baidu.com,"http://www.baidu.com/s?bs=zhongguo&f=8&wd=*****"(候选拼音)
4.分析返回结果,得到正确的发音。
注意:由于发送GET请求到baidu太慢,采用200个线程并发处理。
下面是python代码
#!/usr/bin/python
import htmllib
import httplib
import itertools
import logging
import re
import StringIO
import sys
import threading
import urllib
import urlparse
from httplib import InvalidURL
from os.path import getsize, join
from urllib2 import HTTPError, Request, URLError, urlopen

import chardet
# Verbosity flag. None of the functions visible in this file read it.
verbose=False
# Encoding normalisation.
def transcode(s):
    """Return *s* as a UTF-8 encoded byte string, on a best-effort basis.

    ``unicode`` input is encoded to UTF-8 directly.  Byte strings are run
    through ``chardet``; when a GB-family encoding (GB2312, GBK or GB18030)
    is reported they are decoded as GB18030 (a superset of the other two)
    and re-encoded as UTF-8.  Any other input, and any input that cannot be
    detected or decoded, is returned unchanged.

    Args:
        s: a ``str`` (bytes) or ``unicode`` value.

    Returns:
        A UTF-8 byte string when conversion applied, otherwise *s* itself.
    """
    if isinstance(s, unicode):
        # Handle text before detection: chardet is meant for byte strings,
        # and a failure there must not leave the text unencoded.
        return s.encode('utf-8')
    try:
        detected = chardet.detect(s)["encoding"]
        # The reported name may be None, and its letter case is not something
        # to rely on, so compare case-insensitively.
        if detected and detected.upper() in ("GB2312", "GBK", "GB18030"):
            return unicode(s, "gb18030").encode("utf-8")
    except (TypeError, ValueError):
        # Best effort: UnicodeDecodeError is a ValueError; undecodable or
        # unsupported input is handed back untouched.
        pass
    return s
# Maps one Chinese character to the collection of its candidate pinyin
# readings.  Characters without an entry are used verbatim by
# generate_pinyin_canadiate().
pinyin_map = {}


# Build the character -> pinyin table.  The loader was omitted from the
# original listing because the format of the pinyin table varies.
def generate_pinyin_map():
    """Populate the module-level ``pinyin_map``.

    Placeholder: the original listing left the body out, which made the
    ``def`` a syntax error.  Until a loader for the pinyin/character table
    at hand is written here, this function does nothing and ``pinyin_map``
    stays empty.

    TODO: read the pinyin table and fill ``pinyin_map`` so that
    ``pinyin_map[char]`` is a list of pinyin strings for ``char``.
    """
def generate_synonym_worker(input_file, out_file, idx):
    """Process one input shard and write its results to the matching output shard.

    Reads ``<input_file>_<idx>`` line by line (UTF-8), takes the first
    whitespace-separated field of each line as the word, and passes it with
    the open output file ``<out_file>_<idx>`` to
    ``generate_pinyin_canadiate``.

    Args:
        input_file: base path of the input shards.
        out_file: base path of the output shards.
        idx: shard number handled by this worker.

    A failure on one line is logged and the worker moves on to the next
    line; it never aborts the whole shard.
    """
    shard_in = input_file + "_" + str(idx)
    shard_out = out_file + "_" + str(idx)
    with open(shard_in, 'r') as source:
        with open(shard_out, 'w') as sink:
            for raw_line in source:
                try:
                    # rstrip only the line terminator: slicing off the last
                    # character would eat a real character on a final line
                    # that has no trailing newline.
                    line = unicode(raw_line, 'utf-8').rstrip(u'\r\n')
                    # The original separator pattern lost its whitespace and
                    # became the invalid regex '+'; split on whitespace.
                    fields = line.split()
                    if not fields:
                        continue  # blank line: nothing to look up
                    generate_pinyin_canadiate(fields[0], sink)
                except Exception:
                    # Deliberately best-effort per line, but leave a trace
                    # instead of swallowing the error silently.
                    logging.exception("skipping line %r of %s",
                                      raw_line, shard_in)
# Number of worker threads, and therefore of input/output shard files.
num_worker = 200


# "Map" step.
def split(input_file):
    """Distribute the lines of *input_file* over ``num_worker`` shard files.

    Shard ``i`` is written to ``<input_file>_<i>``.  Lines are dealt out
    round-robin: the 1-based line number modulo ``num_worker`` picks the
    shard, so the first line lands in shard 1.  Every shard file is created
    (and truncated) even when it receives no lines.

    Args:
        input_file: path of the word list to split.
    """
    shards = []
    try:
        for shard_no in range(num_worker):
            shards.append(open(input_file + "_" + str(shard_no), 'w'))
        with open(input_file, 'r') as source:
            for line_no, line in enumerate(source, 1):
                shards[line_no % num_worker].write(line)
    finally:
        # Close whatever was opened, even if opening or writing failed
        # part-way through.
        for shard in shards:
            shard.close()
# "Reduce" step.
def merge(out_file):
    """Concatenate the shard outputs ``<out_file>_<i>`` into *out_file*.

    Shards are appended in index order, ``0 .. num_worker - 1``.  *out_file*
    is truncated first.

    Args:
        out_file: path of the merged result; also the base name of the shards.

    Raises:
        IOError: if a shard file cannot be opened.
    """
    # ``with`` guarantees the merged file is closed; the original referenced
    # ``outf.close`` without calling it.
    with open(out_file, 'w') as merged:
        for shard_no in range(num_worker):
            with open(out_file + "_" + str(shard_no), 'r') as shard:
                for line in shard:
                    merged.write(line)
# Produce the pinyin word table.
def generate_synonym(input_file, out_file):
    """Shard the input, run one worker thread per shard, then merge the results.

    Args:
        input_file: path of the word list.
        out_file: path of the merged output file.
    """
    split(input_file)
    workers = [
        threading.Thread(target=generate_synonym_worker,
                         args=(input_file, out_file, shard_no))
        for shard_no in range(num_worker)
    ]
    # Launch every worker before waiting on any of them.
    for worker in workers:
        worker.start()
    # Wait until all workers have finished.
    for worker in workers:
        worker.join()
    merge(out_file)
# Enumerate every combination of pinyin readings.
def listlist_enumerate(listlist, count):
    """Return the Cartesian product of the first *count* entries of *listlist*.

    Each combination is a list holding one item from each of
    ``listlist[0] .. listlist[count - 1]``, in that order; combinations are
    produced with the last position varying fastest.

    >>> listlist_enumerate([["yin"], ["hang", "xing"]], 2)
    [['yin', 'hang'], ['yin', 'xing']]

    Args:
        listlist: sequence of iterables (one per character).
        count: how many leading entries of *listlist* to combine.  A value
            larger than ``len(listlist)`` is treated as ``len(listlist)``.

    Returns:
        A list of lists; empty when ``count <= 0`` or when any of the
        combined entries is empty.
    """
    if count <= 0:
        return []
    return [list(combo) for combo in itertools.product(*listlist[:count])]
# ASCII control characters (0x00-0x1f); compiled once rather than per call.
_CONTROL_CHARS = re.compile(r'[\x00-\x1f]')


def makeutf8(s):
    """Strip ASCII control characters from *s* and return it UTF-8 encoded.

    Every character in the range 0x00-0x1f (including tab, CR and LF) is
    removed.  Text input is then encoded to UTF-8; input that is already a
    byte string is returned as a byte string without re-encoding.

    Args:
        s: text or byte string.

    Returns:
        The cleaned value as a byte string.
    """
    cleaned = _CONTROL_CHARS.sub('', s)
    # On Python 2 ``bytes`` is ``str``, so this is the same check as
    # "is it unicode?" while not depending on the ``unicode`` name.
    if not isinstance(cleaned, bytes):
        cleaned = cleaned.encode('utf-8')
    return cleaned
# Matches the suggestion link on a Baidu result page and captures the
# URL-quoted suggested query.  Non-greedy, so the capture stops at the first
# "&f=12&oq=" instead of running on to the last one in the page.
# NOTE(review): the markup is taken from the original listing; confirm it
# against a current result page before relying on it.
_SUGGEST_RE = re.compile(r'<a href="s\?wd=(.*?)&f=12&oq=', re.I | re.S)


def generate_pinyin_canadiate(chinese_word, fd):
    """Find the pinyin spelling of *chinese_word* that Baidu maps back to it.

    Every combination of per-character readings from ``pinyin_map`` is tried
    (a character with no entry, or an empty entry, stands for itself).  Each
    lower-cased candidate is searched on Baidu; when the result page carries
    a suggestion link, the line ``<candidate> <suggestion>`` is written to
    *fd* as UTF-8 and flushed.

    Args:
        chinese_word: the word to resolve, as a ``unicode`` string.
        fd: writable file object receiving one line per suggestion found.

    Returns:
        The first candidate whose suggestion equals *chinese_word*, or
        ``None`` when no candidate is confirmed.
    """
    readings = []
    for uchar in chinese_word:
        options = pinyin_map.get(uchar)
        readings.append(options if options else uchar)

    for parts in listlist_enumerate(readings, len(readings)):
        canadiate = ''.join(parts).lower()
        # Diagnostics go through logging: ``print`` of a unicode value raises
        # UnicodeEncodeError when stdout cannot encode it, which previously
        # could abort a confirmed hit before it was returned.
        logging.debug("trying candidate %r", canadiate)
        try:
            query = urllib.urlencode({'wd': canadiate})
        except UnicodeError:
            # The candidate still contains a character that could not be
            # turned into pinyin; skip it rather than send an empty query.
            continue
        url = 'http://www.baidu.com/s?%s' % query
        try:
            response = urllib.urlopen(url)
            try:
                page = response.read()
            finally:
                response.close()
        except IOError:
            # One failed request should not discard the remaining candidates.
            logging.warning("request failed: %s", url)
            continue
        found = _SUGGEST_RE.search(transcode(page))
        if not found:
            continue
        try:
            suggestion = unicode(urllib.unquote(found.group(1)), 'gb18030')
            # Encode both halves before joining: mixing unicode with UTF-8
            # bytes triggers an implicit ASCII decode that fails on Chinese.
            # NOTE(review): the separator was lost in the original listing;
            # a single space is assumed.
            fd.write(makeutf8(canadiate) + " " + makeutf8(suggestion) + "\n")
            fd.flush()
        except (UnicodeError, IOError):
            logging.exception("could not record suggestion for %r", canadiate)
            continue
        if suggestion == chinese_word:
            logging.info("one hit: %r", canadiate)
            return canadiate
    return None
def run(input_file, out_file):
    """Load the pinyin table, then build the pinyin word list.

    Args:
        input_file: path of the input word list.
        out_file: path of the output file.
    """
    generate_pinyin_map()
    generate_synonym(input_file, out_file)
# Script entry point: expects the input word list and the output file path.
if __name__ == "__main__":
    args = sys.argv[1:]
    if len(args) < 2:
        # Parenthesised single argument prints identically on Python 2.
        print("Usage: ./synonym.py <input_wordlist> <output_file>")
    else:
        run(args[0], args[1])