Erlang实战建立文本索引
为文本建立索引是文本信息处理中的一项重要任务:给定一个由英文单词构成的文件,为文件中的所有单词建立索引,记录每个单词出现的行号以及在每行中出现的次数,并将索引存入一个文件。在Erlang实战练习(六)中我曾指出,当时建立文本索引的方式过于粗糙:一是使用了应当尽量规避的进程字典;二是分词使用的是正则表达式,不够灵活。本文将改进以前建立文本索引的方式,使用ETS来存储单词及其索引列表,同时使用Erlang提供的string:tokens函数来拆分单词,更加灵活,可移植性也更好。
word_index.erl文件的总体结构如下:
-module(word_index).
-export([start/2]).
-import(re, [run/2,replace/4]).
-import(string,[substr/3]).
%% start/2 builds a word index for a text file and saves it to another file.
%%   FileIn  - path of the text file to index.
%%   FileOut - path of the file the index is written to.
%% Returns ok (the result of the last io:format call) on both paths.
start(FileIn, FileOut) ->
    %% Open FileIn read-only; the mode is passed as a list, the documented form.
    case file:open(FileIn, [read]) of
        {ok, Device} ->
            %% readFile/2 returns the text as a list of {Line, LineNo} tuples
            %% and closes Device when it reaches end of file.
            LineList = readFile(Device, 0),
            %% ets:new/2 creates a key/value lookup table named index of type
            %% ordered_set; it stores the key-to-value mapping tuples.
            TableID = ets:new(index, [ordered_set]),
            try
                %% Build the word index for every line and write it to FileOut.
                index(FileOut, LineList, TableID)
            after
                %% Release the table so repeated calls do not leak ETS tables.
                ets:delete(TableID)
            end;
        {error, _Reason} ->
            io:format("Open file error: file doesn't exist!")
    end.
%% readFile/2 is implemented as follows:
%% readFile/2 reads every line from the open device S and returns a list of
%% {Line, LineNo} tuples in file order. Numbering continues from LineNo, so
%% the first line read is numbered LineNo + 1. S is closed before returning.
readFile(S, LineNo) ->
    readFile(S, LineNo, []).

%% readFile/3 carries the tuples collected so far in Ret, newest first.
readFile(S, LineNo, Ret) ->
    %% Read one line from the file.
    case io:get_line(S, '') of
        eof ->
            io:format("Read file EOF!"),
            file:close(S),
            lists:reverse(Ret);
        {error, Reason} ->
            %% Without this clause the error tuple would be stored as if it
            %% were a line of text and reading would simply continue.
            file:close(S),
            error({read_error, Reason});
        OneLine ->
            UpdateLineNo = LineNo + 1,
            readFile(S, UpdateLineNo, [{OneLine, UpdateLineNo} | Ret])
    end.
%% index/3 is implemented as follows:
%% index/3 feeds every {Line, LineNo} tuple of LineList to processOneLine/2,
%% which records its words in the ETS table TableID. Once the list is
%% exhausted it prints the table contents, writes them to File with
%% writeToFile/2 and returns ok (the result of the final io:format/1).
%% The list is consumed by pattern matching on its head, so each step is
%% constant time instead of calling length/1 and lists:delete/2 per line.
index(File, [], TableID) ->
    ToList = ets:tab2list(TableID),
    io:format("index is:~n~p~n", [ToList]),
    writeToFile(File, ToList),
    io:format("create index success! ");
index(File, [First | Rest], TableID) ->
    processOneLine(First, TableID),
    index(File, Rest, TableID).
%% processOneLine/2 handles one line of text.
%%   OneLine - a {Text, LineNo} tuple as produced by readFile/2.
%%   TableID - the ETS table that holds the index.
%% The text is split into words on newline, carriage return, tab and space;
%% "\r" is included so CRLF-terminated lines do not yield "word\r" tokens.
%% Returns the result of matchWords/3.
processOneLine({Element, LineNo}, TableID) ->
    Words = string:tokens(Element, "\r\n\t "),
    matchWords(Words, LineNo, TableID).
%% matchWords/3 records every word of one line in the ETS table TableID.
%% Table entries have the form {Word, [{LineNo, Count}, ...]}.
%% Returns ok (the result of io:format/2) once all words are processed.
matchWords([], LineNo, _TableID) ->
    io:format("process line(~p) success!~n", [LineNo]);
matchWords([Word | Rest], LineNo, TableID) ->
    %% ets:lookup/2 returns the list of tuples whose key matches Word.
    case ets:lookup(TableID, Word) of
        [] ->
            %% Word is not indexed yet: insert a fresh entry for it.
            ets:insert(TableID, {Word, [{LineNo, 1}]});
        [{_Key, Value} | _] ->
            %% Word is already indexed: update its occurrence list.
            ets:insert(TableID, {Word, insertRec(Value, LineNo)})
    end,
    matchWords(Rest, LineNo, TableID).
%% insertRec/2 updates a list of {LineNo, Count} tuples for one more
%% occurrence on line LineNo.
%%   - If List already has an entry for LineNo, that entry is removed and
%%     re-added at the front with Count + 1.
%%   - Otherwise {LineNo, 1} is added at the front.
%% lists:keytake/3 replaces the former index-based backwards scan; the result
%% is the same as long as List holds at most one entry per line number.
insertRec(List, LineNo) ->
    case lists:keytake(LineNo, 1, List) of
        {value, {_LN, Num}, Rest} ->
            [{LineNo, Num + 1} | Rest];
        false ->
            [{LineNo, 1} | List]
    end.
%% writeToFile/2 writes the index to a file.
%%   File   - path of the output file; it is opened in write mode.
%%   ToList - list of index entries; each one is printed with ~p followed by
%%            a period and a newline, one term per line.
%% Crashes with badmatch if the file cannot be opened.
%% Returns the result of file:close/1.
writeToFile(File, ToList) ->
    %% The mode is passed as a list, the documented form for file:open/2.
    {ok, S} = file:open(File, [write]),
    lists:foreach(fun(X) -> io:format(S, "~p.~n", [X]) end, ToList),
    file:close(S).
%% Related recommendations:
chenpro 2020-08-09
NVEFLY 2020-07-04
liym 2020-06-21
OnMyHeart 2020-06-06
天空windy 2020-06-03
87447007 2020-05-16
OnMyHeart 2020-05-09
NVEFLY 2020-04-17
M守护神 2020-03-28
大史哥哥 2020-03-07
wbingyang 2020-02-27
liym 2020-02-22
zhoucheng0 2020-02-19
wbingyang 2020-02-14
OnMyHeart 2020-01-14
OnMyHeart 2020-01-08
大史哥哥 2019-12-31
wbingyang 2019-12-31