行文件分组统计
有些情况下,对于一个结构化的以行为记录的文本文件,需要按列分组统计,如果数据量小,可以直接导入数据库中,但是当文件很大时,导入数据库不太现实,本程序即实现非数据库条件下,按任意列分组统计行数功能;文件只读一次,按任意分组方式查询。
基本思路:
1.根据指定的列名,构建一棵多叉树,树的高度即为可以分组的条件列数
2.存储树中,各层节点名(列名)按字典顺序升序排列
3.查询时,根据指定的列名,找到对应的树节点,将其中value值累加返回
以下为第一个初级版本,欢迎指点!!
树节点:
package org.jf.sta; import java.util.ArrayList; import java.util.List; public class SegNode { private String name; private String id; private int value; private List<SegNode> childList; public SegNode() { childList = new ArrayList<SegNode>(); } public SegNode(String name,String id) { this(); this.name = name; this.id = id; // childList = new ArrayList<SegNode>(); } public String getName() { return name; } public String getId() { return id; } public int getValue() { return value; } public void setValue(int value) { this.value = value; } public void addValue(int increment) { this.value += increment; } public List<SegNode> getChildList() { return this.childList; } public void addChild(SegNode node) { this.childList.add(node); } public String toXml() { String s="<"+name+" id=\""+this.id+"\" value=\""+this.value+"\">\n"; for(int i=0;i<this.childList.size();i++) { s = s+childList.get(i).toXml(" "); } s+="</"+name+">\n"; return s; } public String toXml(String blank) { String s=""; if(childList.size()==0) { s=blank+"<"+name+" id=\""+this.id+"\" value=\""+this.value+"\"/>\n"; }else { s=blank+"<"+name+" id=\""+this.id+"\" value=\""+this.value+"\">\n"; for(int i=0;i<this.childList.size();i++) { s = s+blank+childList.get(i).toXml(blank+" "); } s+=blank+"</"+name+">\n"; } return s; } }
统计树
package org.jf.sta; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.ArrayList; import java.util.Arrays; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Set; /** * 行文件统计工具类 * 行文件 :每行一条记录 字段间用分隔符 默认为一到多个空白字符 * * 一次读取,任意分组 * * @author junfeng.chen * */ public class LineStaTree { private SegNode root; private Map<String,Integer> internalColumnMap; private String[] columnNames; private Map<String,Integer> columnMap; private String seprator="\\s+"; public LineStaTree(Map<String,Integer> column_map) { this.columnMap = column_map; this.columnNames = new String[column_map.size()]; int i=0; for(String column_name:column_map.keySet()) { columnNames[i++]=column_name; } Arrays.sort(columnNames); internalColumnMap = new HashMap<String,Integer>(); root = new SegNode(); for(i=0;i<columnNames.length;i++) { internalColumnMap.put(columnNames[i], i); } } public void load(InputStream is) { BufferedReader br = null; try{ br = new BufferedReader(new InputStreamReader(is)); String str = br.readLine(); while(str!=null) { addLine(str); } }catch(IOException e) { e.printStackTrace(); } } public void load(File file) { InputStream is; try { if(!file.exists()||file.isDirectory()) throw new RuntimeException("illegle argumet"); is = new FileInputStream(file); this.load(is); is.close(); } catch (Exception e) { e.printStackTrace(); } } public void setSeprator(String sep) { seprator = sep; } public void addLine(String line) { String ss[] = line.split(seprator); if(ss.length<columnNames.length) return; root.addValue(1); this.add(root, ss, 0); } private void add(SegNode parent,String []ss,int column_index) { List<SegNode> nodeList = parent.getChildList(); String columnName = columnNames[column_index]; int index = columnMap.get(columnName); String id=ss[index]; SegNode node = null; if(nodeList.size()==0) { node = new 
SegNode(columnName,id); node.addValue(1); nodeList.add(node); }else { for(int i=0;i<nodeList.size();i++) { node = nodeList.get(i); if(id!=null) { if(id.equals(node.getId())) break; } } if(node==null||!id.equals(node.getId())) { node = new SegNode(columnName,id); nodeList.add(node); } node.addValue(1); } if(column_index==columnNames.length-1) return; add(node,ss,column_index+1); } /*** * 1.计算出起始节点的层序号和终点节点的层序号 start end * 2.移动到第一个统计节点 * 3.遍历 end-start次 * 4.获取末节点id 与首节点id组成串 put进hashmap * * * */ //获取第 n 层子节点 private List<SegNode> getChildList(SegNode parent,int count,List<SegNode> list) { if(list==null) list = new ArrayList<SegNode>(); { List<SegNode> sonList = parent.getChildList(); if(count<=0) { list.addAll(sonList); }else { SegNode node = null; for(int i=0;i<sonList.size();i++) { node = sonList.get(i); getChildList(node,count-1,list); } } } return list; } //移动到末节点的路径 Map<String,Integer> //id1$$id2$$id3 然后将对应位置的id置为空字符串 //相同key的数据累加 //获取首节点列表 //获取末节点列表 //从首节点开始遍历,直到end结束 记录id组成的路径 private Map<String,Integer> getCount(int begin,int end,String []columns) { List<SegNode> beginList = this.getChildList(root, begin, null); SegNode beginNode = null; Map<String,Integer> result_map = new HashMap<String,Integer>(); if(begin==end) { for(int i=0;i<beginList.size();i++) { beginNode = beginList.get(i); Integer intg = result_map.get(beginNode.getId()); if(intg==null) intg = new Integer(beginNode.getValue()); else intg=intg+beginNode.getValue(); result_map.put(beginNode.getId(), intg); } return result_map; } for(int i=0;i<beginList.size();i++) { beginNode = beginList.get(i); travle(beginNode, end-begin, beginNode.getId(), result_map,columns); } return result_map; } public Map<String,Integer> groupBy(String[] columns) { if(columns==null||columns.length==0) { Map<String,Integer> map = new HashMap<String,Integer>(); map.put("*", root.getValue()); return map; } Arrays.sort(columns); int startIndex = this.internalColumnMap.get(columns[0]);//节点层 序号 int endIndex = 
this.internalColumnMap.get(columns[columns.length-1]); String queryColumns[] = new String[this.columnNames.length]; for(int i=0;i<columnNames.length;i++)//全部置为* { queryColumns[i]="*"; } for(int i=0;i<columns.length;i++)//将本次查询的条件列 置入其中 { queryColumns[this.internalColumnMap.get(columns[i])]=columns[i]; } String queryColumns2 [] = new String[endIndex-startIndex+1]; System.arraycopy(queryColumns, startIndex, queryColumns2, 0, queryColumns2.length); Map<String,Integer> result = this.getCount(startIndex, endIndex, queryColumns2); return result; } /** * * @param beginNode * @param steps * @param path * @param map * @param columns * @return */ private Map<String,Integer> travle( SegNode beginNode, int steps, String parentpath, Map<String,Integer> map, String [] columns) { if(parentpath==null) parentpath=""; if(map==null) map = new HashMap<String,Integer>(); if(steps<=1) { List<SegNode> list = beginNode.getChildList(); SegNode node = null; for(int i=0;i<list.size();i++) { node = list.get(i); String path = parentpath; if(columns[columns.length-steps].equals("*")) path = path+"_*";//提前设置 跨层节点为空 else path = path+"_"+node.getId(); Integer intg = map.get(path); if(intg!=null) intg=intg+node.getValue(); else intg = new Integer(node.getValue()); map.put(path, intg); } }else { List<SegNode> list = beginNode.getChildList(); SegNode node = null; String path = parentpath; for(int i=0;i<list.size();i++) { node = list.get(i); if(columns[columns.length-steps].equals("*")) map = travle(node, steps-1, path+"_*", map,columns); else map = travle(node, steps-1, path+"_"+node.getId(), map,columns); } } return map; } public String toXml() { String s="<records id=\"*\" value=\""+root.getValue()+"\">\n"; List<SegNode> list = root.getChildList(); for(int i=0;i<list.size();i++) { s+=list.get(i).toXml(" "); } s+="</records>\n"; return s; } public static void main(String args[]) { String ss[] = new String[]{ "abc 123 234", "bcd 123 234", "abc 123 345", "abc 123 456", "bcd 123 345", "bcdd 1d23 3s45", 
}; Map<String,Integer> map = new HashMap<String,Integer>(); map.put("tag1", 0); map.put("tag2", 1); map.put("tag3", 2); LineStaTree tree = new LineStaTree(map); for(String s:ss) { tree.addLine(s); } Map<String,Integer> map1 = tree.groupBy(new String[]{"tag1","tag2"});//tree.getCount(0,0,new String[]{"tag1"}); System.out.println(map1.size()); Set<String> keys = map1.keySet(); for(String key:keys) { System.out.println(key+": "+map1.get(key)); } System.out.println(tree.toXml()); System.out.println(tree.getChildList(tree.root, 2, null).size()); } }
相关推荐
bleach00 2020-11-10
林德强之原创 2020-08-13
PlumRain 2020-08-03
bob于 2020-07-26
dataminer 2020-06-25
neverstopforcode 2020-06-18
zxznsjdsj 2020-06-16
langyue 2020-06-13
huangyx 2020-06-11
鲁氏汤包王 2020-06-11
LWLWLiang 2020-05-28
wkwanglei 2020-05-26
Coohx 2020-04-13
thunderstorm 2020-05-09
Andrea0 2020-05-04
数据库工具开发 2020-04-25
tlsmile 2020-04-24
helencoder 2020-04-10