行文件分组统计

Ali

2012-03-02

关注关注

有些情况下，对于一个结构化的以行为记录的文本文件，需要按列分组统计，如果数据量小，可以直接导入数据库中，但是当文件很大时，导入数据库不太现实，本程序即实现非数据库条件下，按任意列分组统计行数功能；文件只读一次，按任意分组方式查询。

基本思路：

1.根据指定的列名，构建一棵多叉树，树的高度即为可以分组的条件列数

2.存储树中，各层对应的列名按字典顺序升序排列（即第 i 层节点对应排序后的第 i 个列名）

3.查询时，根据指定的列名，找到对应的树节点，将其中value值累加返回

以下为第一个初级版本，欢迎指点!!

树节点：

package org.jf.sta;

import java.util.ArrayList;
import java.util.List;

/**
 * One node of the grouping tree.
 *
 * A node carries an element name, an id, an integer counter ({@code value})
 * and an ordered list of child nodes. It can render itself and its subtree
 * as a simple XML fragment.
 */
public class SegNode 
{
	private String name;
	private String id;
	private int value;
	private List<SegNode> childList;
	
	/** Creates a node with no name, no id, a zero counter and no children. */
	public SegNode()
	{
		childList = new ArrayList<SegNode>();
	}
	
	/**
	 * Creates a node with a zero counter and no children.
	 *
	 * @param name element name used when the node is rendered as XML
	 * @param id   identifier of the node, rendered as the {@code id} attribute
	 */
	public SegNode(String name,String id)
	{
		this();
		this.name = name;
		this.id = id;
	}

	/** @return the element name of this node */
	public String getName() {
		return name;
	}

	/** @return the id of this node */
	public String getId() {
		return id;
	}
	
	/** @return the current counter value */
	public int getValue() {
		return value;
	}

	/** @param value new counter value, replacing the current one */
	public void setValue(int value) {
		this.value = value;
	}
	
	/** @param increment amount added to the counter */
	public void addValue(int increment)
	{
		this.value += increment;
	}

	/** @return the live (not copied) list of children; changes to it affect this node */
	public List<SegNode> getChildList()
	{
		return this.childList;
	}
	
	/** @param node child appended after the existing children */
	public void addChild(SegNode node)
	{
		this.childList.add(node);
	}
	
	/**
	 * Renders this node as a top-level XML element. The element always has an
	 * explicit closing tag, even without children; children are indented by
	 * one space per nesting level.
	 *
	 * @return the XML text, every line terminated by {@code \n}
	 */
	public String toXml()
	{
		StringBuilder sb = new StringBuilder();
		appendStartTag(sb, false);
		for(SegNode child:childList)
		{
			child.appendXml(sb, " ");
		}
		sb.append("</").append(name).append(">\n");
		return sb.toString();
	}
	
	/**
	 * Renders this node as a nested XML element. A node without children is
	 * written as a self-closing tag. Each nesting level adds one space to the
	 * indent, so the opening and closing tag of an element line up.
	 *
	 * @param blank indent placed in front of this node's own lines
	 * @return the XML text, every line terminated by {@code \n}
	 */
	public String toXml(String blank)
	{
		StringBuilder sb = new StringBuilder();
		appendXml(sb, blank);
		return sb.toString();
	}
	
	/** Appends this node and its subtree to {@code sb}, indented by {@code blank}. */
	private void appendXml(StringBuilder sb,String blank)
	{
		sb.append(blank);
		if(childList.isEmpty())
		{
			appendStartTag(sb, true);
			return;
		}
		appendStartTag(sb, false);
		String childBlank = blank+" ";
		for(SegNode child:childList)
		{
			// The child writes its own indent; adding this node's indent on top
			// would push the child's opening tag further right than its closing tag.
			child.appendXml(sb, childBlank);
		}
		sb.append(blank).append("</").append(name).append(">\n");
	}
	
	/** Appends {@code <name id="..." value="...">} (or the self-closing form) plus a newline. */
	private void appendStartTag(StringBuilder sb,boolean selfClosing)
	{
		sb.append('<').append(name)
		  .append(" id=\"").append(escapeAttribute(id))
		  .append("\" value=\"").append(value)
		  .append(selfClosing ? "\"/>\n" : "\">\n");
	}
	
	/**
	 * Escapes the characters that would break a double-quoted XML attribute.
	 * A {@code null} text is rendered as the string {@code "null"}, which is
	 * what plain string concatenation produces.
	 */
	private static String escapeAttribute(String text)
	{
		if(text==null)
			return "null";
		StringBuilder out = new StringBuilder(text.length());
		for(int i=0;i<text.length();i++)
		{
			char c = text.charAt(i);
			switch(c)
			{
				case '&': out.append("&amp;"); break;
				case '<': out.append("&lt;"); break;
				case '>': out.append("&gt;"); break;
				case '"': out.append("&quot;"); break;
				default: out.append(c);
			}
		}
		return out.toString();
	}
}

统计树

package org.jf.sta;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;


/**
 * Statistics helper for line-oriented text files.
 *
 * Each line is one record whose fields are separated by a delimiter (by
 * default one or more whitespace characters). The data is read once into a
 * tree with one level per configured column; afterwards line counts can be
 * queried grouped by any subset of those columns.
 *
 * Tree levels follow the configured column names in ascending lexicographic
 * order. Every node stores how many lines passed through it.
 *
 * @author junfeng.chen
 *
 */
public class LineStaTree {
	
	/** Marker used in result keys for a level that is not part of the grouping. */
	private static final String WILDCARD = "*";
	
	private SegNode root;
	/** Column name -> tree level (position of the name in the sorted name array). */
	private Map<String,Integer> internalColumnMap;
	/** Column names sorted ascending; index i is tree level i. */
	private String[] columnNames;
	/** Column name -> index of the field inside a split line. */
	private Map<String,Integer> columnMap;
	/** Largest non-null field index in {@link #columnMap}, or -1 if there is none. */
	private int maxFieldIndex;
	/** Compiled once so the regex is not recompiled for every line. */
	private java.util.regex.Pattern separator = java.util.regex.Pattern.compile("\\s+");
	
	
	/**
	 * Creates an empty tree for the given columns.
	 *
	 * @param column_map column name mapped to the zero-based index of that
	 *                   column's field in a split line; the map is copied, so
	 *                   later changes by the caller have no effect
	 */
	public LineStaTree(Map<String,Integer> column_map)
	{
		this.columnMap = new HashMap<String,Integer>(column_map);
		this.columnNames = columnMap.keySet().toArray(new String[columnMap.size()]);
		Arrays.sort(columnNames);
		internalColumnMap = new HashMap<String,Integer>();
		root = new SegNode();
		maxFieldIndex = -1;
		for(int i=0;i<columnNames.length;i++)
		{
			internalColumnMap.put(columnNames[i], i);
			Integer fieldIndex = columnMap.get(columnNames[i]);
			if(fieldIndex!=null&&fieldIndex>maxFieldIndex)
				maxFieldIndex = fieldIndex;
		}
	}
	
	/**
	 * Reads every line from the stream and adds it to the tree. The text is
	 * decoded with the platform default charset. The stream is not closed
	 * here; an {@link IOException} is reported on standard error and ends
	 * the load.
	 *
	 * @param is source of the lines
	 */
	public void load(InputStream is)
	{
		try{
			BufferedReader br = new BufferedReader(new InputStreamReader(is));
			String line;
			// Fetch the next line on every iteration; reading only once before
			// the loop would process the first line forever.
			while((line = br.readLine())!=null)
			{
				addLine(line);
			}
		}catch(IOException e)
		{
			e.printStackTrace();
		}
		
	}
	
	/**
	 * Loads all lines of a file into the tree. Problems (missing file,
	 * directory, I/O failure) are reported on standard error rather than
	 * thrown. The file stream is always closed.
	 *
	 * @param file regular file to read
	 */
	public void load(File file)
	{
		InputStream is = null;
		try {
			if(file==null||!file.exists()||file.isDirectory())
				throw new IllegalArgumentException("illegal argument: not a regular file: "+file);
			is = new FileInputStream(file);
			this.load(is);
		} catch (Exception e) 
		{
			e.printStackTrace();
		} finally
		{
			if(is!=null)
			{
				try {
					is.close();
				} catch (IOException e) 
				{
					e.printStackTrace();
				}
			}
		}
	
	}
	
	/**
	 * Sets the field separator. Kept under its historical (misspelled) name
	 * for existing callers; identical to {@link #setSeparator(String)}.
	 *
	 * @param sep regular expression that separates the fields of a line
	 */
	public void setSeprator(String sep)
	{
		setSeparator(sep);
	}
	
	/**
	 * Sets the field separator used by {@link #addLine(String)}.
	 *
	 * @param sep regular expression that separates the fields of a line
	 * @throws java.util.regex.PatternSyntaxException if {@code sep} is not a valid regular expression
	 * @throws NullPointerException if {@code sep} is {@code null}
	 */
	public void setSeparator(String sep)
	{
		separator = java.util.regex.Pattern.compile(sep);
	}
	
	/**
	 * Adds one record to the tree. The line is skipped, without counting it,
	 * when it is {@code null} or has too few fields to supply every
	 * configured column.
	 *
	 * @param line one line of the input
	 */
	public void addLine(String line)
	{
		if(line==null)
			return;
		String fields[] = separator.split(line);
		// Check before touching any counter so a short line cannot leave the
		// tree half updated.
		if(fields.length<columnNames.length||fields.length<=maxFieldIndex)
			return;
		root.addValue(1);
		SegNode parent = root;
		for(int level=0;level<columnNames.length;level++)
		{
			String columnName = columnNames[level];
			String id = fields[columnMap.get(columnName)];
			SegNode node = findChild(parent, id);
			if(node==null)
			{
				node = new SegNode(columnName,id);
				parent.addChild(node);
			}
			node.addValue(1);
			parent = node;
		}
	}
	
	/** Returns the child of {@code parent} whose id equals {@code id}, or {@code null} if there is none. */
	private static SegNode findChild(SegNode parent,String id)
	{
		for(SegNode child:parent.getChildList())
		{
			if(id.equals(child.getId()))
				return child;
		}
		return null;
	}
	
	/**
	 * Collects the nodes {@code depth + 1} levels below {@code parent}
	 * (depth 0 or less means the direct children) into {@code out}.
	 */
	private List<SegNode> getChildList(SegNode parent,int depth,List<SegNode> out)
	{
		if(out==null)
			out = new ArrayList<SegNode>();
		List<SegNode> children = parent.getChildList();
		if(depth<=0)
		{
			out.addAll(children);
			return out;
		}
		for(SegNode child:children)
		{
			getChildList(child,depth-1,out);
		}
		return out;
	}
	
	/** Adds {@code amount} to the count stored under {@code key}, starting from zero. */
	private static void accumulate(Map<String,Integer> counts,String key,int amount)
	{
		Integer current = counts.get(key);
		int total = current==null ? amount : current+amount;
		counts.put(key, total);
	}
	
	/**
	 * Sums the line counts between tree levels {@code begin} and {@code end}.
	 * With a single level the key is the node id; otherwise it is the path
	 * built by {@link #travle}.
	 */
	private Map<String,Integer> getCount(int begin,int end,boolean[] selected)
	{
		Map<String,Integer> result_map = new HashMap<String,Integer>();
		for(SegNode beginNode:this.getChildList(root, begin, null))
		{
			if(begin==end)
				accumulate(result_map, beginNode.getId(), beginNode.getValue());
			else
				travle(beginNode, begin, end, beginNode.getId(), result_map, selected);
		}
		return result_map;
	}
	
	/**
	 * Counts lines grouped by the given columns.
	 *
	 * For one column the result keys are that column's values. For several
	 * columns a key joins the values with {@code _}, ordered by ascending
	 * column name; a tree level that lies between two requested columns but
	 * was not requested itself appears as {@code *}. Values that themselves
	 * contain {@code _} can therefore make different groups share a key.
	 * The passed array is not modified.
	 *
	 * @param columns names of the columns to group by; {@code null} or empty
	 *                returns the total line count under the key {@code *}
	 * @return group key mapped to the number of lines in that group
	 * @throws IllegalArgumentException if a name is not a configured column
	 */
	public Map<String,Integer> groupBy(String[] columns)
	{
		if(columns==null||columns.length==0)
		{
			Map<String,Integer> map = new HashMap<String,Integer>();
			map.put(WILDCARD, root.getValue());
			return map;
		}
		// Levels are ordered by sorted column name, so the smallest and largest
		// requested level bound the part of the tree that has to be walked.
		boolean[] selected = new boolean[columnNames.length];
		int startIndex = columnNames.length;
		int endIndex = -1;
		for(String column:columns)
		{
			Integer level = this.internalColumnMap.get(column);
			if(level==null)
				throw new IllegalArgumentException("unknown column: "+column);
			selected[level] = true;
			startIndex = Math.min(startIndex, level);
			endIndex = Math.max(endIndex, level);
		}
		return this.getCount(startIndex, endIndex, selected);
	}
	
	
	/**
	 * Walks from {@code node} down to level {@code end}, extending the key
	 * path by one segment per level and summing the counters of the nodes
	 * reached at level {@code end}.
	 *
	 * @param node     node to descend from
	 * @param level    tree level of {@code node}
	 * @param end      tree level at which counters are collected
	 * @param path     key path accumulated so far
	 * @param map      receives the summed counts
	 * @param selected per level, whether the level is part of the grouping
	 */
	private void travle(
						SegNode node,
						int level,
						int end,
						String path,
						Map<String,Integer> map,
						boolean[] selected)
	{
		int childLevel = level+1;
		for(SegNode child:node.getChildList())
		{
			// Levels outside the grouping collapse to the wildcard so their
			// distinct values end up under the same key.
			String childPath = path+"_"+(selected[childLevel] ? child.getId() : WILDCARD);
			if(childLevel>=end)
				accumulate(map, childPath, child.getValue());
			else
				travle(child, childLevel, end, childPath, map, selected);
		}
	}
	
	
	
	/**
	 * Renders the whole tree as XML under a {@code records} root element
	 * whose {@code value} is the total number of counted lines.
	 *
	 * @return the XML text
	 */
	public String toXml()
	{
		StringBuilder sb = new StringBuilder();
		sb.append("<records id=\"*\" value=\"").append(root.getValue()).append("\">\n");
		for(SegNode child:root.getChildList())
		{
			sb.append(child.toXml(" "));
		}
		sb.append("</records>\n");
		return sb.toString();
	}
	
	/** Small demonstration: loads six sample lines and groups them by tag1 and tag2. */
	public static void main(String args[])
	{
		String ss[] = new String[]{
		 "abc 123 234",
		 "bcd 123 234",
		 "abc 123 345",
		 "abc 123 456",
		 "bcd 123 345",
		 "bcdd 1d23 3s45",
		};
		Map<String,Integer> map = new HashMap<String,Integer>();
		map.put("tag1", 0);
		map.put("tag2", 1);
		map.put("tag3", 2);
		LineStaTree tree = new LineStaTree(map);
		for(String s:ss)
		{
			tree.addLine(s);
		}
		Map<String,Integer> map1 = tree.groupBy(new String[]{"tag1","tag2"});
		System.out.println(map1.size());
		Set<String> keys = map1.keySet();
		for(String key:keys)
		{
			System.out.println(key+": "+map1.get(key));
		}
		System.out.println(tree.toXml());
		System.out.println(tree.getChildList(tree.root, 2, null).size());
		
	}
	
	
	
}

数据库文件