package cc.twittertools.hbase;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;

import org.apache.hadoop.hbase.client.HTablePool;

import com.google.common.collect.HashBasedTable;
import com.google.common.collect.Table;

public class LoadWordCount {

  public static void main(String[] args) throws IOException {
    if (args.length != 1) {
      System.err.println("usage: LoadWordCount <wordcount-dir>");
      return;
    }

    // Guava table: row key = word, column key = day; each cell holds the
    // per-interval counts for that (word, day) pair.
    Table<String, String, WordCountDAO.WordCount> wordCountMap = HashBasedTable.create();

    File folder = new File(args[0]);
    if (folder.isDirectory()) {
      for (File file : folder.listFiles()) {
        // Only read MapReduce output files (part-00000, part-r-00000, ...).
        if (!file.getName().startsWith("part")) {
          continue;
        }
        System.out.println("Processing " + file.getPath());
        BufferedReader bf = new BufferedReader(new FileReader(file));
        try {
          // Each line is tab-separated: day, interval, word, count,
          // e.g. "1<TAB>5<TAB>twitter<TAB>100".
          String line;
          while ((line = bf.readLine()) != null) {
            String[] groups = line.split("\\t");
            if (groups.length != 4) {
              continue;
            }
            String day = groups[0]; // each day is a column in the underlying HBase table
            String interval = groups[1];
            String word = groups[2];
            String count = groups[3];

            // Get-or-create the accumulator for this (word, day) cell.
            WordCountDAO.WordCount w = wordCountMap.get(word, day);
            if (w == null) {
              w = new WordCountDAO.WordCount(word, day);
              wordCountMap.put(word, day, w);
            }
            w.setCount(Integer.valueOf(interval), Integer.valueOf(count));
          }
        } finally {
          bf.close();
        }
      }
    }
    System.out.println("Total " + wordCountMap.size() + " words");

    // Write the accumulated counts into HBase through the DAO.
    HTablePool pool = new HTablePool();
    WordCountDAO DAO = new WordCountDAO(pool);
    DAO.CreateTable();

    int count = 0;
    for (WordCountDAO.WordCount w : wordCountMap.values()) {
      DAO.addWordCount(w);
      if (++count % 50000 == 0) {
        System.out.println("Loaded " + count + " words");
      }
    }
    pool.closeTablePool(DAO.TABLE_NAME);
  }
}