package edu.harvard.wcfia.yoshikoder;
import java.awt.Desktop;
import java.awt.event.ActionEvent;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Logger;
import javax.swing.JFileChooser;
import javax.swing.JOptionPane;
import javax.swing.filechooser.FileFilter;
import org.apache.poi.hssf.usermodel.HSSFCell;
import org.apache.poi.hssf.usermodel.HSSFRow;
import org.apache.poi.hssf.usermodel.HSSFSheet;
import org.apache.poi.hssf.usermodel.HSSFWorkbook;
import edu.harvard.wcfia.yoshikoder.dictionary.CategoryNode;
import edu.harvard.wcfia.yoshikoder.dictionary.Node;
import edu.harvard.wcfia.yoshikoder.dictionary.YKDictionary;
import edu.harvard.wcfia.yoshikoder.document.YKDocument;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenList;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationCache;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationException;
import edu.harvard.wcfia.yoshikoder.document.tokenizer.TokenizationService;
import edu.harvard.wcfia.yoshikoder.reporting.EntryFrequencyMap;
import edu.harvard.wcfia.yoshikoder.util.DialogUtil;
import edu.harvard.wcfia.yoshikoder.util.FileUtil;
import edu.harvard.wcfia.yoshikoder.util.TaskWorker;
public class UnifiedDictionaryFrequencyReportAction extends YoshikoderAction {
private static Logger log = Logger.getLogger(UnifiedDictionaryFrequencyReportAction.class.getName());
protected JFileChooser chooser = new JFileChooser();
FileFilter csvutf8 = new FileFilter() {
@Override
public String getDescription() {
return "CSV (UTF-8 encoded)";
}
@Override
public boolean accept(File f) {
return f.isDirectory();
}
};
FileFilter excel = new FileFilter() {
@Override
public String getDescription() {
return "MS Excel";
}
@Override
public boolean accept(File f) {
return f.isDirectory();
}
};
public UnifiedDictionaryFrequencyReportAction(Yoshikoder yk) {
super(yk, UnifiedDictionaryFrequencyReportAction.class.getName());
chooser = new JFileChooser();
chooser.removeChoosableFileFilter(chooser.getAcceptAllFileFilter());
chooser.addChoosableFileFilter(csvutf8);
chooser.addChoosableFileFilter(excel);
chooser.setFileFilter(csvutf8);
}
protected int[] getDocumentStats(YKDocument doc, Node[] keys, CategoryNode catnode) throws IOException, TokenizationException {
// tokenize the document
TokenizationCache tcache = yoshikoder.getTokenizationCache();
TokenList tl1 = tcache.getTokenList(doc);
if (tl1 == null)
tl1 = TokenizationService.getTokenizationService().tokenize(doc);
// compute the dictionary counts
EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
int[] counts = new int[keys.length+1];
for (int ii=0; ii<keys.length; ii++) {
Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
counts[ii] = cnt.intValue();
}
// add N
counts[keys.length] = efm1.getTokenTotal();
return counts;
}
protected void writeExcel(List<YKDocument> documents, File file, CategoryNode node) throws Exception {
final List<YKDocument> docs = documents;
final File outputFile = file;
final FileOutputStream stream = new FileOutputStream(outputFile);
final CategoryNode catnode = node;
tworker = new TaskWorker(yoshikoder){
protected void doWork() throws Exception {
// FIRST DOC
YKDocument doc1 = (YKDocument)docs.get(0);
// tokenize the document
TokenizationCache tcache = yoshikoder.getTokenizationCache();
TokenList tl1 = tcache.getTokenList(doc1);
if (tl1 == null)
tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
YKDictionary dict = yoshikoder.getDictionary();
// compute the dictionary counts
EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
List lkeys = efm1.getSortedCategoryEntries();
Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
int[] counts = new int[keys.length+1];
for (int ii=0; ii<keys.length; ii++) {
Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
counts[ii] = cnt.intValue();
}
// add N
counts[keys.length] = efm1.getTokenTotal();
HSSFWorkbook wb = new HSSFWorkbook();
HSSFRow row;
HSSFCell cell;
HSSFSheet sheet = wb.createSheet("Category frequencies");
// header
row = sheet.createRow((short)0);
for (int c=0; c<keys.length; c++){
cell = row.createCell((short)(c+1));
cell.setEncoding(HSSFCell.ENCODING_UTF_16);
String nodepath = efm1.getEntryPath(keys[c]);
cell.setCellValue(nodepath);
}
cell = row.createCell((short)(keys.length+1));
cell.setEncoding(HSSFCell.ENCODING_UTF_16);
cell.setCellValue("Total");
int rownum = 1;
for (Iterator iter = docs.iterator(); iter.hasNext();) {
YKDocument d = (YKDocument) iter.next();
counts = getDocumentStats(d, keys, catnode);
row = sheet.createRow((short)rownum);
cell = row.createCell((short)0);
cell.setEncoding(HSSFCell.ENCODING_UTF_16);
cell.setCellValue(d.getTitle());
for (int ii = 0; ii < keys.length; ii++) {
cell = row.createCell((short)(ii+1));
cell.setCellValue((double)counts[ii]);
}
cell = row.createCell((short)(keys.length+1));
cell.setCellValue(counts[keys.length]);
rownum++;
}
wb.write(stream);
}
protected void onError() {
try {
stream.close();
} catch (Exception ex){
log.info("could not close the file stream");
ex.printStackTrace();
}
if (e instanceof TokenizationException){
DialogUtil.yelp(yoshikoder, "Tokenization Error", e);
} else if (e instanceof IOException){
DialogUtil.yelp(yoshikoder, "Input/Ouput Error", e);
} else {
DialogUtil.yelp(yoshikoder, "Error", e);
}
}
@Override
protected void onSuccess() {
try {
stream.close();
} catch (Exception ex){
ex.printStackTrace();
log.info("could not close the file stream");
}
int resp = JOptionPane.showConfirmDialog(yoshikoder, "Open report file?",
"Open report", JOptionPane.YES_NO_OPTION);
if (resp == JOptionPane.YES_OPTION){
try {
Desktop.getDesktop().open(outputFile);
} catch (Exception ex){
ex.printStackTrace();
}
}
}
};
tworker.start();
}
protected void writeCsvUTF8(List<YKDocument> documents, File file, CategoryNode node) throws Exception {
final List<YKDocument> docs = documents;
final BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(file),
Charset.forName("UTF8")));
final CategoryNode catnode = node;
tworker = new TaskWorker(yoshikoder){
protected void doWork() throws Exception {
// FIRST DOC
YKDocument doc1 = (YKDocument)docs.get(0);
// tokenize the document
TokenizationCache tcache = yoshikoder.getTokenizationCache();
TokenList tl1 = tcache.getTokenList(doc1);
if (tl1 == null)
tl1 = TokenizationService.getTokenizationService().tokenize(doc1);
YKDictionary dict = yoshikoder.getDictionary();
// compute the dictionary counts
EntryFrequencyMap efm1 = new EntryFrequencyMap(catnode, tl1);
List lkeys = efm1.getSortedCategoryEntries();
Node[] keys = (Node[])lkeys.toArray(new Node[lkeys.size()]);
int[] counts = new int[keys.length+1];
for (int ii=0; ii<keys.length; ii++) {
Integer cnt = (Integer) efm1.getEntryCount(keys[ii]);
counts[ii] = cnt.intValue();
}
// add N
counts[keys.length] = efm1.getTokenTotal();
for (int ii = 0; ii < keys.length; ii++) {
String nodepath = efm1.getEntryPath(keys[ii]);
writer.write(",");
writer.write(FileUtil.escapeForCsv(nodepath));
}
writer.write(",Total\n");
// and the rest
for (Iterator iter = docs.iterator(); iter.hasNext();) {
YKDocument d = (YKDocument) iter.next();
counts = getDocumentStats(d, keys, catnode);
writer.write(FileUtil.escapeForCsv(d.getTitle()));
for (int ii = 0; ii < keys.length; ii++) {
writer.write("," + counts[ii]);
}
writer.write("," + counts[keys.length] + "\n");
}
writer.close();
}
protected void onError() {
try {
writer.close();
} catch (Exception ex){
log.info("could not close the CSV file");
}
if (e instanceof TokenizationException){
DialogUtil.yelp(yoshikoder, "Tokenization Error", e);
} else if (e instanceof IOException){
DialogUtil.yelp(yoshikoder, "Input/Ouput Error", e);
} else {
DialogUtil.yelp(yoshikoder, "Error", e);
}
}
@Override
protected void onSuccess() {
try {
writer.close();
} catch (Exception ex){
log.info("could not close the CSV file");
}
// dont ask because it won't work cross platform
/*
int resp = JOptionPane.showConfirmDialog(yoshikoder, "Open report file?",
"Open report", JOptionPane.YES_NO_OPTION);
if (resp == JOptionPane.YES_OPTION){
try {
Desktop.getDesktop().open(outputFile);
} catch (Exception ex){
ex.printStackTrace();
}
}
*/
}
};
tworker.start();
}
public void actionPerformed(ActionEvent e) {
if (yoshikoder.getProject().getDocumentList().size() > 1){
Node n = yoshikoder.getSelectedNode();
CategoryNode cnode = null;
if (n instanceof CategoryNode)
cnode = (CategoryNode)n;
else // patternnode
cnode = (CategoryNode)n.getParent();
final CategoryNode catnode = cnode;
File file;
try {
int resp = chooser.showSaveDialog(yoshikoder);
if (resp != JFileChooser.APPROVE_OPTION)
return;
file = chooser.getSelectedFile();
if (chooser.getFileFilter().equals(excel)){
if (!file.getName().toLowerCase().endsWith(".xls"))
file = new File(file.getParent(), file.getName() + ".xls");
YKDocument[] docsa = yoshikoder.getSelectedDocuments();
List<YKDocument> docs = new ArrayList<YKDocument>(docsa.length);
for (int ii = 0; ii < docsa.length; ii++)
docs.add(docsa[ii]);
writeExcel(docs, file, catnode);
} else if (chooser.getFileFilter().equals(csvutf8)){
if (!file.getName().toLowerCase().endsWith("-utf8.csv"))
file = new File(file.getParent(), file.getName() + "-utf8.csv");
YKDocument[] docsa = yoshikoder.getSelectedDocuments();
List<YKDocument> docs = new ArrayList<YKDocument>(docsa.length);
for (int ii = 0; ii < docsa.length; ii++)
docs.add(docsa[ii]);
writeCsvUTF8(docs, file, catnode);
}
} catch (Exception ex){
DialogUtil.yelp(yoshikoder, ex.getMessage(), ex);
return;
}
}
}
}