package edu.illinois.lis.utils;
import java.io.FileInputStream;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
public class Stopper {
public static final Pattern SPACE_PATTERN = Pattern.compile(" ", Pattern.DOTALL);
private Set<String> stopwords;
public Stopper() {
stopwords = new HashSet<String>();
}
public Stopper(String pathToStoplist) {
try {
stopwords = new HashSet<String>();
// assume our stoplist has one stopword per line
List<String> lines = IOUtils.readLines(new FileInputStream(pathToStoplist));
Iterator<String> it = lines.iterator();
while(it.hasNext()) {
stopwords.add(it.next());
}
} catch (Exception e) {
e.printStackTrace();
}
}
public String apply(String text) {
StringBuilder b = new StringBuilder();
String[] toks = SPACE_PATTERN.split(text);
for(String tok : toks) {
if(! isStopWord(tok))
b.append(tok + " ");
}
return b.toString().trim();
}
public void addStopword(String term) {
stopwords.add(term);
}
public boolean isStopWord(String term) {
return (stopwords.contains(term)) ? true : false;
}
public Set<String> asSet() {
return stopwords;
}
}