package org.juxtasoftware.resource;
import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.QueryTermExtractor;
import org.apache.lucene.search.highlight.WeightedTerm;
import org.juxtasoftware.dao.SourceDao;
import org.juxtasoftware.dao.WitnessDao;
import org.juxtasoftware.model.Source;
import org.juxtasoftware.model.Witness;
import org.restlet.data.Status;
import org.restlet.representation.Representation;
import org.restlet.resource.Get;
import org.restlet.resource.ResourceException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Service;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
/**
 * Resource to search documents in a workspace for occurrences of text.
 * <p>
 * The text of the <code>q</code> query parameter is matched as an exact phrase
 * against indexed sources and witnesses of the current workspace. The response
 * is a JSON object with two arrays, <code>sources</code> and <code>witnesses</code>;
 * each element carries the item <code>id</code>, its <code>name</code> and a list of
 * <code>hits</code> (start offset, end offset, percent position in the text and a
 * fragment of surrounding text).
 *
 * @author loufoster
 *
 */
@Service
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class Searcher extends BaseResource {

    /** Orders hit ranges by start offset, then by end offset. */
    private static final Comparator<HitDetail> BY_OFFSETS = new Comparator<HitDetail>() {
        @Override
        public int compare(HitDetail a, HitDetail b) {
            int byStart = compareInts(a.getStartOffset(), b.getStartOffset());
            if ( byStart != 0 ) {
                return byStart;
            }
            return compareInts(a.getEndOffset(), b.getEndOffset());
        }
    };

    /** Normalized search text; stays null when the request carried no usable query. */
    private String searchString;
    @Autowired private IndexSearcher searcher;
    @Autowired private IndexReader indexReader;
    @Autowired private Integer hitsPerPage;
    @Autowired private SourceDao sourceDao;
    @Autowired private WitnessDao witnessDao;
    @Autowired private Integer fragSize;
    @Autowired private Integer phraseSlop;
    @Autowired private QueryParser queryParser;

    /**
     * Reads the <code>q</code> query parameter, trims it and collapses every run of
     * whitespace into a single space. When the parameter is absent or blank the
     * response status is set to 400 and the search string is left null.
     *
     * @throws ResourceException propagated from the superclass initialization
     */
    @Override
    protected void doInit() throws ResourceException {
        super.doInit();
        String raw = getQueryValue("q");
        if ( raw == null || raw.trim().length() == 0 ) {
            // Stop here: normalizing a missing value would throw a NullPointerException
            this.searchString = null;
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST, "Missing search query");
            return;
        }
        this.searchString = raw.trim().replaceAll("\\s+", " ");
    }

    /**
     * Runs the phrase search and renders the hits as JSON.
     *
     * @return the JSON result, or a plain text message with an error status when
     *         the query is missing, cannot be parsed, or the index cannot be read
     */
    @Get("json")
    public Representation search() {
        try {
            if ( this.searchString == null ) {
                // doInit found no usable query; do not attempt a search
                setStatus(Status.CLIENT_ERROR_BAD_REQUEST, "Missing search query");
                return toTextRepresentation("Missing search query");
            }

            Map<HitItem, List<HitDetail> > sourceHits = new HashMap<HitItem, List<HitDetail> >();
            Map<HitItem, List<HitDetail> > witnessHits = new HashMap<HitItem, List<HitDetail> >();
            LOG.info("Search for '"+this.searchString+"'");

            // build a phrase query to match the exact phrase entered
            TermQuery wsQuery = new TermQuery( new Term("workspace", this.workspace.getName()) );
            TermQuery srcQuery = new TermQuery( new Term("type", "source") );
            TermQuery witQuery = new TermQuery( new Term("type", "witness") );
            Query phraseQ = this.queryParser.parse("\""+this.searchString.trim()+"\"");

            // do 2 searches, one in source one in witness. this makes sure
            // that they are treated equally wrt the top docs score; ie the top
            // x docs in both source and witness are returned
            List<ScoreDoc> hits = new ArrayList<ScoreDoc>();
            hits.addAll( Arrays.asList( findTopHits(wsQuery, phraseQ, srcQuery) ) );
            hits.addAll( Arrays.asList( findTopHits(wsQuery, phraseQ, witQuery) ) );
            LOG.info("Search for '"+this.searchString+"' yields "+hits.size()+" raw hits");

            WeightedTerm[] terms = QueryTermExtractor.getTerms(phraseQ);
            for ( ScoreDoc scoreDoc : hits ) {
                collectOffsets(scoreDoc, terms, sourceHits, witnessHits);
            }

            mergeHits(sourceHits);
            getSourceFragments(sourceHits);
            mergeHits(witnessHits);
            getWitnessFragments(witnessHits);
            LOG.info("Search for '"+this.searchString+"' end result: "+sourceHits.size()+" source hits, "+witnessHits.size()+" witness hits");

            JsonObject json = new JsonObject();
            Gson gson = new Gson();
            json.add("sources", hitsToJson( sourceHits, gson ));
            json.add("witnesses", hitsToJson( witnessHits, gson ));
            return toTextRepresentation( json.toString() );
        } catch (IOException e) {
            setStatus(Status.SERVER_ERROR_INTERNAL);
            LOG.error("Search failed", e);
            return toTextRepresentation("Search Failed");
        } catch (ParseException e) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            LOG.error("Invalid search query specified");
            return toTextRepresentation("Invalid search query specified");
        } finally {
            // NOTE(review): the searcher is injected; closing it here assumes this
            // resource instance owns it -- confirm against the bean definition.
            try {
                this.searcher.close();
            } catch (IOException e) {
                LOG.warn("Unable to close index searcher: "+e);
            }
        }
    }

    /**
     * Collects the top scoring documents that match the workspace, the phrase and
     * the given document type, all as mandatory clauses.
     *
     * @param wsQuery restricts matches to the current workspace
     * @param phraseQ the parsed phrase query
     * @param typeQuery restricts matches to one document type
     * @return at most <code>hitsPerPage</code> scored documents
     * @throws IOException if the index search fails
     */
    private ScoreDoc[] findTopHits(TermQuery wsQuery, Query phraseQ, TermQuery typeQuery) throws IOException {
        BooleanQuery query = new BooleanQuery();
        query.add(wsQuery, Occur.MUST);
        query.add(phraseQ, Occur.MUST);
        query.add(typeQuery, Occur.MUST);
        TopScoreDocCollector collector = TopScoreDocCollector.create(this.hitsPerPage, true);
        this.searcher.search(query, collector);
        return collector.topDocs(0, this.hitsPerPage).scoreDocs;
    }

    /**
     * Records the character offsets of every query term found in the "content"
     * term vector of one matched document. Offsets go to the source map when the
     * stored "type" field is "source", otherwise to the witness map.
     *
     * @param scoreDoc the matched document
     * @param terms terms extracted from the phrase query
     * @param sourceHits accumulator for hits in sources
     * @param witnessHits accumulator for hits in witnesses
     * @throws IOException if the document or its term vector cannot be read
     */
    private void collectOffsets(ScoreDoc scoreDoc, WeightedTerm[] terms,
            Map<HitItem, List<HitDetail>> sourceHits, Map<HitItem, List<HitDetail>> witnessHits) throws IOException {
        Document doc = this.searcher.doc(scoreDoc.doc);
        TermFreqVector tfvector = this.indexReader.getTermFreqVector(scoreDoc.doc, "content");
        if ( !(tfvector instanceof TermPositionVector) ) {
            // covers both a missing vector (null) and one stored without positions/offsets
            LOG.warn("No term position vector for document "+scoreDoc.doc+"; hit skipped");
            return;
        }
        TermPositionVector tpvector = (TermPositionVector)tfvector;

        String itemId = doc.get("itemId");
        String name = doc.get("name");
        Map<HitItem, List<HitDetail>> target = "source".equals(doc.get("type")) ? sourceHits : witnessHits;

        for ( int tid = 0; tid < terms.length; tid++ ) {
            int termidx = tfvector.indexOf(terms[tid].getTerm());
            if ( termidx < 0 ) {
                // term is not in this vector; asking for its offsets would be out of bounds
                continue;
            }
            TermVectorOffsetInfo[] offsets = tpvector.getOffsets(termidx);
            if ( offsets == null ) {
                continue;
            }
            for ( int j = 0; j < offsets.length; j++ ) {
                addHit(target, itemId, name, offsets[j]);
            }
        }
    }

    /**
     * Fills in percent and fragment for every hit found in a source. Entries whose
     * source can no longer be found are removed from the map.
     *
     * @param hits source hits keyed by item; modified in place
     */
    private void getSourceFragments(Map<HitItem, List<HitDetail>> hits) {
        for ( Iterator<Entry<HitItem, List<HitDetail>>> itr = hits.entrySet().iterator(); itr.hasNext(); ) {
            Entry<HitItem, List<HitDetail>> ent = itr.next();
            Long srcId = Long.parseLong(ent.getKey().id);
            Source src = this.sourceDao.find(this.workspace.getId(), srcId);
            if ( src == null ) {
                LOG.warn("Source "+srcId+" no longer exists");
                // clean out dead stuff from search results
                itr.remove();
                continue;
            }
            long textLength = src.getText().getLength();
            for ( HitDetail detail : ent.getValue() ) {
                // each hit is read from its own reader, positioned from the start of the text
                fillFragment(detail, this.sourceDao.getContentReader(src), textLength, src);
            }
        }
    }

    /**
     * Fills in percent and fragment for every hit found in a witness. Entries whose
     * witness can no longer be found are removed from the map.
     *
     * @param hits witness hits keyed by item; modified in place
     */
    private void getWitnessFragments(Map<HitItem, List<HitDetail>> hits) {
        for ( Iterator<Entry<HitItem, List<HitDetail>>> itr = hits.entrySet().iterator(); itr.hasNext(); ) {
            Entry<HitItem, List<HitDetail>> ent = itr.next();
            Long witId = Long.parseLong(ent.getKey().id);
            Witness wit = this.witnessDao.find(witId);
            if ( wit == null ) {
                LOG.warn("Witness "+witId+" no longer exists");
                // clean out dead stuff from search results
                itr.remove();
                continue;
            }
            long textLength = wit.getText().getLength();
            for ( HitDetail detail : ent.getValue() ) {
                // each hit is read from its own reader, positioned from the start of the text
                fillFragment(detail, this.witnessDao.getContentStream(wit), textLength, wit);
            }
        }
    }

    /**
     * Computes the relative position of a hit and extracts the text around it.
     * The fragment spans <code>fragSize</code> characters on either side of the hit,
     * clamped to the text bounds, and gets a "..." marker on each side that was
     * cut short. The reader is always closed before returning.
     *
     * @param detail the hit to populate
     * @param reader reader over the full text, positioned at its start
     * @param textLength total length of the text
     * @param owner the source or witness being read; used in log messages only
     */
    private void fillFragment(HitDetail detail, Reader reader, long textLength, Object owner) {
        if ( textLength > 0 ) {
            float p = (float)detail.getStartOffset() / (float)textLength;
            detail.percent = Math.round( p * 100.0f );
        } else {
            // an empty text would otherwise produce NaN / Infinity here
            detail.percent = 0;
        }

        int start = Math.max(0, detail.getStartOffset() - this.fragSize);
        int end = (int)Math.min((long)detail.getEndOffset() + this.fragSize, textLength);
        // never let the window collapse below zero length
        end = Math.max(start, end);

        if ( reader == null ) {
            LOG.error("Unable to get fragment for "+owner+" range: "+start+", "+end);
            return;
        }

        try {
            // skip() may skip fewer characters than requested, so repeat until done
            long toSkip = start;
            while ( toSkip > 0 ) {
                long skipped = reader.skip(toSkip);
                if ( skipped <= 0 ) {
                    if ( reader.read() < 0 ) {
                        break; // reached the end of the text early
                    }
                    skipped = 1;
                }
                toSkip -= skipped;
            }

            // read() may also return early; keep reading until the window is full
            char[] buf = new char[end - start];
            int filled = 0;
            while ( filled < buf.length ) {
                int count = reader.read(buf, filled, buf.length - filled);
                if ( count < 0 ) {
                    break;
                }
                filled += count;
            }

            String fragment = new String( buf, 0, filled ).trim();
            if ( start > 0 ) {
                fragment = "..."+fragment;
            }
            if ( end < textLength ) {
                fragment += "...";
            }
            detail.fragment = fragment;
        } catch (IOException e) {
            LOG.error("Unable to get fragment for "+owner+" range: "+start+", "+end, e);
        } finally {
            try {
                reader.close();
            } catch (IOException e) {
                LOG.warn("Unable to close content reader for "+owner+": "+e);
            }
        }
    }

    /**
     * Collapses the per-term offsets of each item into phrase-sized ranges.
     * Ranges are sorted, neighbours separated by at most <code>phraseSlop</code>
     * characters are merged, merged ranges shorter than the search string are
     * discarded, and items left without any range are removed from the map.
     *
     * @param hits hits keyed by item; modified in place
     */
    private void mergeHits(Map<HitItem, List<HitDetail>> hits ) {
        for ( Iterator<Entry<HitItem, List<HitDetail>>> entries = hits.entrySet().iterator(); entries.hasNext(); ) {
            List<HitDetail> ranges = entries.next().getValue();
            Collections.sort(ranges, BY_OFFSETS);

            // merge adjacent into a single range
            HitDetail lastRange = null;
            for ( Iterator<HitDetail> itr = ranges.iterator(); itr.hasNext(); ) {
                HitDetail currRange = itr.next();
                boolean adjacent = lastRange != null
                    && lastRange.getEndOffset() + this.phraseSlop >= currRange.getStartOffset();
                if ( adjacent ) {
                    // keep the larger end so a contained range cannot shrink the merged one
                    lastRange.setEndOffset( Math.max(lastRange.getEndOffset(), currRange.getEndOffset()) );
                    itr.remove();
                } else {
                    lastRange = currRange;
                }
            }

            // toss anything shorter than the search string
            for ( Iterator<HitDetail> itr = ranges.iterator(); itr.hasNext(); ) {
                HitDetail range = itr.next();
                int len = range.getEndOffset() - range.getStartOffset();
                if ( len < this.searchString.length() ) {
                    itr.remove();
                }
            }

            if ( ranges.isEmpty() ) {
                entries.remove();
            }
        }
    }

    /**
     * Converts a hit map into a JSON array with one object per item.
     *
     * @param hits hits keyed by item
     * @param gson not used by this method; kept so the signature is unchanged
     * @return array of objects with "id", "name" and a "hits" array; start and end
     *         offsets are written as strings
     */
    private JsonArray hitsToJson(Map<HitItem, List<HitDetail>> hits, Gson gson) {
        JsonArray items = new JsonArray();
        for ( Entry<HitItem, List<HitDetail>> ent : hits.entrySet() ) {
            HitItem item = ent.getKey();
            JsonArray jsonHits = new JsonArray();
            for ( HitDetail detail : ent.getValue() ) {
                JsonObject jsonHit = new JsonObject();
                jsonHit.addProperty("start", Integer.toString(detail.getStartOffset()));
                jsonHit.addProperty("end", Integer.toString(detail.getEndOffset()));
                jsonHit.addProperty("percent", detail.percent);
                jsonHit.addProperty("fragment", detail.fragment);
                jsonHits.add(jsonHit);
            }
            JsonObject jsonItem = new JsonObject();
            jsonItem.addProperty("id", item.id);
            jsonItem.addProperty("name", item.name);
            jsonItem.add("hits", jsonHits);
            items.add(jsonItem);
        }
        return items;
    }

    /**
     * Appends a copy of the given range to the hit list of an item, creating the
     * list on first use.
     *
     * @param hitMap hits keyed by item; modified in place
     * @param itemId id of the item that was hit
     * @param name name of the item that was hit
     * @param range offsets of the matched term
     * @return number of ranges now recorded for the item
     */
    private int addHit(Map<HitItem, List<HitDetail>> hitMap, String itemId, String name, TermVectorOffsetInfo range) {
        HitItem hit = new HitItem(itemId, name);
        List<HitDetail> details = hitMap.get(hit);
        if ( details == null ) {
            details = new ArrayList<HitDetail>();
            hitMap.put(hit, details);
        }
        details.add( new HitDetail(range) );
        return details.size();
    }

    /** Three-way comparison of two ints that cannot overflow. */
    private static int compareInts(int a, int b) {
        if ( a < b ) {
            return -1;
        }
        return ( a > b ) ? 1 : 0;
    }

    /**
     * A character range within a document, extended with the text fragment that
     * surrounds it and its relative position in the text.
     */
    @SuppressWarnings("serial")
    private static class HitDetail extends TermVectorOffsetInfo {
        /** Text around the hit; empty until populated. */
        public String fragment;
        /** Start of the hit as a rounded percentage of the text length. */
        public int percent;

        /**
         * Copies the offsets of the given range.
         *
         * @param inf range to copy the start and end offsets from
         */
        public HitDetail ( TermVectorOffsetInfo inf ) {
            super();
            this.setEndOffset(inf.getEndOffset());
            this.setStartOffset(inf.getStartOffset());
            this.fragment = "";
            this.percent = 0;
        }
    }

    /**
     * Map key identifying the item that was hit. Two instances are equal when both
     * id and name are equal; either may be null.
     */
    private static class HitItem {
        public final String name;
        public final String id;

        public HitItem( String id, String name) {
            this.id = id;
            this.name = name;
        }

        @Override
        public String toString() {
            return "HitInfo ["+this.id+" "+this.name+"]";
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((id == null) ? 0 : id.hashCode());
            result = prime * result + ((name == null) ? 0 : name.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if ( this == obj ) {
                return true;
            }
            if ( obj == null || getClass() != obj.getClass() ) {
                return false;
            }
            HitItem other = (HitItem) obj;
            return sameText(id, other.id) && sameText(name, other.name);
        }

        /** Null-safe string equality. */
        private static boolean sameText(String a, String b) {
            return ( a == null ) ? ( b == null ) : a.equals(b);
        }
    }
}