package org.juxtasoftware.resource;

import java.io.IOException;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;

import org.apache.lucene.document.Document;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermFreqVector;
import org.apache.lucene.index.TermPositionVector;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.lucene.queryParser.ParseException;
import org.apache.lucene.queryParser.QueryParser;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopScoreDocCollector;
import org.apache.lucene.search.highlight.QueryTermExtractor;
import org.apache.lucene.search.highlight.WeightedTerm;
import org.juxtasoftware.dao.SourceDao;
import org.juxtasoftware.dao.WitnessDao;
import org.juxtasoftware.model.Source;
import org.juxtasoftware.model.Witness;
import org.restlet.data.Status;
import org.restlet.representation.Representation;
import org.restlet.resource.Get;
import org.restlet.resource.ResourceException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.config.BeanDefinition;
import org.springframework.context.annotation.Scope;
import org.springframework.stereotype.Service;

import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;

/**
 * Resource to search documents in a workspace for occurrences of text.
 * Runs an exact-phrase query against both source and witness documents in
 * the Lucene index, merges adjacent term hits into phrase-sized ranges, and
 * returns each hit with a surrounding text fragment as JSON.
 *
 * @author loufoster
 */
@Service
@Scope(BeanDefinition.SCOPE_PROTOTYPE)
public class Searcher extends BaseResource {

    /** Normalized phrase from the required "q" query parameter; null when missing. */
    private String searchString;

    @Autowired private IndexSearcher searcher;
    @Autowired private IndexReader indexReader;
    @Autowired private Integer hitsPerPage;
    @Autowired private SourceDao sourceDao;
    @Autowired private WitnessDao witnessDao;
    @Autowired private Integer fragSize;
    @Autowired private Integer phraseSlop;
    @Autowired private QueryParser queryParser;

    @Override
    protected void doInit() throws ResourceException {
        super.doInit();
        this.searchString = getQueryValue("q");
        if (this.searchString == null) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST, "Missing search query");
            // BUG FIX: the original fell through and dereferenced the null
            // query string below, throwing NullPointerException instead of
            // returning the 400 response.
            return;
        }
        // collapse runs of whitespace so the phrase query sees single spaces
        this.searchString = this.searchString.trim().replaceAll("\\s+", " ");
    }

    /**
     * Execute the phrase search and return JSON of the form
     * {"sources":[...], "witnesses":[...]} where each entry carries the item
     * id, name and a list of hits (start, end, percent, fragment).
     */
    @Get("json")
    public Representation search() {
        if (this.searchString == null) {
            // doInit already flagged the request as a bad request
            return toTextRepresentation("Missing search query");
        }
        try {
            Map<HitItem, List<HitDetail>> sourceHits = new HashMap<HitItem, List<HitDetail>>();
            Map<HitItem, List<HitDetail>> witnessHits = new HashMap<HitItem, List<HitDetail>>();
            LOG.info("Search for '" + this.searchString + "'");

            // build a phrase query to match the exact phrase entered
            TermQuery wsQuery = new TermQuery(new Term("workspace", this.workspace.getName()));
            TermQuery srcQuery = new TermQuery(new Term("type", "source"));
            TermQuery witQuery = new TermQuery(new Term("type", "witness"));
            Query phraseQ = this.queryParser.parse("\"" + this.searchString.trim() + "\"");

            // Do 2 searches, one in source one in witness. This makes sure
            // that they are treated equally wrt the top docs score; ie the
            // top x docs in both source and witness are returned.
            BooleanQuery query = new BooleanQuery();
            query.add(wsQuery, Occur.MUST);
            query.add(phraseQ, Occur.MUST);
            query.add(srcQuery, Occur.MUST);
            TopScoreDocCollector collector = TopScoreDocCollector.create(this.hitsPerPage, true);
            this.searcher.search(query, collector);
            ScoreDoc[] scoreDocs = collector.topDocs(0, this.hitsPerPage).scoreDocs;
            List<ScoreDoc> hits = new ArrayList<ScoreDoc>(Arrays.asList(scoreDocs));

            // now witnesses
            collector = TopScoreDocCollector.create(this.hitsPerPage, true);
            query = new BooleanQuery();
            query.add(wsQuery, Occur.MUST);
            query.add(phraseQ, Occur.MUST);
            query.add(witQuery, Occur.MUST);
            this.searcher.search(query, collector);
            scoreDocs = collector.topDocs(0, this.hitsPerPage).scoreDocs;
            hits.addAll(Arrays.asList(scoreDocs));
            LOG.info("Search for '" + this.searchString + "' yields " + hits.size() + " raw hits");

            // collect the character offsets of each query term in each hit doc
            WeightedTerm[] terms = QueryTermExtractor.getTerms(phraseQ);
            for (ScoreDoc scoreDoc : hits) {
                Document doc = this.searcher.doc(scoreDoc.doc);
                TermFreqVector tfVector = this.indexReader.getTermFreqVector(scoreDoc.doc, "content");
                TermPositionVector tpVector = (TermPositionVector) tfVector;
                for (int tid = 0; tid < terms.length; tid++) {
                    int termIdx = tfVector.indexOf(terms[tid].getTerm());
                    if (termIdx < 0) {
                        // BUG FIX: term absent from this doc's vector; the
                        // original passed -1 on to getOffsets and could NPE
                        continue;
                    }
                    TermVectorOffsetInfo[] offsets = tpVector.getOffsets(termIdx);
                    if (offsets == null) {
                        continue;
                    }
                    for (int j = 0; j < offsets.length; j++) {
                        String itemId = doc.get("itemId");
                        String name = doc.get("name");
                        if (doc.get("type").equals("source")) {
                            addHit(sourceHits, itemId, name, offsets[j]);
                        } else {
                            addHit(witnessHits, itemId, name, offsets[j]);
                        }
                    }
                }
            }

            mergeHits(sourceHits);
            getSourceFragments(sourceHits);
            mergeHits(witnessHits);
            getWitnessFragments(witnessHits);
            LOG.info("Search for '" + this.searchString + "' end result: "
                + sourceHits.size() + " source hits, " + witnessHits.size() + " witness hits");

            JsonObject json = new JsonObject();
            Gson gson = new Gson();
            JsonArray jsonSrcs = hitsToJson(sourceHits, gson);
            JsonArray jsonWits = hitsToJson(witnessHits, gson);
            json.add("sources", jsonSrcs);
            json.add("witnesses", jsonWits);
            return toTextRepresentation(json.toString());
        } catch (IOException e) {
            setStatus(Status.SERVER_ERROR_INTERNAL);
            LOG.error("Search failed", e);
            return toTextRepresentation("Search Failed");
        } catch (ParseException e) {
            setStatus(Status.CLIENT_ERROR_BAD_REQUEST);
            LOG.error("Invalid search query specified");
            return toTextRepresentation("Invalid search query specified");
        } finally {
            // NOTE(review): closing a shared @Autowired searcher looks
            // single-use-prototype by design (SCOPE_PROTOTYPE) — confirm the
            // bean definition supplies a fresh searcher per request
            try {
                this.searcher.close();
            } catch (IOException ignored) {
                // best-effort close; nothing useful to do here
            }
        }
    }

    /**
     * Resolve each source hit to its Source record, compute the percent
     * position and a surrounding text fragment for every hit range. Hits
     * whose source no longer exists are removed from the map.
     */
    private void getSourceFragments(Map<HitItem, List<HitDetail>> hits) {
        List<HitItem> deadHits = new ArrayList<HitItem>();
        for (Entry<HitItem, List<HitDetail>> ent : hits.entrySet()) {
            Long srcId = Long.parseLong(ent.getKey().id);
            Source src = this.sourceDao.find(this.workspace.getId(), srcId);
            if (src == null) {
                LOG.warn("Source " + srcId + " no longer exists");
                deadHits.add(ent.getKey());
                continue;
            }
            for (HitDetail detail : ent.getValue()) {
                loadFragment(this.sourceDao.getContentReader(src), detail,
                    src.getText().getLength(), src);
            }
        }
        // clean out dead stuff from search results
        for (HitItem hi : deadHits) {
            hits.remove(hi);
        }
    }

    /**
     * Resolve each witness hit to its Witness record, compute the percent
     * position and a surrounding text fragment for every hit range. Hits
     * whose witness no longer exists are removed from the map.
     */
    private void getWitnessFragments(Map<HitItem, List<HitDetail>> hits) {
        List<HitItem> deadHits = new ArrayList<HitItem>();
        for (Entry<HitItem, List<HitDetail>> ent : hits.entrySet()) {
            Long witId = Long.parseLong(ent.getKey().id);
            Witness wit = this.witnessDao.find(witId);
            if (wit == null) {
                LOG.warn("Witness " + witId + " no longer exists");
                deadHits.add(ent.getKey());
                continue;
            }
            for (HitDetail detail : ent.getValue()) {
                loadFragment(this.witnessDao.getContentStream(wit), detail,
                    wit.getText().getLength(), wit);
            }
        }
        // clean out dead stuff from search results
        for (HitItem hi : deadHits) {
            hits.remove(hi);
        }
    }

    /**
     * Fill in <code>detail.percent</code> and <code>detail.fragment</code>
     * by reading fragSize characters of context on either side of the hit
     * range from the given reader. The reader is always closed.
     * Shared by source and witness fragment extraction (the original
     * duplicated this logic and leaked the reader on every hit).
     *
     * @param reader     content stream positioned at the start of the text
     * @param detail     hit to decorate; offsets are character offsets
     * @param textLength total character length of the text
     * @param owner      source/witness object, used only for error logging
     */
    private void loadFragment(Reader reader, HitDetail detail, long textLength, Object owner) {
        float p = (float) detail.getStartOffset() / (float) textLength;
        detail.percent = Math.round(p * 100.0f);
        int start = Math.max(0, detail.getStartOffset() - this.fragSize);
        int end = Math.min(detail.getEndOffset() + this.fragSize, (int) textLength);
        char[] buf = new char[end - start];
        try {
            reader.skip(start);
            // BUG FIX: a single read() may return fewer chars than requested,
            // leaving NUL padding in the fragment; loop until full or EOF
            int filled = 0;
            while (filled < buf.length) {
                int got = reader.read(buf, filled, buf.length - filled);
                if (got == -1) {
                    break;
                }
                filled += got;
            }
            detail.fragment = new String(buf, 0, filled).trim();
            if (start > 0) {
                detail.fragment = "..." + detail.fragment;
            }
            if (end < textLength) {
                detail.fragment += "...";
            }
        } catch (IOException e) {
            LOG.error("Unable to get fragment for " + owner + " range: " + start + ", " + end);
        } finally {
            // BUG FIX: the original never closed the per-hit content reader
            try {
                reader.close();
            } catch (IOException ignored) {
                // best-effort close
            }
        }
    }

    /**
     * Sort each item's hit ranges, merge ranges separated by no more than
     * phraseSlop characters into a single phrase range, then drop any range
     * shorter than the search string and any item left with no ranges.
     */
    private void mergeHits(Map<HitItem, List<HitDetail>> hits) {
        for (Entry<HitItem, List<HitDetail>> ent : hits.entrySet()) {
            List<HitDetail> ranges = ent.getValue();
            Collections.sort(ranges, new Comparator<HitDetail>() {
                @Override
                public int compare(HitDetail a, HitDetail b) {
                    if (a.getStartOffset() != b.getStartOffset()) {
                        return a.getStartOffset() < b.getStartOffset() ? -1 : 1;
                    }
                    if (a.getEndOffset() != b.getEndOffset()) {
                        return a.getEndOffset() < b.getEndOffset() ? -1 : 1;
                    }
                    return 0;
                }
            });

            // merge adjacent ranges into a single range
            TermVectorOffsetInfo lastRange = null;
            for (Iterator<HitDetail> itr = ranges.iterator(); itr.hasNext();) {
                TermVectorOffsetInfo currRange = itr.next();
                if (lastRange != null
                    && lastRange.getEndOffset() + this.phraseSlop >= currRange.getStartOffset()) {
                    lastRange.setEndOffset(currRange.getEndOffset());
                    itr.remove();
                    continue;
                }
                lastRange = currRange;
            }

            // toss anything that's not at least the length of the search string
            for (Iterator<HitDetail> itr = ranges.iterator(); itr.hasNext();) {
                TermVectorOffsetInfo currRange = itr.next();
                int len = currRange.getEndOffset() - currRange.getStartOffset();
                if (len < this.searchString.length()) {
                    itr.remove();
                }
            }
        }

        // drop items whose ranges were all filtered away
        for (Iterator<Entry<HitItem, List<HitDetail>>> itr = hits.entrySet().iterator(); itr.hasNext();) {
            Entry<HitItem, List<HitDetail>> ent = itr.next();
            if (ent.getValue().isEmpty()) {
                itr.remove();
            }
        }
    }

    /**
     * Convert the hit map into the JSON array response structure.
     * (The gson parameter is retained for interface compatibility; building
     * uses the JsonObject/JsonArray API directly.)
     */
    private JsonArray hitsToJson(Map<HitItem, List<HitDetail>> hits, Gson gson) {
        JsonArray jsonArray = new JsonArray();
        for (Entry<HitItem, List<HitDetail>> ent : hits.entrySet()) {
            HitItem info = ent.getKey();
            JsonObject obj = new JsonObject();
            obj.addProperty("id", info.id);
            obj.addProperty("name", info.name);
            JsonArray jsonHits = new JsonArray();
            for (HitDetail detail : ent.getValue()) {
                JsonObject ob = new JsonObject();
                ob.addProperty("start", Integer.toString(detail.getStartOffset()));
                ob.addProperty("end", Integer.toString(detail.getEndOffset()));
                ob.addProperty("percent", detail.percent);
                ob.addProperty("fragment", detail.fragment);
                jsonHits.add(ob);
            }
            obj.add("hits", jsonHits);
            jsonArray.add(obj);
        }
        return jsonArray;
    }

    /**
     * Record one term-offset hit for the given item, creating the item's
     * hit list on first sight.
     *
     * @return the number of hits now recorded for the item
     */
    private int addHit(Map<HitItem, List<HitDetail>> hitMap, String itemId, String name,
            TermVectorOffsetInfo range) {
        HitItem hit = new HitItem(itemId, name);
        List<HitDetail> details = hitMap.get(hit);
        if (details == null) {
            details = new ArrayList<HitDetail>();
            hitMap.put(hit, details);
        }
        details.add(new HitDetail(range));
        return details.size();
    }

    /**
     * A single hit range plus its display decoration (percent position in
     * the text and surrounding fragment), filled in later by loadFragment.
     */
    @SuppressWarnings("serial")
    private static class HitDetail extends TermVectorOffsetInfo {
        public String fragment;
        public int percent;

        public HitDetail(TermVectorOffsetInfo inf) {
            super();
            this.setEndOffset(inf.getEndOffset());
            this.setStartOffset(inf.getStartOffset());
            this.fragment = "";
            this.percent = 0;
        }
    }

    /**
     * Immutable map key identifying one source or witness (id + name).
     * equals/hashCode are defined over both fields so it is safe as a
     * HashMap key.
     */
    private static class HitItem {
        public final String name;
        public final String id;

        public HitItem(String id, String name) {
            this.id = id;
            this.name = name;
        }

        @Override
        public String toString() {
            return "HitInfo [" + this.id + " " + this.name + "]";
        }

        @Override
        public int hashCode() {
            final int prime = 31;
            int result = 1;
            result = prime * result + ((id == null) ? 0 : id.hashCode());
            result = prime * result + ((name == null) ? 0 : name.hashCode());
            return result;
        }

        @Override
        public boolean equals(Object obj) {
            if (this == obj) {
                return true;
            }
            if (obj == null || getClass() != obj.getClass()) {
                return false;
            }
            HitItem other = (HitItem) obj;
            if (id == null) {
                if (other.id != null) {
                    return false;
                }
            } else if (!id.equals(other.id)) {
                return false;
            }
            if (name == null) {
                if (other.name != null) {
                    return false;
                }
            } else if (!name.equals(other.name)) {
                return false;
            }
            return true;
        }
    }
}