package com.trsst.server; import java.io.File; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.io.StringReader; import java.util.Date; import java.util.HashSet; import java.util.LinkedList; import java.util.List; import java.util.Set; import javax.xml.namespace.QName; import org.apache.abdera.Abdera; import org.apache.abdera.i18n.iri.IRI; import org.apache.abdera.model.Category; import org.apache.abdera.model.Element; import org.apache.abdera.model.Entry; import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.LongField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.AtomicReaderContext; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.IndexWriterConfig.OpenMode; import org.apache.lucene.index.Term; import org.apache.lucene.queryparser.flexible.core.QueryNodeException; import org.apache.lucene.queryparser.flexible.standard.StandardQueryParser; import org.apache.lucene.queryparser.flexible.standard.config.StandardQueryConfigHandler; import org.apache.lucene.queryparser.flexible.standard.parser.ParseException; import org.apache.lucene.search.Collector; import org.apache.lucene.search.Filter; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.NumericRangeFilter; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.Sort; import org.apache.lucene.search.SortField; import org.apache.lucene.search.TopDocs; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import org.w3c.dom.Node; import org.w3c.dom.NodeList; import org.w3c.dom.Text; import org.w3c.tidy.Tidy; import com.trsst.Common; import com.trsst.client.Client; /** * Manages storage and indexing of feed and entry documents, and delegates * storage of resources to another Storage instance. * * @author mpowers */ public class LuceneStorage implements Storage { /** * Shared abdera instance. */ private Abdera abdera; /** * Deletable storage delegate: used for caching feeds fetched from other * servers. Basically, if this storage went away, it would be no big deal. */ private Storage cacheStorage; /** * Persistent storage delegate: used for feeds managed by this server. This * is basically the user's primary backup of all entries created. */ private Storage persistentStorage; /* * Lucene readers/writers are thread-safe and shared instances are * recommended. */ private IndexWriter writer; private IndexReader reader; private Analyzer analyzer; /** * Default constructor manages individual feed, entry, and resource * documents with a FileStorage. * * @throws IOException */ public LuceneStorage() throws IOException { this(new FileStorage()); } /** * Manages index and calls to the specified storage delegate to handle * individual feed, entry, and resource persistence. * * @param delegate * @throws IOException */ public LuceneStorage(Storage delegate) throws IOException { this(delegate, null); } /** * Manages index and calls to the specified storage delegate to handle * individual feed, entry, and resource persistence. Any feeds managed by * this server will call to persistent storage rather than cache storage. * * @param delegate * @throws IOException */ public LuceneStorage(Storage cache, Storage persistent) throws IOException { cacheStorage = cache; persistentStorage = persistent; abdera = Abdera.getInstance(); Directory dir = FSDirectory.open(new File(Common.getServerRoot(), "entry.idx")); analyzer = new StandardAnalyzer(Version.LUCENE_46); IndexWriterConfig iwc = new IndexWriterConfig(Version.LUCENE_46, analyzer); iwc.setOpenMode(OpenMode.CREATE_OR_APPEND); writer = new IndexWriter(dir, iwc); writer.commit(); refreshReader(); } private void refreshReader() throws IOException { reader = DirectoryReader.open(writer, true); } /** * Returns feed ids with content hosted on this server. Feeds must be * ordered by most recent update. * * @param start * the start index from which to return results; if exceeds * bounds of available results, zero results are returned. * @param length * the maximum number of results to return; servers may return * fewer results. * @return the specified feed ids hosted on this server. */ public String[] getFeedIds(int start, int length) { return persistentStorage.getFeedIds(start, length); } private boolean isManaged(String feedId) { String[] feedIds = getFeedIds(0, 100); for (String id : feedIds) { if (id.equals(feedId)) { return true; } } return false; } private Storage getStorage(String feedId) { if (persistentStorage == null) { return cacheStorage; } if (isManaged(feedId)) { return persistentStorage; } return cacheStorage; } /** * Returns categories mentioned in content hosted on this server. Categories * should be ordered by most popular or recently used, or a combination of * both ("trending"). * * @param start * the start index from which to return results; if exceeds * bounds of available results, zero results are returned. * @param length * the maximum number of results to return; servers may return * fewer results. * @return the specified trending categories. */ public String[] getCategories(int start, int length) { // TODO: implement category tracking // return most frequent categories for the past 100 or 1000 entries return new String[0]; } public int getEntryCount(Date after, Date before, String query, String[] mentions, String[] tags, String verb) { return getEntryCountForFeedId(null, after, before, query, mentions, tags, verb); } public int getEntryCountForFeedId(String feedId, Date after, Date before, String search, String[] mentions, String[] tags, String verb) { try { Filter filter = buildRangeFilter(after, before); Query query = buildTextQuery(feedId, search, mentions, tags, verb); CountCollector collector = new CountCollector(); new IndexSearcher(reader).search(query, filter, collector); return collector.getCount(); } catch (IOException e) { log.error("Unexpected error getting entry count for feed: " + feedId, e); } catch (QueryNodeException e) { log.error("Unexpected error executing count query for feed: " + feedId, e); } return -1; } public String[] getEntryIds(int start, int length, Date after, Date before, String query, String[] mentions, String[] tags, String verb) { return _getEntryIdsForFeedId(null, start, length, after, before, query, mentions, tags, verb); } public long[] getEntryIdsForFeedId(String feedId, int start, int length, Date after, Date before, String query, String[] mentions, String[] tags, String verb) { String[] ids = _getEntryIdsForFeedId(feedId, start, length, after, before, query, mentions, tags, verb); long[] result = null; if (ids != null) { result = new long[ids.length]; int i = 0; int offset = feedId.length() + 1; // entry keys contain feed id for (String id : ids) { result[i++] = Long.parseLong(id.substring(offset), 16); } } return result; } private String[] _getEntryIdsForFeedId(String feedId, int start, int length, Date after, Date before, String search, String[] mentions, String[] tags, String verb) { try { Filter filter = buildRangeFilter(after, before); Query query = buildTextQuery(feedId, search, mentions, tags, verb); TopDocs hits = new IndexSearcher(reader).search(query, filter, start + length, new Sort(new SortField("updated", SortField.Type.LONG, true))); String[] result = new String[Math.min(length, hits.totalHits)]; int i = 0; String id; int replace; Set<String> fields = new HashSet<String>(); fields.add("entry"); // we only need the entry field for (ScoreDoc e : hits.scoreDocs) { id = new IndexSearcher(reader).doc(e.doc).get("entry"); replace = id.lastIndexOf('-'); if (replace != -1) { id = id.substring(0, replace) + ':' + id.substring(replace + 1); } result[i++] = id; } return result; } catch (IOException e) { log.error("Unexpected error getting query for feed: " + feedId, e); } catch (QueryNodeException e) { log.error("Unexpected error executing query for feed: " + feedId, e); } return null; } private Filter buildRangeFilter(Date after, Date before) { if (after == null && before == null) { return null; } long afterTime; if (after != null) { afterTime = after.getTime(); } else { // all entries are after this key afterTime = Long.MIN_VALUE; } long beforeTime; if (before != null) { beforeTime = before.getTime(); } else { // all entries are before this key beforeTime = Long.MAX_VALUE; } return NumericRangeFilter.newLongRange("updated", afterTime, beforeTime, false, false); } private Query buildTextQuery(String feedId, String search, String[] mentions, String[] tags, String verb) throws QueryNodeException { if (search == null) { search = ""; } // feedId = "M9Dvwqp4GcRJe6gh7p73bCcQk8dKLG19z"; // search = "feed:\"HSzp9eneHcqsp4Vdt9pMfP1Qy83FZZwmE\""; if (verb != null) { search = search + " verb:" + verb; } if (tags != null) { for (String tag : tags) { tag = tag.trim(); if (tag.startsWith(Common.FEED_URN_PREFIX)) { tag = tag.substring(Common.FEED_URN_PREFIX.length()); } if (tag.startsWith(Common.ENTRY_URN_PREFIX)) { tag = tag.substring(Common.ENTRY_URN_PREFIX.length()); } search = search + " tag:\"" + tag.toLowerCase() + "\""; } } if (mentions != null) { for (String mention : mentions) { mention = mention.trim(); if (mention.startsWith(Common.ACCOUNT_URN_PREFIX)) { int index = mention.indexOf(Common.ACCOUNT_URN_FEED_PREFIX); if (index != -1) { // feed id instead String id = mention.substring(index + Common.ACCOUNT_URN_FEED_PREFIX.length()); search = search + " tag:\"" + id + "\""; // truncate feed id and continue mention = mention.substring(0, index); } mention = mention.substring(Common.ACCOUNT_URN_PREFIX .length()); } if (mention.startsWith(Common.FEED_URN_PREFIX)) { mention = mention .substring(Common.FEED_URN_PREFIX.length()); } if (mention.startsWith(Common.ENTRY_URN_PREFIX)) { mention = mention.substring(Common.ENTRY_URN_PREFIX .length()); } // mentions treated as tags in index search = search + " tag:\"" + mention + "\""; } } if (feedId != null) { search = "feed:\"" + feedId + "\"" + search; } if (search.trim().length() == 0) { log.trace("No search parameters: " + search); search = "*"; // return everything } StandardQueryParser parser = new StandardQueryParser(); parser.setDefaultOperator(StandardQueryConfigHandler.Operator.AND); try { return parser.parse(search, "text"); } catch (ParseException se) { log.error("Could not parse query: " + search); throw se; } } /** * Returns the contents of the unmodified feed element which was previously * passed to updateFeed for the specified feed; otherwise throws * FileNotFoundException. * * @param feedId * the specified feed. * @return a signed feed element and child elements but excluding entry * elements. * @throws FileNotFoundException * if the specified feed does not exist on this server. * @throws IOException * if an error occurs obtaining the entry data. */ public String readFeed(String feedId) throws FileNotFoundException, IOException { return getStorage(feedId).readFeed(feedId); } /** * Receives the contents of a signed feed element to be stored and * associated with the specified feed. The retured string contains a signed * feed element and holds all meta-data attributes associated with the feed. * These contents may be inspected, analyzed, and indexed, but must be * returned unmodifed to callers of readFeed() so the signature remains * intact. Note that the feed element DOES NOT contain entry elements. * * @param feedId * the specified feed. * @param lastUpdated * the datetime when this feed says it was last updated; used for * time range queries * @param feed * the contents to be persisted. * @throws IOException * if a error occurs persisting the entry data. */ public void updateFeed(String feedId, Date lastUpdated, String content) throws IOException { try { // NOTE: not yet sure if we want to index feeds // as we haven't exposed a way to search them // Feed feed = (Feed) abdera.getParser() // .parse(new StringReader(content)).getRoot(); // if (feed.getTitle() != null) { // Document document = new Document(); // document.add(new StringField("feed", feedId, Field.Store.NO)); // document.add(new TextField("title", feed.getTitle(), // Field.Store.NO)); // document.add(new TextField("subtitle", feed.getSubtitle(), // Field.Store.NO)); // Person author = feed.getAuthor(); // if (author != null) { // if (author.getName() != null) { // document.add(new TextField("name", author.getName(), // Field.Store.NO)); // } // if (author.getEmail() != null) { // document.add(new StringField("address", author // .getEmail(), Field.Store.NO)); // } // } // getStorage(feedId).updateFeed(feedId, lastUpdated, content); // feedWriter.updateDocument(new Term("feed", feedId), document); // } } catch (Throwable t) { log.error( "Error from update feed: " + feedId + " : " + lastUpdated, t); throw new IOException("Could not parse input for: " + feedId + " : " + t.getMessage()); } } /** * Returns the contents of a signed entry element for the specified feed * which was previously passed to updateFeedEntry. * * @param feedId * the specified feed. * @param entryId * the desired entry for the specified feed. * @return a signed entry element. * @throws FileNotFoundException * if the specified entry does not exist. * @throws IOException * if a error occurs obtaining the entry data. */ public String readEntry(String feedId, long entryId) throws FileNotFoundException, IOException { return getStorage(feedId).readEntry(feedId, entryId); } /** * Receives the contents of a signed entry element to be stored and * associated with the specified feed and unique identifier for later * retrieval by readFeedEntry(). These contents may be inspected, analyzed, * and indexed, but must be returned unmodifed to callers of readEntry() so * the signature remains intact. * * @param feedId * the specified feed. * @param entryId * the unique identifier for the entry to be persisted. * @param publishDate * the datetime when this entry says it was or will be published; * used for date/time range queries * @param entry * an entry element whose contents are to be persisted. * @throws IOException * if a error occurs persisting the entry data. */ public void updateEntry(String feedId, long entryId, Date publishDate, String content) throws IOException { try { Entry entry = (Entry) abdera.getParser() .parse(new StringReader(content)).getRoot(); // we also accumulate categories, mentions, and verbs into // a single combined multivalue string index Set<String> tags = new HashSet<String>(); // get verb String verb = null; // "post" is default verb Element verbElement = entry.getExtension(new QName( "http://activitystrea.ms/spec/1.0/", "verb", "activity")); if (verbElement != null) { if (verbElement.getText() != null) { verb = verbElement.getText().trim().toLowerCase(); while (verb.length() > 0 && (verb.charAt(0) == '#' || verb.charAt(0) == '@')) { // strip our "special" characters verb = verb.substring(1); } } } if (verb == null || verb.length() == 0) { verb = "post"; // "post" is default verb } tags.add(verb); // get categories List<Category> categories = entry.getCategories(); if (categories != null) { for (Category e : categories) { IRI scheme = e.getScheme(); if (scheme != null && (Common.TAG_URN.equals(scheme.toString()) || Common.TAG_URN_LEGACY .equals(scheme.toString()))) { if (e.getTerm() != null) { tags.add('#' + e.getTerm().trim().toLowerCase()); } } else if (scheme != null && (Common.MENTION_URN.equals(scheme.toString()) || Common.MENTION_URN_LEGACY .equals(scheme.toString()))) { String mention = e.getTerm(); if (mention != null) { mention = mention.trim(); if (mention.startsWith(Common.ACCOUNT_URN_PREFIX)) { int index = mention .indexOf(Common.ACCOUNT_URN_FEED_PREFIX); if (index != -1) { // feed id instead String id = mention.substring(index + Common.ACCOUNT_URN_FEED_PREFIX .length()); tags.add('@' + id); // truncate feed id and continue mention = mention.substring(0, index); } mention = mention .substring(Common.ACCOUNT_URN_PREFIX .length()); } if (mention.startsWith(Common.FEED_URN_PREFIX)) { mention = mention .substring(Common.FEED_URN_PREFIX .length()); } if (mention.startsWith(Common.ENTRY_URN_PREFIX)) { mention = mention .substring(Common.ENTRY_URN_PREFIX .length()); } tags.add('@' + mention); } } } } // convert to list and persist List<String> converted = new LinkedList<String>(); for (String tag : tags) { converted.add(tag); } // extract fields for full-text search index Document document = new Document(); StringBuffer text = new StringBuffer(); document.add(new StringField("entry", getEntryKeyString(feedId, entryId), Field.Store.YES)); text.append(entryId).append(' '); document.add(new StringField("feed", feedId, Field.Store.NO)); text.append(feedId).append(' '); document.add(new StringField("verb", verb, Field.Store.NO)); text.append(verb).append(' '); document.add(new LongField("updated", entryId, Field.Store.NO)); text.append(verb).append(' '); if (entry.getTitle() != null) { String title = entry.getTitle().toLowerCase(); document.add(new TextField("title", title, Field.Store.NO)); text.append(title).append(' '); } if (entry.getSummary() != null) { String summary = extractTextFromHtml(entry.getSummary()) .toLowerCase(); // System.out.println("extracting: " + summary); document.add(new TextField("summary", summary, Field.Store.NO)); text.append(summary).append(' '); } tags.remove(verb); // don't treat verb as tag in full-text search for (String tag : tags) { tag = tag.substring(1); // remove @ or # document.add(new StringField("tag", tag, Field.Store.NO)); text.append(tag).append(' '); } document.add(new TextField("text", text.toString(), Field.Store.NO)); // persist the document getStorage(feedId).updateEntry(feedId, entryId, publishDate, content); writer.updateDocument( new Term("entry", getEntryKeyString(feedId, entryId)), document); writer.commit(); refreshReader(); } catch (Throwable t) { log.error("Error from update entry: " + feedId + " : " + entryId, t); throw new IOException("Could not parse input for: " + getEntryKeyString(feedId, entryId) + " : " + t.getMessage()); } } // borrowed from lai-xin-chu: http://stackoverflow.com/questions/12576119 private String extractTextFromHtml(String html) { Tidy tidy = new Tidy(); tidy.setQuiet(true); tidy.setShowWarnings(false); org.w3c.dom.Document root = tidy.parseDOM(new StringReader(html), null); return getText(root.getDocumentElement()); } // borrowed from lai-xin-chu: http://stackoverflow.com/questions/12576119 private String getText(Node node) { NodeList children = node.getChildNodes(); StringBuffer sb = new StringBuffer(); for (int i = 0; i < children.getLength(); i++) { Node child = children.item(i); switch (child.getNodeType()) { case Node.ELEMENT_NODE: sb.append(getText(child)); sb.append(" "); break; case Node.TEXT_NODE: sb.append(((Text) child).getData()); break; } } return sb.toString(); } /** * Delete an existing entry for the specified feed. * * @param feedId * the specified feed. * @param entryId * the desired entry for the specified feed. * @throws FileNotFoundException * if the specified entry does not exist. * @throws IOException * if a error occurs while deleting the entry data. */ public void deleteEntry(String feedId, long entryId) throws FileNotFoundException, IOException { try { writer.deleteDocuments(new Term("entry", getEntryKeyString(feedId, entryId))); writer.commit(); refreshReader(); } catch (Throwable t) { log.error("Unexpected error from delete entry: " + feedId + " : " + entryId, t); throw new IOException("Unexpected error while deleting: " + getEntryKeyString(feedId, entryId) + " : " + t.getMessage()); } getStorage(feedId).deleteEntry(feedId, entryId); } private static final String getEntryKeyString(String feedId, long entityId) { return feedId + '-' + Long.toHexString(entityId); } /** * Returns the mime-type of the contents of the resource data for the * specified entry for the specified feed, if known. If not known, returns * null. * * @param feedId * the specified feed. * @param entryId * the specified entry. * @param resourceId * the desired resource id for the specified feed and entry. * @return the mime type of the resource, or null if not known. * @throws FileNotFoundException * if the specified resource does not exist on this server. * @throws IOException * if a error occurs obtaining the resource data. */ public String readFeedEntryResourceType(String feedId, long entryId, String resourceId) throws FileNotFoundException, IOException { return getStorage(feedId).readFeedEntryResourceType(feedId, entryId, resourceId); } /** * Obtains an input stream to read the contents of the resource data for the * specified entry for the specified feed. Callers must close the input * stream when finished. * * @param feedId * the specified feed. * @param entryId * the specified entry. * @param resourceId * the desired resource id for the specified feed and entry. * @return an input stream to read the contents of the resource. * @throws FileNotFoundException * if the specified entry does not exist. * @throws IOException * if a error occurs obtaining the resource data. */ public InputStream readFeedEntryResource(String feedId, long entryId, String resourceId) throws FileNotFoundException, IOException { return getStorage(feedId).readFeedEntryResource(feedId, entryId, resourceId); } /** * Stores a binary resource for the specified feed and entry by reading the * specified input stream and persisting the contents for later retrieval by * readFeedEntryResource(). Implementors must close the input stream when * finished. * * @param feedId * the specified feed. * @param entryId * the specified entry. * @param resourceId * the desired resource id for the specified feed and entry. * @param mimeType * the mime type of the data if known, otherwise null. * @param publishDate * the datetime when the associated entry says it was or will be * published; used for date/time range queries * @param data * an input stream whose contents are to be persisted. * @throws IOException * if a error occurs persisting the resource data. */ public void updateFeedEntryResource(String feedId, long entryId, String resourceId, String mimeType, Date publishDate, byte[] data) throws IOException { getStorage(feedId).updateFeedEntryResource(feedId, entryId, resourceId, mimeType, publishDate, data); } /** * Delete an existing resource for the specified feed and entry. * * @param feedId * the specified feed. * @param entryId * the specified entry. * @param resourceId * the desired resource id for the specified feed and entry. * @throws IOException * if a error occurs while deleting the resource data. */ public void deleteFeedEntryResource(String feedId, long entryId, String resourceId) throws IOException { getStorage(feedId).deleteFeedEntryResource(feedId, entryId, resourceId); } private final static org.slf4j.Logger log = org.slf4j.LoggerFactory .getLogger(Client.class); private static class CountCollector extends Collector { int count; @Override public void setScorer(Scorer scorer) throws IOException { // ignore } @Override public void collect(int doc) throws IOException { count++; } @Override public void setNextReader(AtomicReaderContext context) throws IOException { // ignore } @Override public boolean acceptsDocsOutOfOrder() { return true; } public int getCount() { return count; } } }