package sample.save2dropbox.business; import java.io.File; import java.io.IOException; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.queryparser.classic.QueryParser; import org.apache.lucene.search.BooleanClause.Occur; import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.Version; import rfx.server.util.StringUtil; import sample.save2dropbox.model.Item; /** * @author trieu * why not Solr or Elastic Seach, we need consider (Jimfs is an in-memory file system for lucene's Directory) https://github.com/google/jimfs * */ public class SearchEngineLucene { static void addUserWithKeyword(IndexWriter w, Item item) throws IOException { Document doc = new Document(); doc.add(new TextField("keywords", item.getKeywordsAsString(), Field.Store.YES)); doc.add(new StringField("user_id", item.getUser_id()+"", Field.Store.YES)); doc.add(new StringField("title", item.getTitle(), Field.Store.YES)); doc.add(new StringField("post_id", item.getPost_id()+"", Field.Store.YES)); doc.add(new StringField("dp_link", item.getDp_link(), Field.Store.YES)); doc.add(new StringField("link", item.getLink(), Field.Store.YES)); w.addDocument(doc); } static Item documentToItem(Document doc){ int post_id = StringUtil.safeParseInt(doc.get("post_id")); String keywords = doc.get("keywords"); String title = doc.get("title"); String dp_link = doc.get("dp_link"); String link = doc.get("link"); int user_id = StringUtil.safeParseInt(doc.get("user_id")); return new Item(post_id, keywords, dp_link, title, link, user_id); } public static boolean indexItems(List<Item> items){ boolean create = false; Directory directory = null; try { // 0. Specify the analyzer for tokenizing text. // The same analyzer should be used for indexing and searching // 1. create the index File indexDirFile = new File("data/lucene-index"); if ( ! indexDirFile.exists() || !indexDirFile.isDirectory()) { create = indexDirFile.mkdir(); } else { create = true; } System.out.println(indexDirFile.getAbsolutePath()); StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_47); // To store an index on disk directory = FSDirectory.open(indexDirFile); IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_47, analyzer); IndexWriter w = new IndexWriter(directory, config); for (Item item : items) { addUserWithKeyword(w, item); } w.commit(); w.close(); } catch (Exception e) { e.printStackTrace(); } finally { if(directory != null){ try { directory.close(); } catch (IOException e) {} } } return create; } public static List<Item> searchItemsByKeywords(List<String> keywords, int user_id){ List<Item> matchedItems = new ArrayList<>(); Directory directory = null; try { // 0. Specify the analyzer for tokenizing text. // The same analyzer should be used for indexing and searching // 1. the index boolean create = false; File indexDirFile = new File("data/lucene-index"); if ( ! indexDirFile.exists() || !indexDirFile.isDirectory()) { create = indexDirFile.mkdir(); } else { create = true; } System.out.println(indexDirFile.getAbsolutePath()); StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_47); // To store an index on disk directory = FSDirectory.open(indexDirFile); // 2. query StringBuilder querystr = new StringBuilder(); for (String keyword : keywords) { querystr.append("\"").append(keyword).append("\" "); } System.out.println(querystr); Query kq = new QueryParser(Version.LUCENE_47, "keywords", analyzer).parse(querystr.toString().trim()); BooleanQuery query = new BooleanQuery(); query.add(kq, Occur.MUST); if(user_id > 0){ //only recommend items that not owned by user Query uq = new QueryParser(Version.LUCENE_47, "user_id", analyzer).parse("\""+user_id+"\""); query.add(uq, Occur.MUST_NOT); } // 3. search int hitsPerPage = 10; IndexReader reader = DirectoryReader.open(directory); IndexSearcher searcher = new IndexSearcher(reader); TopScoreDocCollector collector = TopScoreDocCollector.create(hitsPerPage, true); searcher.search(query, collector); ScoreDoc[] hits = collector.topDocs().scoreDocs; // 4. display results System.out.println("Found " + hits.length + " hits."); for (int i = 0; i < hits.length; ++i) { int docId = hits[i].doc; Document d = searcher.doc(docId); matchedItems.add(documentToItem(d)); //System.out.println((i + 1) + ". " + d.get("keywords") + "\t"+ d.get("title")); } // is no need to access the documents any more. reader.close(); } catch (Exception e) { e.printStackTrace(); } finally { if(directory != null){ try { directory.close(); } catch (IOException e) {} } } return matchedItems; } public static void main(String[] args) { // indexItems(Arrays.asList( // new Item(5, "cloud computing, cloud storage", "http:/111", "item 5", "http:/111", 1) // ,new Item(6, "big data, cloud computing", "http:/111", "item 6", "http:/111", 1) // )); //indexItems(new ArrayList<Item>()); List<Item> items = searchItemsByKeywords(Arrays.asList("framework"), 0); for (Item item : items) { System.out.println(item); } } }