package net.yacy.search.snippet; import java.net.MalformedURLException; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.CommonPattern; import net.yacy.kelondro.data.meta.URIMetadataNode; import net.yacy.search.query.QueryGoal; import net.yacy.search.query.SearchEvent; import net.yacy.search.schema.CollectionSchema; import org.apache.solr.common.SolrDocument; import static org.junit.Assert.*; import org.junit.Before; import org.junit.Test; public class TextSnippetTest { // declare some required parameter final CacheStrategy cacheStrategy = CacheStrategy.CACHEONLY; final boolean pre = true; final int snippetMaxLength = SearchEvent.SNIPPET_MAX_LENGTH; final boolean reindexing = false; SolrDocument doc; public TextSnippetTest() { } @Before public void setUp() throws Exception { // prepare a empty test document doc = new SolrDocument(); DigestURL url = new DigestURL("http://localhost/page.html"); doc.addField(CollectionSchema.id.name(), ASCII.String(url.hash())); doc.addField(CollectionSchema.sku.name(), url.toNormalform(false)); // for testcases add other fields // fields involved in snippet extraction: // url, title, keywords, author, text_t } @Test public void testTextSnippet() throws MalformedURLException { URIMetadataNode testpage = new URIMetadataNode(doc); testpage.addField(CollectionSchema.title.name(), "New test case"); testpage.addField(CollectionSchema.keywords.name(), "junit"); testpage.addField(CollectionSchema.author.name(), "test author"); testpage.addField(CollectionSchema.text_t.name(), "A new testcase has been introduced. " + "It includes a few test lines and one line that should match."); String querywords = "testcase line"; QueryGoal qg = new QueryGoal(querywords); HandleSet queryhashes = qg.getIncludeHashes(); TextSnippet ts = new TextSnippet( null, testpage, queryhashes, cacheStrategy, pre, snippetMaxLength, reindexing ); String rstr = ts.getError(); assertEquals("testTextSnippet Error Code: ", "", rstr); String[] wordlist = CommonPattern.SPACE.split(querywords); rstr = ts.toString(); System.out.println("testTextSnippet: query=" + querywords); System.out.println("testTextSnippet: snippet=" + rstr); // check words included in snippet for (String word : wordlist) { assertTrue("testTextSnippet word included " + word, rstr.contains(word)); } } /** * Test of getLineMarked method, of class TextSnippet. */ @Test public void testGetLineMarked() throws MalformedURLException { URIMetadataNode testpage = new URIMetadataNode(doc); testpage.addField(CollectionSchema.title.name(), "New test case"); testpage.addField(CollectionSchema.keywords.name(), "junit"); testpage.addField(CollectionSchema.author.name(), "test author"); testpage.addField(CollectionSchema.text_t.name(), "A new testcase has been introduced. " + "It includes a few test lines and one line that should match."); String querywords = "testcase line"; QueryGoal qg = new QueryGoal(querywords); HandleSet queryhashes = qg.getIncludeHashes(); TextSnippet ts = new TextSnippet( null, testpage, queryhashes, cacheStrategy, pre, snippetMaxLength, reindexing ); String rstr = ts.getError(); assertEquals("testGetLineMarked Error Code: ", "", rstr); // check words marked in snippet rstr = ts.getLineMarked(qg); System.out.println("testGetLineMarked: query=" + querywords); System.out.println("testGetLineMarked: snippet=" + rstr); String[] wordlist = CommonPattern.SPACE.split(querywords); for (String wordstr : wordlist) { assertTrue("testGetLineMarked marked word " + wordstr, rstr.contains("<b>" + wordstr + "</b>")); } } /** * Test of descriptionline method, of class TextSnippet. * checking poper encoding of remaining html in raw snippet line. */ @Test public void testDescriptionline() throws MalformedURLException { String rawtestline = "Über großer test case </span> <pre> <hr><hr /></pre>"; // test line with html, risk of snippet format issue DigestURL url = new DigestURL("http://localhost/page.html"); QueryGoal qg = new QueryGoal("test"); // test with raw line (no marking added by YaCy) TextSnippet ts = new TextSnippet( url.hash(), rawtestline, true, // isMarked, TextSnippet.ResultClass.SOURCE_METADATA, ""); String sniptxt = ts.descriptionline(qg); // snippet text for display System.out.println("testDescriptionline: snippet=" + sniptxt); assertFalse ("HTML code not allowed in snippet text",sniptxt.contains("<pre>")); // display text not to include unwanted html // test with marking of query word ts = new TextSnippet( url.hash(), rawtestline, false, // isMarked, TextSnippet.ResultClass.SOURCE_METADATA, ""); sniptxt = ts.descriptionline(qg); System.out.println("testDescriptionline: snippet=" + sniptxt); assertFalse ("HTML code not allowed in snippet text",sniptxt.contains("<pre>")); // display text not to include unwanted html assertTrue ("Query word not marked", sniptxt.contains("<b>test</b>")); // query word to be marked // test text with some numbers (english/german format) rawtestline = "Test Version 1.83 calculates pi to 3,14 always"; ts = new TextSnippet( url.hash(), rawtestline, false, // isMarked, TextSnippet.ResultClass.SOURCE_METADATA, ""); sniptxt = ts.descriptionline(qg); System.out.println("testDescriptionline: (with numbers) snippet="+sniptxt); assertTrue ("number (.) broken up",sniptxt.contains("1.83")); assertTrue ("number (,) broken up",sniptxt.contains("3,14")); } }