// TextSnippetTest.java example
//
// Explorer
// yacy_search_server-master

package net.yacy.search.snippet;

import java.net.MalformedURLException;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.search.query.QueryGoal;
import net.yacy.search.query.SearchEvent;
import net.yacy.search.schema.CollectionSchema;
import org.apache.solr.common.SolrDocument;
import static org.junit.Assert.*;
import org.junit.Before;
import org.junit.Test;


public class TextSnippetTest {

    // declare some required parameter
    final CacheStrategy cacheStrategy = CacheStrategy.CACHEONLY;
    final boolean pre = true;
    final int snippetMaxLength = SearchEvent.SNIPPET_MAX_LENGTH;
    final boolean reindexing = false;

    SolrDocument doc;

    public TextSnippetTest() {
    }

    @Before
    public void setUp() throws Exception {

        // prepare a empty test document
        doc = new SolrDocument();
        DigestURL url = new DigestURL("http://localhost/page.html");
        doc.addField(CollectionSchema.id.name(), ASCII.String(url.hash()));
        doc.addField(CollectionSchema.sku.name(), url.toNormalform(false));
        // for testcases add other fields
        // fields involved in snippet extraction:
        // url, title, keywords, author, text_t
    }

    @Test
    public void testTextSnippet() throws MalformedURLException {

        URIMetadataNode testpage = new URIMetadataNode(doc);
        testpage.addField(CollectionSchema.title.name(), "New test case");
        testpage.addField(CollectionSchema.keywords.name(), "junit");
        testpage.addField(CollectionSchema.author.name(), "test author");
        testpage.addField(CollectionSchema.text_t.name(), "A new testcase has been introduced. "
                + "It includes a few test lines and one line that should match.");

        String querywords = "testcase line";
        QueryGoal qg = new QueryGoal(querywords);
        HandleSet queryhashes = qg.getIncludeHashes();

        TextSnippet ts = new TextSnippet(
                null,
                testpage,
                queryhashes,
                cacheStrategy,
                pre,
                snippetMaxLength,
                reindexing
        );
        String rstr = ts.getError();
        assertEquals("testTextSnippet Error Code: ", "", rstr);

        String[] wordlist = CommonPattern.SPACE.split(querywords);
        rstr = ts.toString();
        System.out.println("testTextSnippet: query=" + querywords);
        System.out.println("testTextSnippet: snippet=" + rstr);
        // check words included in snippet
        for (String word : wordlist) {
            assertTrue("testTextSnippet word included " + word, rstr.contains(word));
        }

    }

    /**
     * Test of getLineMarked method, of class TextSnippet.
     */
    @Test
    public void testGetLineMarked() throws MalformedURLException {
        URIMetadataNode testpage = new URIMetadataNode(doc);
        testpage.addField(CollectionSchema.title.name(), "New test case");
        testpage.addField(CollectionSchema.keywords.name(), "junit");
        testpage.addField(CollectionSchema.author.name(), "test author");
        testpage.addField(CollectionSchema.text_t.name(),
                "A new testcase has been introduced. "
                + "It includes a few test lines and one line that should match.");

        String querywords = "testcase line";
        QueryGoal qg = new QueryGoal(querywords);
        HandleSet queryhashes = qg.getIncludeHashes();

        TextSnippet ts = new TextSnippet(
                null,
                testpage,
                queryhashes,
                cacheStrategy,
                pre,
                snippetMaxLength,
                reindexing
        );

        String rstr = ts.getError();
        assertEquals("testGetLineMarked Error Code: ", "", rstr);

        // check words marked in snippet
        rstr = ts.getLineMarked(qg);
        System.out.println("testGetLineMarked: query=" + querywords);
        System.out.println("testGetLineMarked: snippet=" + rstr);
        String[] wordlist = CommonPattern.SPACE.split(querywords);
        for (String wordstr : wordlist) {
            assertTrue("testGetLineMarked marked word " + wordstr, rstr.contains("<b>" + wordstr + "</b>"));
        }
    }

    /**
     * Test of descriptionline method, of class TextSnippet.
     * checking poper encoding of remaining html in raw snippet line.
     */
    @Test
    public void testDescriptionline() throws MalformedURLException {
        String rawtestline = "Über großer test case </span> <pre> <hr><hr /></pre>"; // test line with html, risk of snippet format issue

        DigestURL url = new DigestURL("http://localhost/page.html");
        QueryGoal qg = new QueryGoal("test");

        // test with raw line (no marking added by YaCy)
        TextSnippet ts = new TextSnippet(
            url.hash(),
            rawtestline,
            true, // isMarked,
            TextSnippet.ResultClass.SOURCE_METADATA, "");

        String sniptxt = ts.descriptionline(qg); // snippet text for display
        System.out.println("testDescriptionline: snippet=" + sniptxt);
        assertFalse ("HTML code not allowed in snippet text",sniptxt.contains("<pre>")); // display text not to include unwanted html

        // test with marking of query word
         ts = new TextSnippet(
            url.hash(),
            rawtestline,
            false, // isMarked,
            TextSnippet.ResultClass.SOURCE_METADATA, "");

        sniptxt = ts.descriptionline(qg);
        System.out.println("testDescriptionline: snippet=" + sniptxt);
        assertFalse ("HTML code not allowed in snippet text",sniptxt.contains("<pre>")); // display text not to include unwanted html
        assertTrue ("Query word not marked", sniptxt.contains("<b>test</b>")); // query word to be marked

        // test text with some numbers (english/german format)
        rawtestline = "Test Version 1.83 calculates pi to 3,14 always";
        ts = new TextSnippet(
            url.hash(),
            rawtestline,
            false, // isMarked,
            TextSnippet.ResultClass.SOURCE_METADATA, "");
        sniptxt = ts.descriptionline(qg);
        System.out.println("testDescriptionline: (with numbers) snippet="+sniptxt);
        assertTrue ("number (.) broken up",sniptxt.contains("1.83"));
        assertTrue ("number (,) broken up",sniptxt.contains("3,14"));
    }
}