package net.yacy.search.index;
import java.io.File;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Map;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Tokenizer;
import net.yacy.document.VocabularyScraper;
import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.data.word.WordReference;
import net.yacy.kelondro.data.word.WordReferenceRow;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.rwi.ReferenceFactory;
import net.yacy.kelondro.rwi.TermSearch;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.search.query.QueryGoal;
import org.junit.AfterClass;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import org.junit.BeforeClass;
import org.junit.Test;
public class SegmentTest {
static Segment index;
/**
* Setup RWI index
*
* @throws IOException
*/
@BeforeClass
public static void setUpClass() throws IOException {
// setup a index segment
index = new Segment(new ConcurrentLog("SegmentTest"),
new File("test/DATA/INDEX/webportal/SEGMENTS"),
new File("test/DATA/INDEX/webportal/ARCHIVE"),
null, null);
// connect RWI index
index.connectRWI(10, 1024);
}
@AfterClass
public static void tearDownClass() {
index.close();
ConcurrentLog.shutdown();
}
/**
* Test of clear method (for RWI), of class Segment.
*/
@Test
public void testClear() throws MalformedURLException, IOException, SpaceExceededException {
DigestURL url = new DigestURL("http://test.org/test.html");
int urlComps = MultiProtocolURL.urlComps(url.toNormalform(true)).length;
int urlLength = url.toNormalform(true).length();
byte[] termHash = Word.word2hash("test");
Word word = new Word(1, 1, 1);
word.flags = new Bitfield(4); // flags must not be null
WordReferenceRow ientry = new WordReferenceRow(
url.hash(), urlLength, urlComps, 0, 1, 1,
System.currentTimeMillis(), System.currentTimeMillis(),
UTF8.getBytes("en"), Response.DT_TEXT, 0, 0);
ientry.setWord(word);
// add a dummy Word and WordReference
index.termIndex.add(termHash, ientry);
// check index count
long cnt = index.RWICount();
assertTrue(cnt > 0);
index.clear();
// check index count after clear
cnt = index.RWICount();
assertTrue(cnt == 0);
}
/**
* Helper to store a text to the rwi index. This was derived from the
* Segment.storeDocument() procedure.
*
* @param text of the document
* @throws IOException
* @throws SpaceExceededException
*/
private void storeTestDocTextToTermIndex(DigestURL url, String text) throws IOException, SpaceExceededException {
// set a pseudo url for the simulated test document
final String urlNormalform = url.toNormalform(true);
String dc_title = "Test Document";
// STORE PAGE INDEX INTO WORD INDEX DB
// create a word prototype which is re-used for all entries
if (index.termIndex != null) {
final int outlinksSame = 0;
final int outlinksOther = 0;
final int urlLength = urlNormalform.length();
final int urlComps = MultiProtocolURL.urlComps(url.toNormalform(false)).length;
final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val
WordCache meaningLib = new WordCache(null);
boolean doAutotagging = false;
VocabularyScraper scraper = null;
Tokenizer t = new Tokenizer(url, text, meaningLib, doAutotagging, scraper);
// create a WordReference template
final WordReferenceRow ientry = new WordReferenceRow(
url.hash(), urlLength, urlComps, wordsintitle,
t.RESULT_NUMB_WORDS, t.RESULT_NUMB_SENTENCES,
System.currentTimeMillis(), System.currentTimeMillis(),
UTF8.getBytes("en"), Response.DT_TEXT,
outlinksSame, outlinksOther);
// add the words to rwi index
Word wprop = null;
byte[] wordhash;
String word;
for (Map.Entry<String, Word> wentry : t.words().entrySet()) {
word = wentry.getKey();
wprop = wentry.getValue();
assert (wprop.flags != null);
ientry.setWord(wprop);
wordhash = Word.word2hash(word);
if (this.index != null) {
index.termIndex.add(wordhash, ientry);
}
}
}
}
/**
* Simulates a multi word query for the rwi termIndex
*
* @throws SpaceExceededException
* @throws MalformedURLException
* @throws IOException
*/
@Test
public void testQuery_MultiWordQuery() throws SpaceExceededException, MalformedURLException, IOException {
// creates one test url with this text in the rwi index
DigestURL url = new DigestURL("http://test.org/test.html");
storeTestDocTextToTermIndex(url, "One Two Three Four Five. This is a test text. One two three for five");
// posintext 1 2 3 4 5 6 7 8 9
// hitcount ("five") 1 1 2
// posofphrase |-------100------------| |------101---------| |--------102----------|
// posinphrase 1 2 3 4 5 1 2 3 4 1 2 3 4 5
// create a query to get the search word hashsets
QueryGoal qg = new QueryGoal("five test ");
HandleSet queryHashes = qg.getIncludeHashes();
HandleSet excludeHashes = qg.getExcludeHashes();
HandleSet urlselection = null;
ReferenceFactory<WordReference> termFactory = Segment.wordReferenceFactory;
// do the search
TermSearch<WordReference> result = index.termIndex.query(queryHashes, excludeHashes, urlselection, termFactory, Integer.MAX_VALUE);
// get the joined results
ReferenceContainer<WordReference> wc = result.joined();
// we should have now one result (stored to index above)
assertTrue("test url hash in result set", wc.has(url.hash()));
// the returned WordReference is expected to be a joined Reference with properties set used in ranking
WordReference r = wc.getReference(url.hash());
// min position of search word in text (posintext)
assertEquals("min posintext('five')", 5, r.posintext());
// occurence of search words in text
assertEquals("hitcount('five')", 2, r.hitcount());
// phrase counts
assertEquals("phrasesintext", 3, r.phrasesintext());
assertEquals("posofphrase", 100, r.posofphrase());
assertEquals("posinphrase", 5, r.posinphrase());
}
}