// WordReferenceRow.java // (C) 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 20.05.2006 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.kelondro.data.word; import java.util.Collection; import net.yacy.cora.date.MicroDate; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.order.Base64Order; import net.yacy.cora.util.ByteArray; import net.yacy.kelondro.index.Column; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.rwi.AbstractReference; import net.yacy.kelondro.rwi.Reference; import net.yacy.kelondro.util.Bitfield; /** * this object stores attributes to URL references inside RWI collections * */ public final class WordReferenceRow extends AbstractReference implements WordReference, Cloneable { public static final Row urlEntryRow = new Row(new Column[]{ new Column("h", Column.celltype_string, Column.encoder_bytes, Word.commonHashLength, "urlhash"), new Column("a", Column.celltype_cardinal, Column.encoder_b256, 2, "lastModified"), new Column("s", Column.celltype_cardinal, Column.encoder_b256, 2, "freshUntil"), // TODO: unused (since 2009) new Column("u", Column.celltype_cardinal, Column.encoder_b256, 1, "wordsInTitle"), new Column("w", Column.celltype_cardinal, Column.encoder_b256, 2, "wordsInText"), new Column("p", Column.celltype_cardinal, Column.encoder_b256, 2, "phrasesInText"), new Column("d", Column.celltype_binary, Column.encoder_bytes, 1, "doctype"), new Column("l", Column.celltype_string, Column.encoder_bytes, 2, "language"), new Column("x", Column.celltype_cardinal, Column.encoder_b256, 1, "llocal"), new Column("y", Column.celltype_cardinal, Column.encoder_b256, 1, "lother"), new Column("m", Column.celltype_cardinal, Column.encoder_b256, 1, "urlLength"), new Column("n", Column.celltype_cardinal, Column.encoder_b256, 1, "urlComps"), new Column("g", Column.celltype_binary, Column.encoder_bytes, 1, "typeofword"), new Column("z", Column.celltype_bitfield, Column.encoder_bytes, 4, "flags"), new Column("c", Column.celltype_cardinal, Column.encoder_b256, 1, "hitcount"), new Column("t", Column.celltype_cardinal, Column.encoder_b256, 2, "posintext"), new Column("r", Column.celltype_cardinal, Column.encoder_b256, 1, "posinphrase"), new Column("o", Column.celltype_cardinal, Column.encoder_b256, 1, "posofphrase"), new Column("i", Column.celltype_cardinal, Column.encoder_b256, 1, "worddistance"), // arbitrary column for avg distance of search query words new Column("k", Column.celltype_cardinal, Column.encoder_b256, 1, "reserve") }, Base64Order.enhancedCoder ); // available chars: b,e,j,q /** * object for termination of concurrent blocking queue processing */ protected static final Row.Entry poisonRowEntry = urlEntryRow.newEntry(); // static properties private static final int col_urlhash = 0; // h 12 the url hash b64-encoded private static final int col_lastModified = 1; // a 2 last-modified time of the document where word appears private static final int col_freshUntil = 2; // s 2 TTL for the word, so it can be removed easily if the TTL is short private static final int col_wordsInTitle = 3; // u 1 words in description/length (longer are better?) private static final int col_wordsInText = 4; // w 2 total number of words in document private static final int col_phrasesInText = 5; // p 2 total number of phrases in document private static final int col_doctype = 6; // d 1 type of document private static final int col_language = 7; // l 2 (guessed) language of document private static final int col_llocal = 8; // x 1 outlinks to same domain private static final int col_lother = 9; // y 1 outlinks to other domain private static final int col_urlLength = 10; // m 1 byte-length of complete URL private static final int col_urlComps = 11; // n 1 number of path components // dynamic properties private static final int col_typeofword = 12; // g 1 grammatical classification private static final int col_flags = 13; // z 4 b64-encoded appearance flags (24 bit, see definition below) private static final int col_hitcount = 14; // c 1 number of occurrences of this word in text private static final int col_posintext = 15; // t 2 first appearance of word in text private static final int col_posinphrase = 16; // r 1 position of word in its phrase private static final int col_posofphrase = 17; // o 1 number of the phrase where word appears private static final int col_reserve1 = 18; // i 1 reserve1 private static final int col_reserve2 = 19; // k 1 reserve2 // appearance flags, used in RWI entry // some names are derived from the Dublin Core Metadata tag set // the flags 0..23 are identical to the category flags in plasmaCondenser public static final int flag_app_dc_description= 24; // word appears in anchor description text (the reference to an url), or any alternative text field of a link public static final int flag_app_dc_title = 25; // word appears in title or headline or any description part public static final int flag_app_dc_creator = 26; // word appears in author public static final int flag_app_dc_subject = 27; // word appears in header tags or other descriptive part public static final int flag_app_dc_identifier = 28; // word appears in url or document identifier public static final int flag_app_emphasized = 29; // word is emphasized in text (i.e. bold, italics, special size) private final Row.Entry entry; protected WordReferenceRow( final byte[] urlHash, final int urlLength, // byte-length of complete URL final int urlComps, // number of path components final int titlewordcount,// length of description/length (longer are better?) final int hitcount, // how often appears this word in the text final int wordcount, // total number of words final int phrasecount, // total number of phrases final int posintext, // position of word in all words final int posinphrase, // position of word in its phrase final int posofphrase, // number of the phrase where word appears final long lastmodified, // last-modified time of the document where word appears final long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short final byte[] language, // (guessed) language of document final char doctype, // type of document final int outlinksSame, // outlinks to same domain final int outlinksOther, // outlinks to other domain final Bitfield flags // attributes to the url and to the word according the url ) { assert (urlHash.length == 12) : "urlhash = " + ASCII.String(urlHash); this.entry = urlEntryRow.newEntry(); final int mddlm = MicroDate.microDateDays(lastmodified); final int mddct = MicroDate.microDateDays(updatetime); this.entry.setCol(col_urlhash, urlHash); this.entry.setCol(col_lastModified, mddlm); this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation this.entry.setCol(col_wordsInTitle, titlewordcount); this.entry.setCol(col_wordsInText, wordcount); this.entry.setCol(col_phrasesInText, phrasecount); this.entry.setCol(col_doctype, new byte[]{(byte) doctype}); this.entry.setCol(col_language, (language == null || language.length != urlEntryRow.width(col_language)) ? WordReferenceVars.default_language : language); this.entry.setCol(col_llocal, outlinksSame); this.entry.setCol(col_lother, outlinksOther); this.entry.setCol(col_urlLength, urlLength); this.entry.setCol(col_urlComps, urlComps); this.entry.setCol(col_typeofword, new byte[]{(byte) 0}); // TODO: grammatical classification this.entry.setCol(col_flags, flags.bytes()); this.entry.setCol(col_hitcount, hitcount); this.entry.setCol(col_posintext, posintext); this.entry.setCol(col_posinphrase, posinphrase); this.entry.setCol(col_posofphrase, posofphrase); this.entry.setCol(col_reserve1, 0); this.entry.setCol(col_reserve2, 0); } /** * Constructor for WordReferences from title words or as template for content * words (with reduced number of input parameters, skipping the parameter * later set by setWord() for a WordReferenceRow template or not relevant if * used for words from title). */ public WordReferenceRow(final byte[] urlHash, final int urlLength, // byte-length of complete URL final int urlComps, // number of path components final int titlewordcount,// length of description/length (longer are better?) final int wordcount, // total number of words final int phrasecount, // total number of phrases final long lastmodified, // last-modified time of the document where word appears final long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short final byte[] language, // (guessed) language of document final char doctype, // type of document final int outlinksSame, // outlinks to same domain final int outlinksOther // outlinks to other domain ) { assert (urlHash.length == 12) : "urlhash = " + ASCII.String(urlHash); this.entry = urlEntryRow.newEntry(); final int mddlm = MicroDate.microDateDays(lastmodified); final int mddct = MicroDate.microDateDays(updatetime); this.entry.setCol(col_urlhash, urlHash); this.entry.setCol(col_lastModified, mddlm); this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation this.entry.setCol(col_wordsInTitle, titlewordcount); this.entry.setCol(col_wordsInText, wordcount); this.entry.setCol(col_phrasesInText, phrasecount); this.entry.setCol(col_doctype, new byte[]{(byte) doctype}); this.entry.setCol(col_language, ((language == null) || (language.length != urlEntryRow.width(col_language))) ? WordReferenceVars.default_language : language); this.entry.setCol(col_llocal, outlinksSame); this.entry.setCol(col_lother, outlinksOther); this.entry.setCol(col_urlLength, urlLength); this.entry.setCol(col_urlComps, urlComps); this.entry.setCol(col_reserve1, 0); this.entry.setCol(col_reserve2, 0); } public void setWord(final Word word) { this.entry.setCol(col_typeofword, new byte[]{(byte) 0}); this.entry.setCol(col_flags, word.flags.bytes()); this.entry.setCol(col_hitcount, word.count); this.entry.setCol(col_posintext, word.posInText); this.entry.setCol(col_posinphrase, word.posInPhrase); this.entry.setCol(col_posofphrase, word.numOfPhrase); } public WordReferenceRow(final String external) { this.entry = urlEntryRow.newEntry(external, true); } private WordReferenceRow(final byte[] row) { this.entry = urlEntryRow.newEntry(row); } protected WordReferenceRow(final Row.Entry rentry) { // no cloning is necessary since there is no further manipulation after this initial instantiation this.entry = rentry; } @Override public WordReferenceRow clone() { final byte[] b = new byte[urlEntryRow.objectsize]; System.arraycopy(this.entry.bytes(), 0, b, 0, urlEntryRow.objectsize); return new WordReferenceRow(b); } @Override public String toPropertyForm() { return this.entry.toPropertyForm('=', true, true, false, false); } @Override public Entry toKelondroEntry() { return this.entry; } @Override public byte[] urlhash() { return this.entry.getColBytes(col_urlhash, true); } @Override public int virtualAge() { return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format } /** * @return date recalculated from MicroDateDays (accuracy = 1 Day, time always 0:00) */ @Override public long lastModified() { return MicroDate.reverseMicroDateDays(this.entry.getColLong(col_lastModified)); } /** * @return occurences of word in text (in the rang 0..255) */ @Override public int hitcount() { return (0xff & this.entry.getColByte(col_hitcount)); } /** * @return first positon of word in text */ @Override public int posintext() { int pos = (int) this.entry.getColLong(col_posintext); return pos; } /** * positions() is used to remember word positions for each query word of an * multi word search query. * WordReferenceRow is for one WordReference and has no means to return multiple positions * but is required by the interface. * @return null */ @Override public Collection<Integer> positions() { return null; } @Override public int posinphrase() { return (0xff & this.entry.getColByte(col_posinphrase)); } @Override public int posofphrase() { return (0xff & this.entry.getColByte(col_posofphrase)); } @Override public int wordsintext() { return (int) this.entry.getColLong(col_wordsInText); } @Override public int phrasesintext() { return (int) this.entry.getColLong(col_phrasesInText); } @Override public byte[] getLanguage() { return this.entry.getColBytes(col_language, true); } @Override public char getType() { return (char) this.entry.getColByte(col_doctype); } @Override public int wordsintitle() { return (0xff & this.entry.getColByte(col_wordsInTitle)); } @Override public int llocal() { return (0xff & this.entry.getColByte(col_llocal)); } @Override public int lother() { return (0xff & this.entry.getColByte(col_lother)); } @Override public int urllength() { return (0xff & this.entry.getColByte(col_urlLength)); } @Override public int urlcomps() { return (0xff & this.entry.getColByte(col_urlComps)); } @Override public Bitfield flags() { return new Bitfield(this.entry.getColBytes(col_flags, false)); } @Override public double termFrequency() { return (((double) hitcount()) / ((double) (wordsintext() + wordsintitle() + 1))); } @Override public String toString() { return toPropertyForm(); } @Override public boolean equals(final Object obj) { if (this == obj) return true; if (obj == null) return false; if (!(obj instanceof WordReferenceRow)) return false; final WordReferenceRow other = (WordReferenceRow) obj; return Base64Order.enhancedCoder.equal(urlhash(), other.urlhash()); } private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful @Override public int hashCode() { if (this.hashCache == Integer.MIN_VALUE) { this.hashCache = ByteArray.hashCode(urlhash()); } return this.hashCache; } @Override public void join(final Reference oe) { throw new UnsupportedOperationException(""); } @Override public String hosthash() { return ASCII.String(this.urlhash(), 6, 6); } }