// WordReferenceVars.java // (C) 2007 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 07.11.2007 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.kelondro.data.word; import java.util.Collection; import java.util.Comparator; import java.util.Queue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.LinkedBlockingQueue; import net.yacy.cora.date.MicroDate; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.document.encoding.UTF8; import net.yacy.cora.order.Base64Order; import net.yacy.cora.util.ByteArray; import net.yacy.cora.util.ConcurrentLog; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.rwi.AbstractReference; import net.yacy.kelondro.rwi.Reference; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.workflow.WorkflowProcessor; public class WordReferenceVars extends AbstractReference implements WordReference, Reference, Cloneable, Comparable<WordReferenceVars>, Comparator<WordReferenceVars> { /** * object for termination of concurrent blocking queue processing */ public static final WordReferenceVars poison = new WordReferenceVars(); protected static final byte[] default_language = UTF8.getBytes("en"); private final Bitfield flags; private long lastModified; private final String language; public final byte[] urlHash; private String hostHash = null; private final char type; private int hitcount, // how often appears this word in the text llocal, lother, phrasesintext, posintext, // word position in text posinphrase, posofphrase, urlcomps, urllength, wordsintext, wordsintitle; private int virtualAge; private Queue<Integer> positions; // word positons of joined references private double termFrequency; private final boolean local; public WordReferenceVars( final byte[] urlHash, final int urlLength, // byte-length of complete URL final int urlComps, // number of path components final int titleLength, // length of description/length (longer are better?) final int hitcount, // how often appears this word in the text final int wordcount, // total number of words final int phrasecount, // total number of phrases final int posintext, // first position of word in text final Queue<Integer> ps, // positions of words that are joined into the reference final int posinphrase, // position of word in its phrase final int posofphrase, // number of the phrase where word appears final long lastmodified, // last-modified time of the document where word appears String language, // (guessed) language of document final char doctype, // type of document final int outlinksSame, // outlinks to same domain final int outlinksOther, // outlinks to other domain final Bitfield flags, // attributes to the url and to the word according the url final double termfrequency ) { //final int mddct = MicroDate.microDateDays(updatetime); this.flags = flags; //this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2); this.lastModified = lastmodified; this.language = language; this.urlHash = urlHash; this.type = doctype; this.hitcount = hitcount; this.llocal = outlinksSame; this.lother = outlinksOther; this.phrasesintext = phrasecount; if (ps != null && !ps.isEmpty()) { this.positions = new LinkedBlockingQueue<Integer>(); for (final Integer i : ps) this.positions.add(i); } else { this.positions = null; } this.posinphrase = posinphrase; this.posintext = posintext; this.posofphrase = posofphrase; this.urlcomps = urlComps; this.urllength = urlLength; this.virtualAge = -1; // compute that later this.wordsintext = wordcount; this.wordsintitle = titleLength; this.termFrequency = termfrequency; this.local = true; } public WordReferenceVars(final WordReference e, boolean local) { this.flags = e.flags(); //this.freshUntil = e.freshUntil(); this.lastModified = e.lastModified(); this.language = ASCII.String(e.getLanguage()); this.urlHash = e.urlhash(); this.type = e.getType(); this.hitcount = e.hitcount(); this.llocal = e.llocal(); this.lother = e.lother(); this.phrasesintext = e.phrasesintext(); if (e.positions() != null && !e.positions().isEmpty()) { this.positions = new LinkedBlockingQueue<Integer>(); for (final Integer i: e.positions()) this.positions.add(i); } else { this.positions = null; } this.posinphrase = e.posinphrase(); this.posintext = e.posintext(); this.posofphrase = e.posofphrase(); this.urlcomps = e.urlcomps(); this.urllength = e.urllength(); this.virtualAge = e.virtualAge(); this.wordsintext = e.wordsintext(); this.wordsintitle = e.wordsintitle(); this.termFrequency = e.termFrequency(); this.local = local; } /** * initializer for special poison object */ public WordReferenceVars() { this.flags = null; this.lastModified = 0; this.language = null; this.urlHash = null; this.type = ' '; this.hitcount = 0; this.llocal = 0; this.lother = 0; this.phrasesintext = 0; this.positions = null; this.posinphrase = 0; this.posintext = 0; this.posofphrase = 0; this.urlcomps = 0; this.urllength = 0; this.virtualAge = 0; this.wordsintext = 0; this.wordsintitle = 0; this.termFrequency = 0.0; this.local = true; } @Override public WordReferenceVars clone() { final WordReferenceVars c = new WordReferenceVars( this.urlHash, this.urllength, this.urlcomps, this.wordsintitle, this.hitcount, this.wordsintext, this.phrasesintext, this.posintext, this.positions, this.posinphrase, this.posofphrase, this.lastModified, this.language, this.type, this.llocal, this.lother, this.flags, this.termFrequency); return c; } @Override public Bitfield flags() { return this.flags; } @Override public byte[] getLanguage() { return ASCII.getBytes(this.language); } @Override public char getType() { return this.type; } /** * How often appears this word in the text * @return */ @Override public int hitcount() { return this.hitcount; } @Override public long lastModified() { return this.lastModified; } @Override public int llocal() { return this.llocal; } @Override public int lother() { return this.lother; } @Override public int phrasesintext() { return this.phrasesintext; } @Override public int posinphrase() { return this.posinphrase; } /** * First word position in text. * @return min position */ @Override public int posintext() { return this.posintext; } /** * Word positions for joined references (for multi word queries). * @see posintext() * @return the word positions of the joined references */ @Override public Collection<Integer> positions() { return this.positions; } @Override public int posofphrase() { return this.posofphrase; } private WordReferenceRow toRowEntry() { return new WordReferenceRow( this.urlHash, this.urllength, // byte-length of complete URL this.urlcomps, // number of path components this.wordsintitle, // length of description/length (longer are better?) this.hitcount, // how often appears this word in the text this.wordsintext, // total number of words this.phrasesintext, // total number of phrases this.posintext, // position of word in all words (WordReferenceRow stores first position in text) this.posinphrase, // position of word in its phrase this.posofphrase, // number of the phrase where word appears this.lastModified, // last-modified time of the document where word appears System.currentTimeMillis(), // update time; ASCII.getBytes(this.language), // (guessed) language of document this.type, // type of document this.llocal, // outlinks to same domain this.lother, // outlinks to other domain this.flags // attributes to the url and to the word according the url ); } @Override public Entry toKelondroEntry() { return toRowEntry().toKelondroEntry(); } @Override public String toPropertyForm() { return toRowEntry().toPropertyForm(); } @Override public byte[] urlhash() { return this.urlHash; } @Override public String hosthash() { if (this.hostHash != null) return this.hostHash; this.hostHash = ASCII.String(this.urlHash, 6, 6); return this.hostHash; } @Override public int urlcomps() { return this.urlcomps; } @Override public int urllength() { return this.urllength; } @Override public int virtualAge() { if (this.virtualAge > 0) return this.virtualAge; this.virtualAge = MicroDate.microDateDays(this.lastModified); return this.virtualAge; } @Override public int wordsintext() { return this.wordsintext; } @Override public int wordsintitle() { return this.wordsintitle; } @Override public double termFrequency() { if (this.termFrequency == 0.0) this.termFrequency = (((double) hitcount()) / ((double) (wordsintext() + wordsintitle() + 1))); return this.termFrequency; } public boolean local() { return this.local; } public final void min(final WordReferenceVars other) { if (other == null) return; int v; long w; double d; if (this.hitcount > (v = other.hitcount)) this.hitcount = v; if (this.llocal > (v = other.llocal)) this.llocal = v; if (this.lother > (v = other.lother)) this.lother = v; if (virtualAge() > (v = other.virtualAge())) this.virtualAge = v; if (this.wordsintext > (v = other.wordsintext)) this.wordsintext = v; if (this.phrasesintext > (v = other.phrasesintext)) this.phrasesintext = v; if (this.posintext > (v = other.posintext)) this.posintext = v; // calculate and remember min distance if (this.positions != null || other.positions != null) { int odist = other.distance(); int dist = this.distance(); if (odist > 0 && odist < dist) { if (this.positions == null) { this.positions = new LinkedBlockingQueue<Integer>(); } else { this.positions.clear(); } this.positions.add(this.posintext + odist); } } if (this.posinphrase > (v = other.posinphrase)) this.posinphrase = v; if (this.posofphrase > (v = other.posofphrase)) this.posofphrase = v; if (this.lastModified > (w = other.lastModified)) this.lastModified = w; //if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w; if (this.urllength > (v = other.urllength)) this.urllength = v; if (this.urlcomps > (v = other.urlcomps)) this.urlcomps = v; if (this.wordsintitle > (v = other.wordsintitle)) this.wordsintitle = v; if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d; } public final void max(final WordReferenceVars other) { if (other == null) return; int v; long w; double d; if (this.hitcount < (v = other.hitcount)) this.hitcount = v; if (this.llocal < (v = other.llocal)) this.llocal = v; if (this.lother < (v = other.lother)) this.lother = v; if (virtualAge() < (v = other.virtualAge())) this.virtualAge = v; if (this.wordsintext < (v = other.wordsintext)) this.wordsintext = v; if (this.phrasesintext < (v = other.phrasesintext)) this.phrasesintext = v; if (this.posintext < (v = other.posintext)) this.posintext = v; // calculate and remember max distance if (this.positions != null || other.positions != null) { int odist = other.distance(); int dist = this.distance(); if (odist > 0 && odist > dist) { if (this.positions == null) { this.positions = new LinkedBlockingQueue<Integer>(); } else { this.positions.clear(); } this.positions.add(this.posintext + odist); } } if (this.posinphrase < (v = other.posinphrase)) this.posinphrase = v; if (this.posofphrase < (v = other.posofphrase)) this.posofphrase = v; if (this.lastModified < (w = other.lastModified)) this.lastModified = w; //if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w; if (this.urllength < (v = other.urllength)) this.urllength = v; if (this.urlcomps < (v = other.urlcomps)) this.urlcomps = v; if (this.wordsintitle < (v = other.wordsintitle)) this.wordsintitle = v; if (this.termFrequency < (d = other.termFrequency)) this.termFrequency = d; } /** * joins two entries into one entry * * Main usage is on multi word searches to combine the position values for ranking and word distance calculation, * A Join is valid for the same url. * @param r WordReference */ @Override public void join(final Reference r) { final WordReference oe = (WordReference) r; // choose min posintext (for > 0) if (this.posintext > 0 && oe.posintext() > 0) { if (this.posintext > oe.posintext()) { this.addPosition(this.posintext); // remember larger position (for distance calculation) this.posintext = oe.posintext(); } else { this.addPosition(oe.posintext()); // remember other position (for distance calculation) } } else if (this.posintext == 0) { this.posintext = oe.posintext(); } // join phrase // this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0; // this.posofphrase = Math.min(this.posofphrase, oe.posofphrase()); final int oePosofphrase = oe.posofphrase(); if (this.posofphrase == oePosofphrase) { this.posinphrase = Math.min(this.posinphrase, oe.posinphrase()); } else if (this.posofphrase > oePosofphrase) { this.posofphrase = oePosofphrase; // choose min posofphrase this.posinphrase = oe.posinphrase(); // with corresponding posinphrase } // combine term frequency this.termFrequency = this.termFrequency + oe.termFrequency(); this.wordsintext = Math.max(this.wordsintext, oe.wordsintext()); // as it is same url asume the word count to be the max this.wordsintitle = Math.max(this.wordsintitle, oe.wordsintitle()); this.phrasesintext = Math.max(this.phrasesintext, oe.phrasesintext()); this.hitcount = Math.max(this.hitcount, oe.hitcount()); } @Override public boolean equals(final Object obj) { if (this == obj) return true; if (obj == null) return false; if (!(obj instanceof WordReferenceVars)) return false; final WordReferenceVars other = (WordReferenceVars) obj; return Base64Order.enhancedCoder.equal(this.urlHash, other.urlHash); } private int hashCache = Integer.MIN_VALUE; // if this is used in a compare method many times, a cache is useful @Override public int hashCode() { if (this.hashCache == Integer.MIN_VALUE) { this.hashCache = ByteArray.hashCode(this.urlHash); } return this.hashCache; } @Override public int compareTo(final WordReferenceVars o) { return Base64Order.enhancedCoder.compare(this.urlHash, o.urlhash()); } @Override public int compare(final WordReferenceVars o1, final WordReferenceVars o2) { return o1.compareTo(o2); } /** * Add a position for word distance calculation to the list if position > 0 * @param position */ public void addPosition(final int position) { if (this.positions == null && position > 0) this.positions = new LinkedBlockingQueue<Integer>(); if (position > 0) this.positions.add(position); } /** * transform a reference container into a stream of parsed entries * @param container * @return a blocking queue filled with WordReferenceVars that is still filled when the object is returned */ public static BlockingQueue<WordReferenceVars> transform(final ReferenceContainer<WordReference> container, final long maxtime, final boolean local) { final LinkedBlockingQueue<WordReferenceVars> vars = new LinkedBlockingQueue<WordReferenceVars>(); if (container.size() <= 100) { // transform without concurrency to omit thread creation overhead for (final Row.Entry entry: container) { try { vars.put(new WordReferenceVars(new WordReferenceRow(entry), local)); } catch (final InterruptedException e) {} } try { vars.put(WordReferenceVars.poison); } catch (final InterruptedException e) {} return vars; } final Thread distributor = new TransformDistributor(container, vars, maxtime, local); distributor.start(); // return the resulting queue while the processing queues are still working return vars; } private static class TransformDistributor extends Thread { private ReferenceContainer<WordReference> container; private BlockingQueue<WordReferenceVars> out; private long maxtime; private final boolean local; private TransformDistributor(final ReferenceContainer<WordReference> container, final BlockingQueue<WordReferenceVars> out, final long maxtime, final boolean local) { super("WordReferenceVars.TransformDistributor"); this.container = container; this.out = out; this.maxtime = maxtime; this.local = local; } @Override public void run() { // start the transformation threads final int cores0 = Math.min(WorkflowProcessor.availableCPU, this.container.size() / 100) + 1; final TransformWorker[] worker = new TransformWorker[cores0]; for (int i = 0; i < cores0; i++) { worker[i] = new TransformWorker(this.out, this.maxtime, this.local); worker[i].start(); } long timeout = this.maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + this.maxtime; // fill the queue int p = this.container.size(); while (p > 0) { p--; worker[p % cores0].add(this.container.get(p, false)); if (p % 100 == 0 && System.currentTimeMillis() > timeout) { ConcurrentLog.warn("TransformDistributor", "distribution of WordReference entries to worker queues ended with timeout = " + this.maxtime); break; } } // insert poison to stop the queues for (int i = 0; i < cores0; i++) { worker[i].add(WordReferenceRow.poisonRowEntry); } // wait for the worker to terminate because we want to place a poison entry into the out queue afterwards for (int i = 0; i < cores0; i++) { try { worker[i].join(); } catch (final InterruptedException e) { } } this.out.add(WordReferenceVars.poison); } } private static class TransformWorker extends Thread { private BlockingQueue<Row.Entry> in; private BlockingQueue<WordReferenceVars> out; private long maxtime; private final boolean local; private TransformWorker(final BlockingQueue<WordReferenceVars> out, final long maxtime, final boolean local) { super("WordReferenceVars.TransformWorker"); this.in = new LinkedBlockingQueue<Row.Entry>(); this.out = out; this.maxtime = maxtime; this.local = local; } private void add(final Row.Entry entry) { try { this.in.put(entry); } catch (final InterruptedException e) { } } @Override public void run() { Row.Entry entry; long timeout = this.maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : System.currentTimeMillis() + this.maxtime; try { while ((entry = this.in.take()) != WordReferenceRow.poisonRowEntry) { this.out.put(new WordReferenceVars(new WordReferenceRow(entry), local)); if (System.currentTimeMillis() > timeout) { ConcurrentLog.warn("TransformWorker", "normalization of row entries from row to vars ended with timeout = " + this.maxtime); break; } } } catch (final InterruptedException e) {} } } }