// Word.java
// (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 26.03.2008 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.kelondro.data.word;
import java.util.Collection;
import java.util.Locale;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.order.Digest;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.index.RowHandleSet;
import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.MemoryControl;
/**
 * Statistics record for one word of a parsed text, plus static helpers that
 * turn words into the fixed-length hashes used as index keys.
 *
 * <p>Computed word hashes are kept in a shared static cache so that the MD5
 * computation and encoding do not have to be repeated for the same word.</p>
 */
public class Word {

    /**
     * this is the length (12) of the hash key that is used:<br>
     * - for seed hashes (this Object)<br>
     * - for word hashes (IndexEntry.wordHashLength)<br>
     * - for L-URL hashes (plasmaLURL.urlHashLength)<br><br>
     * these hashes all shall be generated by base64.enhancedCoder
     */
    public static final int commonHashLength = 12;

    /** The encoder/order that all common hashes are generated with. */
    public static final Base64Order commonHashOrder = Base64Order.enhancedCoder;

    /**
     * Number of leading bytes that must all equal {@link #highByte} for a
     * hash to belong to the private hash range.
     */
    private static final int PRIVATE_PREFIX_LENGTH = 5;

    /**
     * Target capacity of the hash cache: one entry per 40000 units reported by
     * {@code MemoryControl.available()} (presumably bytes - confirm), clamped
     * to the range 20000..200000.
     * Must stay declared before {@link #hashCache}, whose initializer reads it.
     */
    private static final int hashCacheSize = Math.max(20000, Math.min(200000, (int) (MemoryControl.available() / 40000L)));

    /** Cache from lower-cased word to its hash; shared by all callers of {@link #word2hash(String)}. */
    private static final ARC<String, byte[]> hashCache = createHashCache();

    /**
     * Builds the word hash cache.
     * If allocating the cache at {@link #hashCacheSize} runs out of memory, a
     * small cache of 1000 entries is created instead.
     *
     * @return the cache instance to be used for word hashes
     */
    private static ARC<String, byte[]> createHashCache() {
        try {
            final ARC<String, byte[]> cache = new ConcurrentARC<String, byte[]>(hashCacheSize, Math.min(32, 2 * Runtime.getRuntime().availableProcessors()));
            ConcurrentLog.info("Word", "hashCache.size = " + hashCacheSize);
            return cache;
        } catch (final OutOfMemoryError e) {
            // not enough memory for the full-size cache: fall back to a minimal one
            final ARC<String, byte[]> cache = new ConcurrentARC<String, byte[]>(1000, Math.min(8, 1 + Runtime.getRuntime().availableProcessors()));
            ConcurrentLog.info("Word", "hashCache.size = " + 1000);
            return cache;
        }
    }

    // object carries statistics for words and sentences
    public int count;       // number of occurrences
    public int posInText;   // unique handle, is initialized with first word position in text
    public int posInPhrase; // position of word in phrase
    public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
    public Bitfield flags;  // the flag bits for each word

    /**
     * Creates the statistics entry for the first occurrence of a word.
     * The occurrence count starts at 1 and the flags are left unset (null).
     *
     * @param handle stored as {@link #posInText}
     * @param pip    stored as {@link #posInPhrase}
     * @param nop    stored as {@link #numOfPhrase}
     */
    public Word(final int handle, final int pip, final int nop) {
        this.count = 1;
        this.posInText = handle;
        this.posInPhrase = pip;
        this.numOfPhrase = nop;
        this.flags = null;
    }

    /** Removes all entries from the shared word hash cache. */
    public static void clearCache() {
        hashCache.clear();
    }

    /** Increments the occurrence counter by one. */
    public void inc() {
        this.count++;
    }

    /**
     * @return the current occurrence counter
     */
    public int occurrences() {
        return this.count;
    }

    @Override
    public String toString() {
        // this is here for debugging
        return "{count=" + this.count + ", posInText=" + this.posInText + ", posInPhrase=" + this.posInPhrase + ", numOfPhrase=" + this.numOfPhrase + "}";
    }

    // static methods

    /**
     * Convenience overload of {@link #word2hash(String)}.
     *
     * @param word the word; its current content is converted to a String
     * @return the word hash
     */
    public static byte[] word2hash(final StringBuilder word) {
        return word2hash(word.toString());
    }

    /** First symbol of the enhanced base64 alphabet. */
    private final static byte lowByte = Base64Order.alpha_enhanced[0];

    /** Last symbol of the enhanced base64 alphabet; five of these in a row mark a private hash. */
    private final static byte highByte = Base64Order.alpha_enhanced[Base64Order.alpha_enhanced.length - 1];

    /**
     * Tests whether a hash lies in the private hash range, i.e. whether its
     * first five bytes all equal the highest symbol of the enhanced base64
     * alphabet.
     *
     * @param hash the hash to test
     * @return true if the hash starts with the private prefix; false
     *         otherwise, including when the hash is null or shorter than
     *         the prefix (such a value cannot carry the prefix)
     */
    public static boolean isPrivate(byte[] hash) {
        if (hash == null || hash.length < PRIVATE_PREFIX_LENGTH) {
            return false;
        }
        for (int i = 0; i < PRIVATE_PREFIX_LENGTH; i++) {
            if (hash[i] != highByte) {
                return false;
            }
        }
        return true;
    }

    /**
     * Creates the hash of a word.
     * The word is lower-cased first, so the hash is case-insensitive. Results
     * are served from and stored into the shared hash cache; the returned
     * array may therefore be the cached instance itself and should be treated
     * as read-only by callers.
     *
     * @param word the word to hash; must not be null
     * @return a hash of {@link #commonHashLength} bytes that never starts
     *         with the private prefix
     */
    public static final byte[] word2hash(final String word) {
        final String wordlc = word.toLowerCase(Locale.ENGLISH);
        byte[] h = hashCache.get(wordlc);
        if (h != null) return h;

        // calculate the hash
        h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength);
        while (isPrivate(h)) {
            // ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer
            // statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never)
            // shift the hash one position to the left and pad it with the lowest symbol
            System.arraycopy(h, 1, h, 0, commonHashLength - 1);
            h[commonHashLength - 1] = lowByte;
        }
        assert h[2] != '@';

        if (MemoryControl.shortStatus()) {
            // memory is short: drop the whole cache instead of growing it
            hashCache.clear();
        } else {
            hashCache.insertIfAbsent(wordlc, h); // prevent expensive MD5 computation and encoding
        }
        return h;
    }

    public final static byte PRIVATE_TYPE_COPY = 'C';     // used for a private local copy of the index
    public final static byte PRIVATE_TYPE_PHONETIC = 'K'; // used for ColognePhonetics

    /**
     * Maps a hash into the private hash range.
     * The result consists of the five-byte private prefix, the given type
     * byte, and the first six bytes of the original hash.
     *
     * @param hash        the source hash; at least six bytes are read from it
     * @param privateType the type marker placed right after the prefix, e.g.
     *                    {@link #PRIVATE_TYPE_COPY} or {@link #PRIVATE_TYPE_PHONETIC}
     * @return a new array of {@link #commonHashLength} bytes
     */
    public static final byte[] hash2private(final byte[] hash, byte privateType) {
        final byte[] p = new byte[commonHashLength];
        for (int i = 0; i < PRIVATE_PREFIX_LENGTH; i++) {
            p[i] = highByte;
        }
        p[PRIVATE_PREFIX_LENGTH] = privateType;
        final int payloadOffset = PRIVATE_PREFIX_LENGTH + 1;
        System.arraycopy(hash, 0, p, payloadOffset, commonHashLength - payloadOffset); // 36 bits left for private hashes should be enough
        return p;
    }

    /**
     * Hashes every word of a collection and gathers the hashes in a handle set.
     * If the set runs out of space, the exception is logged and the hashes
     * collected so far are returned.
     *
     * @param words the words to hash; must not be null
     * @return the set of word hashes
     */
    public static final HandleSet words2hashesHandles(final Collection<String> words) {
        return collectHashes(words, words.size());
    }

    /**
     * Hashes every word of an array and gathers the hashes in a handle set.
     * If the set runs out of space, the exception is logged and the hashes
     * collected so far are returned.
     *
     * @param words the words to hash; must not be null
     * @return the set of word hashes
     */
    public static final HandleSet words2hashesHandles(final String[] words) {
        return collectHashes(java.util.Arrays.asList(words), words.length);
    }

    /**
     * Shared implementation of both words2hashesHandles overloads.
     *
     * @param words        the words to hash, visited in iteration order
     * @param expectedSize initial capacity passed to the handle set
     * @return the handle set; optimize() has been called on it only if all
     *         words could be inserted
     */
    private static HandleSet collectHashes(final Iterable<String> words, final int expectedSize) {
        final HandleSet hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, expectedSize);
        for (final String word : words) {
            try {
                hashes.put(word2hash(word));
            } catch (final SpaceExceededException e) {
                // no room for further hashes: report it and hand back the partial result
                ConcurrentLog.logException(e);
                return hashes;
            }
        }
        hashes.optimize();
        return hashes;
    }
}