// Word.java // (C) 2008 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 26.03.2008 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.kelondro.data.word; import java.util.Collection; import java.util.Locale; import net.yacy.cora.order.Base64Order; import net.yacy.cora.order.Digest; import net.yacy.cora.storage.ARC; import net.yacy.cora.storage.ConcurrentARC; import net.yacy.cora.storage.HandleSet; import net.yacy.cora.util.ConcurrentLog; import net.yacy.cora.util.SpaceExceededException; import net.yacy.kelondro.index.RowHandleSet; import net.yacy.kelondro.util.Bitfield; import net.yacy.kelondro.util.MemoryControl; public class Word { /** * this is the lenght(12) of the hash key that is used:<br> * - for seed hashes (this Object)<br> * - for word hashes (IndexEntry.wordHashLength)<br> * - for L-URL hashes (plasmaLURL.urlHashLength)<br><br> * these hashes all shall be generated by base64.enhancedCoder */ public static final int commonHashLength = 12; public static final Base64Order commonHashOrder = Base64Order.enhancedCoder; private static final int hashCacheSize = Math.max(20000, Math.min(200000, (int) (MemoryControl.available() / 40000L))); private static ARC<String, byte[]> hashCache = null; static { try { hashCache = new ConcurrentARC<String, byte[]>(hashCacheSize, Math.min(32, 2 * Runtime.getRuntime().availableProcessors())); ConcurrentLog.info("Word", "hashCache.size = " + hashCacheSize); } catch (final OutOfMemoryError e) { hashCache = new ConcurrentARC<String, byte[]>(1000, Math.min(8, 1 + Runtime.getRuntime().availableProcessors())); ConcurrentLog.info("Word", "hashCache.size = " + 1000); } } // object carries statistics for words and sentences public int count; // number of occurrences public int posInText; // unique handle, is initialized with first word position in text public int posInPhrase; // position of word in phrase public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100 public Bitfield flags; // the flag bits for each word public Word(final int handle, final int pip, final int nop) { this.count = 1; this.posInText = handle; this.posInPhrase = pip; this.numOfPhrase = nop; this.flags = null; } public static void clearCache() { hashCache.clear(); } public void inc() { this.count++; } public int occurrences() { return this.count; } @Override public String toString() { // this is here for debugging return "{count=" + this.count + ", posInText=" + this.posInText + ", posInPhrase=" + this.posInPhrase + ", numOfPhrase=" + this.numOfPhrase + "}"; } // static methods public static byte[] word2hash(final StringBuilder word) { return word2hash(word.toString()); } private final static byte lowByte = Base64Order.alpha_enhanced[0]; private final static byte highByte = Base64Order.alpha_enhanced[Base64Order.alpha_enhanced.length - 1]; public static boolean isPrivate(byte[] hash) { return hash[0] == highByte && hash[1] == highByte && hash[2] == highByte && hash[3] == highByte && hash[4] == highByte; } // create a word hash public static final byte[] word2hash(final String word) { final String wordlc = word.toLowerCase(Locale.ENGLISH); byte[] h = hashCache.get(wordlc); if (h != null) return h; // calculate the hash h = commonHashOrder.encodeSubstring(Digest.encodeMD5Raw(wordlc), commonHashLength); while (h[0] == highByte && h[1] == highByte && h[2] == highByte && h[3] == highByte && h[4] == highByte) { // ensure that word hashes do not start with hash '_____' which is a key for an extra hash range for private usage on the local peer // statistically we are inside this loop only every 2^^30 calls of word2hash (which means almost never) System.arraycopy(h, 1, h, 0, commonHashLength - 1); h[commonHashLength - 1] = lowByte; } assert h[2] != '@'; if (MemoryControl.shortStatus()) { hashCache.clear(); } else { //hashCache.putIfAbsent(wordlc, h); // prevent expensive MD5 computation and encoding hashCache.insertIfAbsent(wordlc, h); // prevent expensive MD5 computation and encoding } return h; } public final static byte PRIVATE_TYPE_COPY = 'C'; // used for a private local copy of the index public final static byte PRIVATE_TYPE_PHONETIC = 'K'; // used for ColognePhonetics public static final byte[] hash2private(final byte[] hash, byte privateType) { byte[] p = new byte[commonHashLength]; p[0] = highByte; p[1] = highByte; p[2] = highByte; p[3] = highByte; p[4] = highByte; p[5] = privateType; System.arraycopy(hash, 0, p, 6, commonHashLength - 6); // 36 bits left for private hashes should be enough return p; } public static final HandleSet words2hashesHandles(final Collection<String> words) { final HandleSet hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.size()); for (final String word: words) try { hashes.put(word2hash(word)); } catch (final SpaceExceededException e) { ConcurrentLog.logException(e); return hashes; } hashes.optimize(); return hashes; } public static final HandleSet words2hashesHandles(final String[] words) { final HandleSet hashes = new RowHandleSet(WordReferenceRow.urlEntryRow.primaryKeyLength, WordReferenceRow.urlEntryRow.objectOrder, words.length); for (final String word: words) try { hashes.put(word2hash(word)); } catch (final SpaceExceededException e) { ConcurrentLog.logException(e); return hashes; } hashes.optimize(); return hashes; } }