// WordReferenceFactory.java // (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany // first published 09.04.2009 on http://yacy.net // // This is a part of YaCy, a peer-to-peer based web search engine // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // LICENSE // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA package net.yacy.kelondro.data.word; import java.io.Serializable; import java.util.Collections; import java.util.HashSet; import java.util.Iterator; import java.util.Map; import java.util.Set; import java.util.SortedMap; import java.util.TreeMap; import net.yacy.cora.document.encoding.ASCII; import net.yacy.cora.util.ByteBuffer; import net.yacy.kelondro.index.Row; import net.yacy.kelondro.index.Row.Entry; import net.yacy.kelondro.rwi.ReferenceContainer; import net.yacy.kelondro.rwi.ReferenceFactory; public class WordReferenceFactory implements ReferenceFactory<WordReference>, Serializable { private static final long serialVersionUID=-7168706947127349876L; @Override public WordReference produceSlow(final Entry e) { return new WordReferenceRow(e); } @Override public WordReference produceFast(final WordReference r, final boolean local) { if (r instanceof WordReferenceVars) return r; return new WordReferenceVars(r, local); } @Override public Row getRow() { return WordReferenceRow.urlEntryRow; } /** * create an index abstract for a given WordReference ReferenceContainer * This extracts all the host hashes from a reference Container and returns a byte buffer * with a compressed representation of the host references * @param <ReferenceType> * @param inputContainer * @param excludeContainer * @param maxtime * @return */ public static final <ReferenceType extends WordReference> ByteBuffer compressIndex(final ReferenceContainer<WordReference> inputContainer, final ReferenceContainer<WordReference> excludeContainer, final long maxtime) { // collect references according to domains final long timeout = maxtime == Long.MAX_VALUE ? Long.MAX_VALUE : (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime; final TreeMap<String, StringBuilder> doms = new TreeMap<String, StringBuilder>(); synchronized (inputContainer) { final Iterator<WordReference> i = inputContainer.entries(); WordReference iEntry; String dom, mod; StringBuilder paths; while (i.hasNext()) { iEntry = i.next(); if ((excludeContainer != null) && (excludeContainer.getReference(iEntry.urlhash()) != null)) continue; // do not include urls that are in excludeContainer dom = (iEntry instanceof WordReferenceVars) ? ((WordReferenceVars) iEntry).hosthash() : ASCII.String(iEntry.urlhash(), 6, 6); mod = ASCII.String(iEntry.urlhash(), 0, 6); if ((paths = doms.get(dom)) == null) { doms.put(dom, new StringBuilder(30).append(mod)); } else { doms.put(dom, paths.append(mod)); } if (System.currentTimeMillis() > timeout) break; } } // construct a result string final ByteBuffer bb = new ByteBuffer(inputContainer.size() * 6); bb.append('{'); final Iterator<Map.Entry<String, StringBuilder>> i = doms.entrySet().iterator(); Map.Entry<String, StringBuilder> entry; while (i.hasNext()) { entry = i.next(); bb.append(entry.getKey()); bb.append(':'); bb.append(entry.getValue().toString()); if (System.currentTimeMillis() > timeout) break; if (i.hasNext()) bb.append(','); } bb.append('}'); return bb; } /** * decompress an index abstract that was generated from a word index and transmitted over a network connection * @param ci * @param peerhash * @return a urlhash -> peerlist map: this shows in which peers an url is stored */ public static final SortedMap<String, Set<String>> decompressIndex(ByteBuffer ci, final String peerhash) { SortedMap<String, Set<String>> target = Collections.synchronizedSortedMap(new TreeMap<String, Set<String>>()); // target is a mapping from url-hashes to a string of peer-hashes if (ci.byteAt(0) != '{' || ci.byteAt(ci.length() - 1) != '}') return target; //System.out.println("DEBUG-DECOMPRESS: input is " + ci.toString()); ci = ci.trim(1, ci.length() - 2); String dom, url; Set<String> peers; StringBuilder urlsb; while (ci.length() >= 13 && ci.byteAt(6) == ':') { assert ci.length() >= 6 : "ci.length() = " + ci.length(); dom = ci.toStringBuilder(0, 6, 6).toString(); ci.trim(7); while (!ci.isEmpty() && ci.byteAt(0) != ',') { assert ci.length() >= 6 : "ci.length() = " + ci.length(); urlsb = ci.toStringBuilder(0, 6, 12); urlsb.append(dom); url = urlsb.toString(); ci.trim(6); peers = target.get(url); if (peers == null) { peers = new HashSet<String>(); target.put(url, peers); } peers.add(peerhash); //System.out.println("DEBUG-DECOMPRESS: " + url + ":" + target.get(url)); } if (ci.byteAt(0) == ',') ci.trim(1); } //System.out.println("DEBUG-DECOMPRESS: " + target); return target; } }