/** * WarcImporter.java * (C) 2017 by reger24; https://github.com/reger24 * * This is a part of YaCy, a peer-to-peer based web search engine * * LICENSE * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program. * If not, see <http://www.gnu.org/licenses/>. */ package net.yacy.document.importer; import java.io.File; import java.io.FileInputStream; import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import net.yacy.cora.document.id.DigestURL; import net.yacy.cora.protocol.HeaderFramework; import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.protocol.ResponseHeader; import net.yacy.cora.util.ByteBuffer; import net.yacy.cora.util.ConcurrentLog; import net.yacy.crawler.retrieval.Request; import net.yacy.crawler.retrieval.Response; import net.yacy.document.TextParser; import net.yacy.search.Switchboard; import net.yacy.server.http.ChunkedInputStream; import org.jwat.common.HeaderLine; import org.jwat.common.HttpHeader; import org.jwat.warc.WarcConstants; import org.jwat.warc.WarcReader; import org.jwat.warc.WarcReaderFactory; import org.jwat.warc.WarcRecord; /** * Web Archive file format reader to process the warc archive content (responses) * * Warc format specification ISO 28500 * https://archive.org/details/WARCISO28500Version1Latestdraft * http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf * * http://archive-access.sourceforge.net/warc/warc_file_format-0.9.html * http://archive-access.sourceforge.net/warc/ */ public class WarcImporter extends Thread implements Importer { static public Importer job; // static object to assure only one importer is running (if started from a servlet, this object is used to store the thread) private final InputStream source; // current input warc archive private String name; // file name of input source private int recordCnt; // number of responses indexed (for statistic) private long startTime; // (for statistic) private final long sourceSize; // length of the input source (for statistic) private long consumed; // bytes consumed from input source (for statistic) public WarcImporter(InputStream f) { source = f; recordCnt = 0; sourceSize = -1; } /** * Init the WarcImporter with input stream with a informational filename or * url als info for calls to the importer methode source() which returns * the urlinfo. Otherwise this methode is equivalent to WarchImporter(inputstream) * @param f the input stream to read the warc archive from * @param urlinfo a info like the url or the filename */ public WarcImporter (InputStream f, String urlinfo) { this(f); name = urlinfo; } public WarcImporter(File f) throws FileNotFoundException{ name = f.getName(); sourceSize = f.length(); source = new FileInputStream(f); } /** * Reads a Warc file and adds all contained responses to the index. * The reader automatically handles plain or gzip'd warc files * * @param f inputstream for the warc file * @throws IOException */ public void indexWarcRecords(InputStream f) throws IOException { byte[] content; job = this; startTime = System.currentTimeMillis(); WarcReader localwarcReader = WarcReaderFactory.getReader(f); WarcRecord wrec = localwarcReader.getNextRecord(); while (wrec != null) { HeaderLine hl = wrec.getHeader(WarcConstants.FN_WARC_TYPE); if (hl != null && hl.value.equals(WarcConstants.RT_RESPONSE)) { // filter responses hl = wrec.getHeader(WarcConstants.FN_WARC_TARGET_URI); DigestURL location = new DigestURL(hl.value); HttpHeader http = wrec.getHttpHeader(); if (http != null && http.statusCode == 200) { // process http response header OK (status 200) if (TextParser.supportsMime(http.contentType) == null) { // check availability of parser InputStream istream = wrec.getPayloadContent(); hl = http.getHeader(HeaderFramework.TRANSFER_ENCODING); if (hl != null && hl.value.contains("chunked")) { // because chunked stream.read doesn't read source fully, make sure all chunks are read istream = new ChunkedInputStream(istream); final ByteBuffer bbuffer = new ByteBuffer(); int c; while ((c = istream.read()) >= 0) { bbuffer.append(c); } content = bbuffer.getBytes(); } else { content = new byte[(int) http.getPayloadLength()]; istream.read(content, 0, content.length); } istream.close(); RequestHeader requestHeader = new RequestHeader(); ResponseHeader responseHeader = new ResponseHeader(http.statusCode); for (HeaderLine hx : http.getHeaderList()) { // include all original response headers for parser responseHeader.put(hx.name, hx.value); } final Request request = new Request( null, location, requestHeader.referer() == null ? null : requestHeader.referer().hash(), "warc", responseHeader.lastModified(), Switchboard.getSwitchboard().crawler.defaultRemoteProfile.handle(), // use remote profile (to index text & media, without writing to cache 0, Switchboard.getSwitchboard().crawler.defaultRemoteProfile.timezoneOffset()); final Response response = new Response( request, requestHeader, responseHeader, Switchboard.getSwitchboard().crawler.defaultRemoteProfile, false, content ); Switchboard.getSwitchboard().toIndexer(response); recordCnt++; } } } this.consumed = localwarcReader.getConsumed(); wrec = localwarcReader.getNextRecord(); } localwarcReader.close(); ConcurrentLog.info("WarcImporter", "Indexed " + recordCnt + " documents"); job = null; } @Override public void run() { try { this.indexWarcRecords(this.source); } catch (IOException ex) { ConcurrentLog.info("WarcImporter", ex.getMessage()); } } /** * Filename of the input source * @return */ @Override public String source() { return this.name; } /** * Number of responses (pages) indexed * @return */ @Override public int count() { return this.recordCnt; } /** * Indexed responses per second * @return */ @Override public int speed() { if (this.recordCnt == 0) return 0; return (int) (this.recordCnt / Math.max(0L, runningTime() )); } /** * Duration in seconds running, working on the current import source * @return duration in seconds */ @Override public long runningTime() { return (System.currentTimeMillis() - this.startTime) / 1000L; } /** * Estimate on time remaining calculated from length of input source and * processed bytes. * @return duration in seconds */ @Override public long remainingTime() { if (this.consumed == 0) { return 0; } else { long speed = this.consumed / runningTime(); return (this.sourceSize - this.consumed) / speed; } } @Override public String status() { return ""; } }