WarcImporter.java example

Explorer
yacy_search_server-master
/**
 * WarcImporter.java
 * (C) 2017 by reger24; https://github.com/reger24
 *
 * This is a part of YaCy, a peer-to-peer based web search engine
 *
 * LICENSE
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.
 * If not, see <http://www.gnu.org/licenses/>.
 */
package net.yacy.document.importer;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Request;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.TextParser;
import net.yacy.search.Switchboard;
import net.yacy.server.http.ChunkedInputStream;
import org.jwat.common.HeaderLine;
import org.jwat.common.HttpHeader;
import org.jwat.warc.WarcConstants;
import org.jwat.warc.WarcReader;
import org.jwat.warc.WarcReaderFactory;
import org.jwat.warc.WarcRecord;

/**
 * Web Archive file format reader to process the warc archive content (responses)
 *
 * Warc format specification ISO 28500
 * https://archive.org/details/WARCISO28500Version1Latestdraft
 * http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf
 *
 * http://archive-access.sourceforge.net/warc/warc_file_format-0.9.html
 * http://archive-access.sourceforge.net/warc/
 */
public class WarcImporter extends Thread implements Importer {

    static public Importer job; // static object to assure only one importer is running (if started from a servlet, this object is used to store the thread)

    private final InputStream source; // current input warc archive
    private String name; // file name of input source
    
    private int recordCnt; // number of responses indexed (for statistic)
    private long startTime; // (for statistic)
    private final long sourceSize; // length of the input source (for statistic)
    private long consumed; // bytes consumed from input source (for statistic)

    public WarcImporter(InputStream f) {
        source = f;
        recordCnt = 0;
        sourceSize = -1;
    }

    /**
     * Init the WarcImporter with input stream with a informational filename or
     * url als info for calls to the importer methode source() which returns
     * the urlinfo. Otherwise this methode is equivalent to WarchImporter(inputstream)
     * @param f the input stream to read the warc archive from
     * @param urlinfo a info like the url or the filename
     */
    public WarcImporter (InputStream f, String urlinfo) {
        this(f);
        name = urlinfo;
    }

    public WarcImporter(File f) throws FileNotFoundException{
       name = f.getName();
       sourceSize = f.length();
       source = new FileInputStream(f);
    }

    /**
     * Reads a Warc file and adds all contained responses to the index.
     * The reader automatically handles plain or gzip'd warc files
     *
     * @param f inputstream for the warc file
     * @throws IOException
     */
    public void indexWarcRecords(InputStream f) throws IOException {

        byte[] content;
        job = this;
        startTime = System.currentTimeMillis();

        WarcReader localwarcReader = WarcReaderFactory.getReader(f);
        WarcRecord wrec = localwarcReader.getNextRecord();
        while (wrec != null) {

            HeaderLine hl = wrec.getHeader(WarcConstants.FN_WARC_TYPE);
            if (hl != null && hl.value.equals(WarcConstants.RT_RESPONSE)) { // filter responses

                hl = wrec.getHeader(WarcConstants.FN_WARC_TARGET_URI);
                DigestURL location = new DigestURL(hl.value);

                HttpHeader http = wrec.getHttpHeader();

                if (http != null && http.statusCode == 200) { // process http response header OK (status 200)

                    if (TextParser.supportsMime(http.contentType) == null) { // check availability of parser

                        InputStream istream = wrec.getPayloadContent();
                        hl = http.getHeader(HeaderFramework.TRANSFER_ENCODING);
                        if (hl != null && hl.value.contains("chunked")) {
                            // because chunked stream.read doesn't read source fully, make sure all chunks are read
                            istream = new ChunkedInputStream(istream);
                            final ByteBuffer bbuffer = new ByteBuffer();
                            int c;
                            while ((c = istream.read()) >= 0) {
                                bbuffer.append(c);
                            }
                            content = bbuffer.getBytes();
                        } else {
                            content = new byte[(int) http.getPayloadLength()];
                            istream.read(content, 0, content.length);
                        }
                        istream.close();

                        RequestHeader requestHeader = new RequestHeader();

                        ResponseHeader responseHeader = new ResponseHeader(http.statusCode);
                        for (HeaderLine hx : http.getHeaderList()) { // include all original response headers for parser
                            responseHeader.put(hx.name, hx.value);
                        }

                        final Request request = new Request(
                                null,
                                location,
                                requestHeader.referer() == null ? null : requestHeader.referer().hash(),
                                "warc",
                                responseHeader.lastModified(),
                                Switchboard.getSwitchboard().crawler.defaultRemoteProfile.handle(), // use remote profile (to index text & media, without writing to cache
                                0,
                                Switchboard.getSwitchboard().crawler.defaultRemoteProfile.timezoneOffset());

                        final Response response = new Response(
                                request,
                                requestHeader,
                                responseHeader,
                                Switchboard.getSwitchboard().crawler.defaultRemoteProfile,
                                false,
                                content
                        );

                        Switchboard.getSwitchboard().toIndexer(response);
                        recordCnt++;
                    }
                }
            }
            this.consumed = localwarcReader.getConsumed();
            wrec = localwarcReader.getNextRecord();
        }
        localwarcReader.close();
        ConcurrentLog.info("WarcImporter", "Indexed " + recordCnt + " documents");
        job = null;
    }

    @Override
    public void run() {
        try {
            this.indexWarcRecords(this.source);
        } catch (IOException ex) {
            ConcurrentLog.info("WarcImporter", ex.getMessage());
        }
    }

    /**
     * Filename of the input source
     * @return
     */
    @Override
    public String source() {
        return this.name;
    }

    /**
     * Number of responses (pages) indexed
     * @return
     */
    @Override
    public int count() {
        return this.recordCnt;
    }

    /**
     * Indexed responses per second
     * @return
     */
    @Override
    public int speed() {
        if (this.recordCnt == 0) return 0;
        return (int) (this.recordCnt / Math.max(0L, runningTime() ));
    }

    /**
     * Duration in seconds running, working on the current import source
     * @return duration in seconds
     */
    @Override
    public long runningTime() {
        return (System.currentTimeMillis() - this.startTime) / 1000L;
    }

    /**
     * Estimate on time remaining calculated from length of input source and
     * processed bytes.
     * @return duration in seconds
     */
    @Override
    public long remainingTime() {
        if (this.consumed == 0) {
            return 0;
        } else {
            long speed = this.consumed / runningTime();
            return (this.sourceSize - this.consumed) / speed;
        }
    }

    @Override
    public String status() {
        return "";
    }

}