/**
 *  DumpImporter
 *  Copyright 06.01.2016 by Michael Peter Christen, @0rb1t3r
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package org.loklak;

import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.concurrent.atomic.AtomicLong;

import org.eclipse.jetty.util.log.Log;
import org.json.JSONObject;
import org.loklak.data.DAO;
import org.loklak.data.IndexEntry;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.UserEntry;
import org.loklak.tools.storage.JsonFactory;
import org.loklak.tools.storage.JsonReader;
import org.loklak.tools.storage.JsonStreamReader;

/**
 * Background thread that polls the message-dump import directory and indexes
 * dump files one at a time, using several concurrent indexer threads per file.
 * A processed dump is shifted away so it is not imported twice.
 */
public class DumpImporter extends Thread {

    // number of user/message entries buffered before a bulk write to the index
    private static final int BULK_WRITE_THRESHOLD = 1500;

    // volatile: shutdown() is called from another thread, run() must observe the change
    private volatile boolean shallRun = true, isBusy = false;
    private int count = Integer.MAX_VALUE;

    /**
     * @param count maximum number of import dump files to consider per scan
     */
    public DumpImporter(int count) {
        this.count = count;
    }

    /**
     * ask the thread to shut down
     */
    public void shutdown() {
        this.shallRun = false;
        this.interrupt();
        Log.getLog().info("caught DumpImporter termination signal");
    }

    /**
     * @return true while a dump file is actively being imported
     */
    public boolean isBusy() {
        return this.isBusy;
    }

    @Override
    public void run() {
        // work loop: poll for dump files until shutdown() is called
        while (this.shallRun) try {
            this.isBusy = false;

            // scan dump input directory to import files
            Collection<File> import_dumps = DAO.message_dump.getImportDumps(this.count);

            // check if we can do anything
            if (import_dumps == null || import_dumps.isEmpty() || !DAO.wait_ready(Long.MAX_VALUE)) {
                // nothing to do; interrupt just wakes us up early to re-check shallRun
                try {Thread.sleep(10000);} catch (InterruptedException ignored) {}
                continue;
            }
            this.isBusy = true;

            // take only one file and process this file
            File import_dump = import_dumps.iterator().next();
            importDump(import_dump);

            this.isBusy = false;
        } catch (Throwable e) {
            // keep the importer alive on any failure; back off before the next attempt
            Log.getLog().warn("DumpImporter THREAD", e);
            try {Thread.sleep(10000);} catch (InterruptedException ignored) {}
        }
        Log.getLog().info("DumpImporter terminated");
    }

    /**
     * Import a single dump file: start concurrent indexer threads, wait for them
     * to finish (logging progress), then shift the dump so it is not re-imported.
     *
     * @param import_dump the dump file to import
     * @throws Exception on any failure; handled by the work loop in run()
     */
    private void importDump(final File import_dump) throws Exception {
        final JsonReader dumpReader = DAO.message_dump.getDumpReader(import_dump);
        final AtomicLong newTweets = new AtomicLong(0);
        Log.getLog().info("started import of dump file " + import_dump.getAbsolutePath());

        // we start concurrent indexing threads to process the json objects
        Thread[] indexerThreads = new Thread[dumpReader.getConcurrency()];
        for (int i = 0; i < dumpReader.getConcurrency(); i++) {
            indexerThreads[i] = newIndexerThread(dumpReader, newTweets);
            indexerThreads[i].start();
        }

        awaitIndexers(indexerThreads, newTweets, import_dump.getName());

        Log.getLog().info("finished import of dump file " + import_dump.getAbsolutePath() + ", " + newTweets.get() + " new tweets");

        // shift the dump file to prevent that it is imported again
        DAO.message_dump.shiftProcessedDump(import_dump.getName());
    }

    /**
     * Create one indexer thread that drains json objects from the dump reader,
     * converts them to user/message entries and bulk-writes them to the index.
     *
     * @param dumpReader shared reader; terminates on the POISON_JSON_MAP sentinel
     * @param newTweets  shared counter of successfully indexed tweets
     */
    private Thread newIndexerThread(final JsonReader dumpReader, final AtomicLong newTweets) {
        return new Thread() {
            @Override
            public void run() {
                JsonFactory tweet;
                try {
                    List<IndexEntry<UserEntry>> userBulk = new ArrayList<>();
                    List<IndexEntry<MessageEntry>> messageBulk = new ArrayList<>();
                    while ((tweet = dumpReader.take()) != JsonStreamReader.POISON_JSON_MAP) {
                        try {
                            JSONObject json = tweet.getJSON();
                            JSONObject user = (JSONObject) json.remove("user");
                            // a tweet without an embedded user cannot be indexed
                            if (user == null) continue;
                            UserEntry u = new UserEntry(user);
                            MessageEntry t = new MessageEntry(json);

                            // record user into search index
                            userBulk.add(new IndexEntry<UserEntry>(u.getScreenName(), t.getSourceType(), u));
                            messageBulk.add(new IndexEntry<MessageEntry>(t.getIdStr(), t.getSourceType(), t));
                            if (userBulk.size() > BULK_WRITE_THRESHOLD || messageBulk.size() > BULK_WRITE_THRESHOLD) {
                                DAO.users.writeEntries(userBulk);
                                DAO.messages.writeEntries(messageBulk);
                                userBulk.clear();
                                messageBulk.clear();
                            }
                            newTweets.incrementAndGet();
                        } catch (IOException e) {
                            // skip the broken entry, keep draining the reader
                            Log.getLog().warn(e);
                        }
                        // throttle while the queued indexer is under load
                        if (LoklakServer.queuedIndexing.isBusy()) {
                            try {Thread.sleep(200);} catch (InterruptedException ignored) {}
                        }
                    }
                    // flush the remaining partial bulks
                    try {
                        DAO.users.writeEntries(userBulk);
                        DAO.messages.writeEntries(messageBulk);
                    } catch (IOException e) {
                        Log.getLog().warn(e);
                    }
                } catch (InterruptedException e) {
                    Log.getLog().warn(e);
                }
            }
        };
    }

    /**
     * Block until all indexer threads have terminated, logging the import rate
     * roughly every 10 seconds.
     *
     * @param indexerThreads the running indexer threads
     * @param newTweets      shared counter of indexed tweets
     * @param dumpName       name of the dump file, for log messages
     */
    private void awaitIndexers(final Thread[] indexerThreads, final AtomicLong newTweets, final String dumpName) {
        boolean running = true;
        while (running) {
            long startTime = System.currentTimeMillis();
            long startCount = newTweets.get();
            running = false;
            for (Thread indexerThread : indexerThreads) {
                if (indexerThread.isAlive()) running = true;
            }
            try {Thread.sleep(10000);} catch (InterruptedException ignored) {}
            // guard against a zero runtime when the sleep was interrupted immediately
            long runtime = Math.max(1, System.currentTimeMillis() - startTime);
            long periodCount = newTweets.get() - startCount; // renamed: was shadowing the field 'count'
            Log.getLog().info("imported " + newTweets.get() + " tweets at " + (periodCount * 1000 / runtime) + " tweets per second from " + dumpName);
        }
    }
}