package org.loklak;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingDeque;

import org.json.JSONObject;
import org.loklak.data.DAO;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.Timeline;
import org.loklak.tools.DateParser;

/**
 * A simple depth-limited crawler for Twitter search terms. Queries wait in a pending
 * deque; every processed result may stack new queries (hashtags, mentioned users and
 * tweet authors) with a depth reduced by one, until the depth reaches zero.
 */
public class Crawler {

    /** A single crawl task: the search query, the remaining crawl depth and the follow flags. */
    private static class Term {
        public String query;
        public int depth;
        public boolean followHashtags;
        public boolean followUsers;

        public Term(String query, int depth, boolean followHashtags, boolean followUsers) {
            this.query = query;
            this.depth = depth;
            this.followHashtags = followHashtags;
            this.followUsers = followUsers;
        }
    }

    /** queries waiting to be processed */
    private static BlockingDeque<Term> pending = new LinkedBlockingDeque<Term>();

    /** queries that have already been stacked, mapped to the time they were stacked */
    private static Map<String, Long> stacked = new ConcurrentHashMap<String, Long>();

    /**
     * Put a new query on the crawl stack.
     *
     * @param query          the search term to crawl
     * @param depth          the crawl depth; clamped to the range 0..4
     * @param followHashtags if true, also stack hashtags found in the results
     * @param followUsers    if true, also stack users mentioned in the results
     * @param atfront        if true, put the query at the front of the queue
     * @return true if the query was stacked, false if it had been stacked before
     */
    public static boolean stack(String query, int depth, boolean followHashtags, boolean followUsers, boolean atfront) {
        // when the queue runs empty, expire old entries from the stacked map so that
        // queries can be crawled again later
        if (pending.size() == 0) {
            Iterator<Map.Entry<String, Long>> i = stacked.entrySet().iterator();
            long timeout = System.currentTimeMillis() - DateParser.HOUR_MILLIS; // 1 hour: a user rarely posts more than 20 tweets an hour, so this should be sufficient
            while (i.hasNext()) {
                if (i.next().getValue().longValue() < timeout) i.remove();
            }
        }

        // stack each query only once within the timeout window
        if (stacked.containsKey(query)) return false;
        stacked.put(query, System.currentTimeMillis());

        Term nextTerm = new Term(query, Math.max(0, Math.min(4, depth)), followHashtags, followUsers);
        if (atfront) pending.addFirst(nextTerm); else pending.addLast(nextTerm);
        return true;
    }

    /**
     * Process one term from the crawl stack: scrape its results and stack the hashtags
     * and users found there with a depth reduced by one.
     *
     * @return the number of new terms that were put on the crawl stack
     */
    public static int process() {
        // take a term from the stack
        if (pending.size() == 0) return 0;
        Term term;
        try {
            term = pending.take();
        } catch (InterruptedException e) {
            return 0;
        }

        // execute the query
        Timeline tl = DAO.scrapeTwitter(null, term.query, Timeline.Order.CREATED_AT, 0, false, 10000, false);

        // if the depth of the query was 0, terminate here
        if (term.depth == 0) return 0;

        // collect hashtags and users from the result
        Set<String> newqueries = new HashSet<String>();
        for (MessageEntry t: tl) {
            // follow users and hashtags which appear in the tweet
            if (term.followUsers) {
                for (String user: t.getMentions()) if (user.length() >= 2) newqueries.add(user);
            }
            if (term.followHashtags) {
                for (String hashtag: t.getHashtags()) if (hashtag.length() >= 2) newqueries.add(hashtag);
            }
            // we always follow the users who are the authors of the tweets
            newqueries.add(t.getScreenName());
        }

        // put the hashtags and users on the stack with reduced depth
        int count = 0;
        for (String query: newqueries) {
            if (stack(query, term.depth - 1, term.followHashtags, term.followUsers, false)) count++;
        }

        // return the number of new terms on the crawl stack
        return count;
    }

    /**
     * Report the crawler state: the pending queries, the processed queries and their sizes.
     *
     * @return a JSON object describing the current crawler state
     */
    public static JSONObject toJSON() {
        ArrayList<String> pendingQueries = new ArrayList<String>();
        Set<String> processedQueries = new HashSet<String>();
        processedQueries.addAll(stacked.keySet());

        // everything that was stacked but is no longer pending counts as processed
        for (Term t: pending) {
            pendingQueries.add(t.query);
            processedQueries.remove(t.query);
        }

        JSONObject m = new JSONObject(true);
        m.put("pending_size", pending.size());
        m.put("stacked_size", stacked.size());
        m.put("processed_size", processedQueries.size());
        m.put("pending", pendingQueries.toArray(new String[pendingQueries.size()]));
        m.put("processed", processedQueries.toArray(new String[processedQueries.size()]));
        return m;
    }
}
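
/*
 * Hypothetical usage sketch (not part of the original file): a minimal driver class that
 * seeds the crawler with one query and drains the pending queue. It assumes the DAO backend
 * has already been initialized by the server startup; in loklak itself a periodic caretaker
 * thread is expected to call Crawler.process(). The class and the seed query are illustrative
 * only and show the stack/process/toJSON contract, not the project's actual entry point.
 */
class CrawlerUsageSketch {

    public static void main(String[] args) {
        // seed the crawl with a hashtag, depth 2, following both hashtags and mentioned users,
        // and push it to the front of the queue so it is processed first
        Crawler.stack("#fossasia", 2, true, true, true);

        // drain the queue: each process() call scrapes one pending term and returns the
        // number of newly stacked follow-up terms; stop once nothing is pending any more
        while (Crawler.toJSON().getInt("pending_size") > 0) {
            int added = Crawler.process();
            System.out.println("processed one term, " + added + " new terms stacked");
        }

        // print the final crawler state (pending, stacked and processed queries) as JSON
        System.out.println(Crawler.toJSON().toString(2));
    }
}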