package org.loklak;

import java.util.ArrayList;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingDeque;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.LinkedBlockingDeque;

import org.json.JSONObject;
import org.loklak.data.DAO;
import org.loklak.objects.MessageEntry;
import org.loklak.objects.Timeline;
import org.loklak.tools.DateParser;

/**
 * A simple depth-limited crawler for Twitter search terms. Queries wait in a pending
 * deque; every processed result may stack new queries (hashtags, mentioned users and
 * tweet authors) with a depth reduced by one, until the depth reaches zero.
 */
public class Crawler {

    /** A single crawl task: the search query, the remaining crawl depth and the follow flags. */
    private static class Term {
        public String query;
        public int depth;
        public boolean followHashtags;
        public boolean followUsers;

        public Term(String query, int depth, boolean followHashtags, boolean followUsers) {
            this.query = query;
            this.depth = depth;
            this.followHashtags = followHashtags;
            this.followUsers = followUsers;
        }
    }

    /** queries waiting to be processed */
    private static BlockingDeque<Term> pending = new LinkedBlockingDeque<Term>();

    /** queries that have already been stacked, mapped to the time they were stacked */
    private static Map<String, Long> stacked = new ConcurrentHashMap<String, Long>();

    /**
     * Put a new query on the crawl stack.
     *
     * @param query          the search term to crawl
     * @param depth          the crawl depth; clamped to the range 0..4
     * @param followHashtags if true, also stack hashtags found in the results
     * @param followUsers    if true, also stack users mentioned in the results
     * @param atfront        if true, put the query at the front of the queue
     * @return true if the query was stacked, false if it had been stacked before
     */
    public static boolean stack(String query, int depth, boolean followHashtags, boolean followUsers, boolean atfront) {
        // when the queue runs empty, expire old entries from the stacked map so that
        // queries can be crawled again later
        if (pending.size() == 0) {
            Iterator<Map.Entry<String, Long>> i = stacked.entrySet().iterator();
            long timeout = System.currentTimeMillis() - DateParser.HOUR_MILLIS; // 1 hour: a user rarely posts more than 20 tweets an hour, so this should be sufficient
            while (i.hasNext()) {
                if (i.next().getValue().longValue() < timeout) i.remove();
            }
        }

        // stack each query only once within the timeout window
        if (stacked.containsKey(query)) return false;
        stacked.put(query, System.currentTimeMillis());

        Term nextTerm = new Term(query, Math.max(0, Math.min(4, depth)), followHashtags, followUsers);
        if (atfront) pending.addFirst(nextTerm); else pending.addLast(nextTerm);
        return true;
    }

    /**
     * Process one term from the crawl stack: scrape its results and stack the hashtags
     * and users found there with a depth reduced by one.
     *
     * @return the number of new terms that were put on the crawl stack
     */
    public static int process() {
        // take a term from the stack
        if (pending.size() == 0) return 0;
        Term term;
        try {
            term = pending.take();
        } catch (InterruptedException e) {
            return 0;
        }

        // execute the query
        Timeline tl = DAO.scrapeTwitter(null, term.query, Timeline.Order.CREATED_AT, 0, false, 10000, false);

        // if the depth of the query was 0, terminate here
        if (term.depth == 0) return 0;

        // collect hashtags and users from the result
        Set<String> newqueries = new HashSet<String>();
        for (MessageEntry t: tl) {
            // follow users and hashtags which appear in the tweet
            if (term.followUsers) {
                for (String user: t.getMentions()) if (user.length() >= 2) newqueries.add(user);
            }
            if (term.followHashtags) {
                for (String hashtag: t.getHashtags()) if (hashtag.length() >= 2) newqueries.add(hashtag);
            }
            // we always follow the users who are the authors of the tweets
            newqueries.add(t.getScreenName());
        }

        // put the hashtags and users on the stack with reduced depth
        int count = 0;
        for (String query: newqueries) {
            if (stack(query, term.depth - 1, term.followHashtags, term.followUsers, false)) count++;
        }

        // return the number of new terms on the crawl stack
        return count;
    }

    /**
     * Report the crawler state: the pending queries, the processed queries and their sizes.
     *
     * @return a JSON object describing the current crawler state
     */
    public static JSONObject toJSON() {
        ArrayList<String> pendingQueries = new ArrayList<String>();
        Set<String> processedQueries = new HashSet<String>();
        processedQueries.addAll(stacked.keySet());

        // everything that was stacked but is no longer pending counts as processed
        for (Term t: pending) {
            pendingQueries.add(t.query);
            processedQueries.remove(t.query);
        }

        JSONObject m = new JSONObject(true);
        m.put("pending_size", pending.size());
        m.put("stacked_size", stacked.size());
        m.put("processed_size", processedQueries.size());
        m.put("pending", pendingQueries.toArray(new String[pendingQueries.size()]));
        m.put("processed", processedQueries.toArray(new String[processedQueries.size()]));
        return m;
    }
}
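
/*
 * Hypothetical usage sketch (not part of the original file): a minimal driver class that
 * seeds the crawler with one query and drains the pending queue. It assumes the DAO backend
 * has already been initialized by the server startup; in loklak itself a periodic caretaker
 * thread is expected to call Crawler.process(). The class and the seed query are illustrative
 * only and show the stack/process/toJSON contract, not the project's actual entry point.
 */
class CrawlerUsageSketch {

    public static void main(String[] args) {
        // seed the crawl with a hashtag, depth 2, following both hashtags and mentioned users,
        // and push it to the front of the queue so it is processed first
        Crawler.stack("#fossasia", 2, true, true, true);

        // drain the queue: each process() call scrapes one pending term and returns the
        // number of newly stacked follow-up terms; stop once nothing is pending any more
        while (Crawler.toJSON().getInt("pending_size") > 0) {
            int added = Crawler.process();
            System.out.println("processed one term, " + added + " new terms stacked");
        }

        // print the final crawler state (pending, stacked and processed queries) as JSON
        System.out.println(Crawler.toJSON().toString(2));
    }
}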