/** * Twitter Tools * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. * You may obtain a copy of the License at * * http://www.apache.org/licenses/LICENSE-2.0 * * Unless required by applicable law or agreed to in writing, software * distributed under the License is distributed on an "AS IS" BASIS, * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. * See the License for the specific language governing permissions and * limitations under the License. */ package cc.twittertools.download; import java.io.BufferedReader; import java.io.File; import java.io.FileInputStream; import java.io.FileOutputStream; import java.io.IOException; import java.io.InputStreamReader; import java.io.OutputStreamWriter; import java.util.Map; import java.util.Timer; import java.util.TimerTask; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.atomic.AtomicInteger; import java.util.zip.GZIPOutputStream; import org.apache.commons.cli.CommandLine; import org.apache.commons.cli.CommandLineParser; import org.apache.commons.cli.GnuParser; import org.apache.commons.cli.HelpFormatter; import org.apache.commons.cli.OptionBuilder; import org.apache.commons.cli.Options; import org.apache.commons.cli.ParseException; import org.apache.commons.lang.StringEscapeUtils; import org.apache.log4j.Logger; import com.google.common.base.Preconditions; import com.google.gson.Gson; import com.google.gson.JsonObject; import com.google.gson.JsonParser; import com.google.gson.JsonSyntaxException; import com.ning.http.client.AsyncCompletionHandler; import com.ning.http.client.AsyncHttpClient; import com.ning.http.client.AsyncHttpClientConfig; import com.ning.http.client.HttpResponseHeaders; import com.ning.http.client.HttpResponseStatus; import com.ning.http.client.Response; import com.ning.http.client.extra.ThrottleRequestFilter; public class AsyncEmbeddedJsonStatusBlockCrawler { private static final Logger LOG = Logger.getLogger(AsyncEmbeddedJsonStatusBlockCrawler.class); public static final String JSON_START = "<input type=\"hidden\" id=\"init-data\" class=\"json-data\" value=\""; public static final String JSON_END = "\">"; private static final int TWEET_BLOCK_SIZE = 500; private static final int MAX_CONNECTIONS = 100; private static final int CONNECTION_TIMEOUT = 10000; private static final int IDLE_CONNECTION_TIMEOUT = 10000; private static final int REQUEST_TIMEOUT = 10000; private static final int MAX_RETRY_ATTEMPTS = 2; private static final int WAIT_BEFORE_RETRY = 1000; private static final Timer timer = new Timer(true); private static final JsonParser JSON_PARSER = new JsonParser(); private static final Gson GSON = new Gson(); private final File file; private final File output; private final File repair; private final AsyncHttpClient asyncHttpClient; private final boolean noFollow; // key = statud id, value = tweet JSON private final ConcurrentSkipListMap<Long, String> crawl = new ConcurrentSkipListMap<Long, String>(); // key = statud id, value = data line private final ConcurrentSkipListMap<Long, String> crawl_repair = new ConcurrentSkipListMap<Long, String>(); private final AtomicInteger connections = new AtomicInteger(0); public AsyncEmbeddedJsonStatusBlockCrawler(File file, String output, String repair, boolean noFollow) throws IOException { this.file = Preconditions.checkNotNull(file); this.noFollow = noFollow; if (!file.exists()) { throw new IOException(file + " does not exist!"); } // check existence of output's parent directory this.output = new File(Preconditions.checkNotNull(output)); File parent = this.output.getParentFile(); if (parent != null && !parent.exists()) { throw new IOException(output + "'s parent directory does not exist!"); } // check existence of repair's parent directory (or set to null if no // repair file specified) if (repair != null) { this.repair = new File(repair); parent = this.repair.getParentFile(); if (parent != null && !parent.exists()) { throw new IOException(repair + "'s parent directory does not exist!"); } } else { this.repair = null; } AsyncHttpClientConfig config = new AsyncHttpClientConfig.Builder() .addRequestFilter(new ThrottleRequestFilter(MAX_CONNECTIONS)) .setConnectionTimeoutInMs(CONNECTION_TIMEOUT) .setIdleConnectionInPoolTimeoutInMs(IDLE_CONNECTION_TIMEOUT) .setRequestTimeoutInMs(REQUEST_TIMEOUT).setMaxRequestRetry(0).build(); this.asyncHttpClient = new AsyncHttpClient(config); } public static String getUrl(long id, String username) { Preconditions.checkNotNull(username); return String.format("http://twitter.com/%s/status/%d", username, id); } public void fetch() throws IOException { long start = System.currentTimeMillis(); LOG.info("Processing " + file); int cnt = 0; BufferedReader data = null; try { data = new BufferedReader(new InputStreamReader(new FileInputStream(file))); String line; while ((line = data.readLine()) != null) { try { String[] arr = line.split("\t"); long id = Long.parseLong(arr[0]); String username = (arr.length > 1) ? arr[1] : "a"; String url = getUrl(id, username); connections.incrementAndGet(); crawlURL(url, new TweetFetcherHandler(id, username, url, 0, !this.noFollow, line)); cnt++; if (cnt % TWEET_BLOCK_SIZE == 0) { LOG.info(cnt + " requests submitted"); } } catch (NumberFormatException e) { // parseLong continue; } } } catch (IOException e) { e.printStackTrace(); } finally { data.close(); } // Wait for the last requests to complete. LOG.info("Waiting for remaining requests (" + connections.get() + ") to finish!"); for (int i = 0; i < 10; i++) { if (connections.get() == 0) { break; } try { Thread.sleep(1000); } catch (Exception e) { e.printStackTrace(); } } asyncHttpClient.close(); long end = System.currentTimeMillis(); long duration = end - start; LOG.info("Total request submitted: " + cnt); LOG.info(crawl.size() + " tweets fetched in " + duration + "ms"); LOG.info("Writing tweets..."); int written = 0; OutputStreamWriter out = new OutputStreamWriter(new GZIPOutputStream( new FileOutputStream(output)), "UTF-8"); for (Map.Entry<Long, String> entry : crawl.entrySet()) { written++; out.write(entry.getValue() + "\n"); } out.close(); LOG.info(written + " statuses written."); if (this.repair != null) { LOG.info("Writing repair data file..."); written = 0; out = new OutputStreamWriter(new FileOutputStream(repair), "UTF-8"); for (Map.Entry<Long, String> entry : crawl_repair.entrySet()) { written++; out.write(entry.getValue() + "\n"); } out.close(); LOG.info(written + " statuses need repair."); } LOG.info("Done!"); } private class TweetFetcherHandler extends AsyncCompletionHandler<Response> { private final long id; private final String username; private final String url; private final int numRetries; private final boolean followRedirects; private final String line; private int httpStatus = -1; public TweetFetcherHandler(long id, String username, String url, int numRetries, boolean followRedirects, String line) { this.id = id; this.username = username; this.url = url; this.numRetries = numRetries; this.followRedirects = followRedirects; this.line = line; } public long getId() { return id; } public String getLine() { return line; } @Override public STATE onStatusReceived(HttpResponseStatus responseStatus) throws Exception { this.httpStatus = responseStatus.getStatusCode(); switch (this.httpStatus) { case 404: LOG.warn("Abandoning missing page: " + url); connections.decrementAndGet(); return STATE.ABORT; case 500: retry(); return STATE.ABORT; } return super.onStatusReceived(responseStatus); } @Override public STATE onHeadersReceived(HttpResponseHeaders headers) throws Exception { switch (this.httpStatus) { case 301: case 302: String redirect = headers.getHeaders().getFirstValue("Location"); if (redirect.contains("protected_redirect=true")) { LOG.warn("Abandoning protected account: " + url); connections.decrementAndGet(); } else if (redirect.contains("account/suspended")) { LOG.warn("Abandoning suspended account: " + url); connections.decrementAndGet(); } else if (redirect.contains("//status") || redirect.contains("login?redirect_after_login")) { LOG.warn("Abandoning deleted account: " + url); connections.decrementAndGet(); } else if (followRedirects) { crawlURL(redirect, new TweetFetcherHandler(id, username, redirect, numRetries, followRedirects, line)); } else { LOG.warn("Abandoning redirect: " + url); connections.decrementAndGet(); } return STATE.ABORT; } return super.onHeadersReceived(headers); } @Override public Response onCompleted(Response response) { switch (this.httpStatus) { case -1: case 301: case 302: case 404: case 500: return response; } // extract embedded JSON try { String html = response.getResponseBody("UTF-8"); int jsonStart = html.indexOf(JSON_START); int jsonEnd = html.indexOf(JSON_END, jsonStart + JSON_START.length()); if (jsonStart < 0 || jsonEnd < 0) { LOG.warn("Unable to find embedded JSON: " + url); retry(); return response; } String json = html.substring(jsonStart + JSON_START.length(), jsonEnd); json = StringEscapeUtils.unescapeHtml(json); JsonObject page = (JsonObject) JSON_PARSER.parse(json); JsonObject status = page.getAsJsonObject("embedData").getAsJsonObject("status"); // save the requested id status.addProperty("requested_id", new Long(id)); crawl.put(id, GSON.toJson(status)); connections.decrementAndGet(); return response; } catch (IOException e) { LOG.warn("Error (" + e + "): " + url); retry(); return response; } catch (JsonSyntaxException e) { LOG.warn("Unable to parse embedded JSON: " + url); retry(); return response; } catch (NullPointerException e) { LOG.warn("Unexpected format for embedded JSON: " + url); retry(); return response; } } @Override public void onThrowable(Throwable t) { retry(); } private void retry() { if (this.numRetries >= MAX_RETRY_ATTEMPTS) { LOG.warn("Abandoning after max retry attempts: " + url); crawl_repair.put(id, line); connections.decrementAndGet(); return; } timer.schedule(new RetryTask(id, username, url, numRetries + 1, followRedirects), WAIT_BEFORE_RETRY); } private class RetryTask extends TimerTask { private final long id; private final String username; private final String url; private final int numRetries; private final boolean followRedirects; public RetryTask(long id, String username, String url, int numRetries, boolean followRedirects) { this.id = id; this.username = username; this.url = url; this.numRetries = numRetries; this.followRedirects = followRedirects; } public void run() { crawlURL(url, new TweetFetcherHandler(id, username, url, numRetries, followRedirects, line)); } } } private void crawlURL(String url, TweetFetcherHandler handler) { try { asyncHttpClient.prepareGet(url).addHeader("Accept-Charset", "utf-8") .addHeader("Accept-Language", "en-US").execute(handler); } catch (IOException e) { LOG.warn("Abandoning due to error (" + e + "): " + url); crawl_repair.put(handler.getId(), handler.getLine()); connections.decrementAndGet(); } } private static final String DATA_OPTION = "data"; private static final String OUTPUT_OPTION = "output"; private static final String REPAIR_OPTION = "repair"; private static final String NOFOLLOW_OPTION = "noFollow"; @SuppressWarnings("static-access") public static void main(String[] args) throws Exception { Options options = new Options(); options.addOption(OptionBuilder.withArgName("path").hasArg() .withDescription("data file with tweet ids").create(DATA_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg() .withDescription("output file (*.gz)").create(OUTPUT_OPTION)); options.addOption(OptionBuilder.withArgName("path").hasArg() .withDescription("output repair file (can be used later as a data file)") .create(REPAIR_OPTION)); options.addOption(NOFOLLOW_OPTION, NOFOLLOW_OPTION, false, "don't follow 301 redirects"); CommandLine cmdline = null; CommandLineParser parser = new GnuParser(); try { cmdline = parser.parse(options, args); } catch (ParseException exp) { System.err.println("Error parsing command line: " + exp.getMessage()); System.exit(-1); } if (!cmdline.hasOption(DATA_OPTION) || !cmdline.hasOption(OUTPUT_OPTION)) { HelpFormatter formatter = new HelpFormatter(); formatter.printHelp(AsyncEmbeddedJsonStatusBlockCrawler.class.getName(), options); System.exit(-1); } String data = cmdline.getOptionValue(DATA_OPTION); String output = cmdline.getOptionValue(OUTPUT_OPTION); String repair = cmdline.getOptionValue(REPAIR_OPTION); boolean noFollow = cmdline.hasOption(NOFOLLOW_OPTION); new AsyncEmbeddedJsonStatusBlockCrawler(new File(data), output, repair, noFollow).fetch(); } }