package com.zillabyte.motherbrain.flow.components.builtin;
import java.net.URI;
import java.util.List;
import java.util.concurrent.CountDownLatch;
import org.apache.log4j.Logger;
import com.ning.http.client.AsyncCompletionHandler;
import com.ning.http.client.AsyncHttpClient;
import com.ning.http.client.AsyncHttpClientConfig;
import com.ning.http.client.AsyncHttpClientConfig.Builder;
import com.ning.http.client.HttpResponseBodyPart;
import com.ning.http.client.Response;
import com.zillabyte.motherbrain.flow.Component;
import com.zillabyte.motherbrain.flow.FlowCompilationException;
import com.zillabyte.motherbrain.flow.MapTuple;
import com.zillabyte.motherbrain.flow.StreamBuilder.ComponentStreamBuilder;
import com.zillabyte.motherbrain.flow.collectors.OutputCollector;
import com.zillabyte.motherbrain.flow.components.ComponentInput;
import com.zillabyte.motherbrain.flow.components.ComponentOutput;
import com.zillabyte.motherbrain.flow.config.FlowConfig;
import com.zillabyte.motherbrain.flow.operations.OperationException;
import com.zillabyte.motherbrain.flow.operations.builtin.Clumper;
import com.zillabyte.motherbrain.relational.ColumnDef;
import com.zillabyte.motherbrain.utils.MeteredLog;
import com.zillabyte.motherbrain.utils.Utils;
public class FetchUrlComponent {
public static class Handler extends Clumper {
private static final long serialVersionUID = 8362569227165803567L;
private transient AsyncHttpClient _asyncHttpClient;
public Handler(String name, int clumpCount) {
super(name, clumpCount);
}
@Override
public void prepare() {
Builder builder = new AsyncHttpClientConfig.Builder();
AsyncHttpClientConfig asyncConfig = builder.setAllowPoolingConnection(true)
.setConnectionTimeoutInMs(HTTP_TIMEOUT)
.setRequestTimeoutInMs(HTTP_TIMEOUT)
.setFollowRedirects(false)
.build();
_asyncHttpClient = new AsyncHttpClient(asyncConfig);
}
@Override
public int getTargetParallelism() {
// TODO: fix this
return 20;
// if (config.containsKey("parallelism")) {
// return Integer.parseInt( config.get("parallelism").toString() );
// } else {
// return super.getTargetParallelism();
// }
}
@Override
public void execute(List<MapTuple> tuples, final OutputCollector collector) throws OperationException {
if (tuples.isEmpty())
return;
final CountDownLatch latch = new CountDownLatch(tuples.size());
try {
for(final MapTuple t : tuples) {
// Prepare the URL
String rawUrl = (String)t.get("url");
if (rawUrl.contains("://") == false) rawUrl = "http://" + rawUrl;
final URI url = new URI(rawUrl);
// Start the requests...
_asyncHttpClient.prepareGet(rawUrl).execute(new AsyncCompletionHandler<Void>(){
private int _size = 0;
@Override
public Void onCompleted(Response response) throws Exception {
try {
// Are we dealing with a non-text type?
if (response.getContentType() != null && response.getContentType().contains("text") == false) {
logger().error("Skipping " + url.toString() + " because it is not text.");
return null;
}
// Success!
MeteredLog.info(logger(), "fetched: " + response.getUri().toString());
MapTuple t = MapTuple
.create("url", url.toString())
.put("content", response.getResponseBody())
.put("code", response.getStatusCode());
if (response.isRedirected()) {
t.put("redirect", url.resolve(response.getHeader("Location")).toString());
}
collector.emit(t);
return null;
} catch(Exception e) {
_log.error("error in fetcher: " + e);
throw new Exception(e);
} finally {
latch.countDown();
}
}
@Override
public void onThrowable(Throwable e){
latch.countDown();
logger().error("Unable to fetch: " + url.toString() + " (" + e.getMessage() + ")");
e.printStackTrace();
}
@Override
public STATE onBodyPartReceived(HttpResponseBodyPart bodyPart) throws Exception {
_size += bodyPart.length();
if (_size > MAX_BODY_SIZE) {
logger().error("Skipping " + url.toString() + " because it is too large.");
return STATE.ABORT;
}
return super.onBodyPartReceived(bodyPart);
}
});
}
// Wait for the above to finish...
latch.await();
} catch (InterruptedException e) {
_log.error("interrupted");
} catch (Exception e) {
_log.error("exception: " + e.getMessage());
}
}
}
protected static final int HTTP_TIMEOUT = 5000;
private static final int MAX_PARALLELISM = 50;
private static final int MAX_BODY_SIZE = 400_000;
private static Logger _log = Utils.getLogger(FetchUrlComponent.class);
public static Component create(final FlowConfig config) throws FlowCompilationException {
Component c = new Component("fetch_url", config);
ComponentStreamBuilder sb = c.createStream(
new ComponentInput(
"input",
ColumnDef.createString("url")
),
"stream");
sb.aggregate(new Handler("fetch", MAX_PARALLELISM));
sb.outputs(new ComponentOutput(
"output",
ColumnDef.createString("url"),
ColumnDef.createInteger("code"),
ColumnDef.createString("content")
));
return c;
}
}