package gr.iti.mklab.download;
import gr.iti.mklab.visual.mapreduce.HadoopImageDownload;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.util.concurrent.Callable;
import java.util.concurrent.CompletionService;
import java.util.concurrent.ExecutorCompletionService;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.Future;
import java.util.concurrent.TimeUnit;
/**
* This class implements multi-threaded image downloading.
*
* @author Eleftherios Spyromitros-Xioufis
*
*/
public class ImageDownloader {
private ExecutorService downloadExecutor;
private CompletionService<ImageDownloadResult> pool;
/** The current number of tasks whose termination is pending. **/
private int numPendingTasks;
/**
* The maximum allowable number of pending tasks, used to limit the memory usage.
*/
private final int maxNumPendingTasks;
/**
* The folder where the original image and/or its thumbnail should be saved.
**/
private String downloadFolder;
/**
* Whether the original image should be saved.
*/
private boolean saveOriginal;
/**
* Whether a thumb of the original image should be saved.
*/
private boolean saveThumb;
/**
* Whether redirects should be followed.
*/
private boolean followRedirects;
/**
* Constructor of the multi-threaded download class.
*
* @param numThreads
* the number of download threads to use
* @param downloadFolder
* the download folder
*/
public ImageDownloader(String downloadFolder, int numThreads) {
this.downloadFolder = downloadFolder;
saveOriginal = false;
saveThumb = true;
followRedirects = false;
downloadExecutor = Executors.newFixedThreadPool(numThreads);
pool = new ExecutorCompletionService<ImageDownloadResult>(downloadExecutor);
numPendingTasks = 0;
maxNumPendingTasks = numThreads * 10;
}
/**
* Submits a new image download task.
*
* @param URL
* The url of the image
* @param id
* The id of the image (used to name the image file after download)
*/
public void submitImageDownloadTask(String URL, String id) {
Callable<ImageDownloadResult> call = new ImageDownload(URL, id, downloadFolder, saveThumb,
saveOriginal, followRedirects);
pool.submit(call);
numPendingTasks++;
}
/**
* Submits a new hadoop image download task.
*
* @param URL
* The url of the image
* @param id
* The id of the image (used to name the image file after download)
*/
public void submitHadoopDownloadTask(String URL, String id) {
Callable<ImageDownloadResult> call = new HadoopImageDownload(URL, id, followRedirects);
pool.submit(call);
numPendingTasks++;
}
/**
* Gets an image download results from the pool.
*
* @return the download result, or null in no results are ready
* @throws Exception
* for a failed download task
*/
public ImageDownloadResult getImageDownloadResult() throws Exception {
Future<ImageDownloadResult> future = pool.poll();
if (future == null) { // no completed tasks in the pool
return null;
} else {
try {
ImageDownloadResult imdr = future.get();
return imdr;
} catch (Exception e) {
throw e;
} finally {
// in any case (Exception or not) the numPendingTask should be reduced
numPendingTasks--;
}
}
}
/**
* Gets an image download result from the pool, waiting if necessary.
*
* @return the download result
* @throws Exception
* for a failed download task
*/
public ImageDownloadResult getImageDownloadResultWait() throws Exception {
try {
ImageDownloadResult imdr = pool.take().get();
return imdr;
} catch (Exception e) {
throw e;
} finally {
// in any case (Exception or not) the numPendingTask should be reduced
numPendingTasks--;
}
}
/**
* Returns true if the number of pending tasks is smaller than the maximum allowable number.
*
* @return
*/
public boolean canAcceptMoreTasks() {
if (numPendingTasks < maxNumPendingTasks) {
return true;
} else {
return false;
}
}
public void setFollowRedirects(boolean followRedirects) {
this.followRedirects = followRedirects;
}
public void setSaveOriginal(boolean saveOriginal) {
this.saveOriginal = saveOriginal;
}
public void setSaveThumb(boolean saveThumb) {
this.saveThumb = saveThumb;
}
/**
* Shuts the download executor down, waiting for up to 60 seconds for the remaining tasks to complete. See
* http://docs.oracle.com/javase/7/docs/api/java/util/concurrent/ExecutorService.html
*
*/
public void shutDown() {
downloadExecutor.shutdown(); // Disable new tasks from being submitted
try {
// Wait a while for existing tasks to terminate
if (!downloadExecutor.awaitTermination(60, TimeUnit.SECONDS)) {
downloadExecutor.shutdownNow(); // Cancel currently executing tasks
// Wait a while for tasks to respond to being cancelled
if (!downloadExecutor.awaitTermination(60, TimeUnit.SECONDS))
System.err.println("Pool did not terminate");
}
} catch (InterruptedException ie) {
// (Re-)Cancel if current thread also interrupted
downloadExecutor.shutdownNow();
// Preserve interrupt status
Thread.currentThread().interrupt();
}
}
/**
* This method exemplifies multi-threaded image download from a list of urls. It uses 5 download threads.
*
* @param dowloadFolder
* Full path to the folder where the images are downloaded
* @param urlsFile
* Full path to the file that contains the ids and urls (space separated) of the images (one
* per line)
* @param numUrls
* The total number of urls to consider
* @param urlsToSkip
* How many urls (from the top of the file to be skipped)
* @throws Exception
*/
public static void downloadFromUrlsFile(String dowloadFolder, String urlsFile, int numUrls, int urlsToSkip)
throws Exception {
long start = System.currentTimeMillis();
int numThreads = 10;
BufferedReader in = new BufferedReader(new FileReader(new File(urlsFile)));
for (int i = 0; i < urlsToSkip; i++) {
in.readLine();
}
ImageDownloader downloader = new ImageDownloader(dowloadFolder, numThreads);
int submittedCounter = 0;
int completedCounter = 0;
int failedCounter = 0;
String line = "";
while (true) {
String url;
String id = "";
// if there are more task to submit and the downloader can accept more tasks then submit
while (submittedCounter < numUrls && downloader.canAcceptMoreTasks()) {
line = in.readLine();
url = line.split("\\s+")[1];
id = line.split("\\s+")[0];
downloader.submitImageDownloadTask(url, id);
submittedCounter++;
}
// if are submitted taks that are pending completion ,try to consume
if (completedCounter + failedCounter < submittedCounter) {
try {
downloader.getImageDownloadResultWait();
completedCounter++;
System.out.println(completedCounter + " downloads completed!");
} catch (Exception e) {
failedCounter++;
System.out.println(failedCounter + " downloads failed!");
System.out.println(e.getMessage());
}
}
// if all tasks have been consumed then break;
if (completedCounter + failedCounter == numUrls) {
downloader.shutDown();
in.close();
break;
}
}
long end = System.currentTimeMillis();
System.out.println("Total time: " + (end - start) + " ms");
System.out.println("Downloaded images: " + completedCounter);
System.out.println("Failed images: " + failedCounter);
}
/**
* Calls the downloadFromUrlsFile.
*
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
String dowloadFolder = "images/";
String urlsFile = "urls.txt";
int numUrls = 1000;
int urlsToSkip = 0;
downloadFromUrlsFile(dowloadFolder, urlsFile, numUrls, urlsToSkip);
}
}