package org.wikibrain.download;
import java.io.IOException;
import com.google.common.collect.HashMultimap;
import com.google.common.collect.Multimap;
import org.apache.commons.io.IOUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.wikibrain.core.cmd.FileMatcher;
import org.wikibrain.core.lang.Language;
import java.net.URL;
import java.util.*;
/**
*
* Fetches the links to the Wikipedia dumps of the specified file types and language
* from dumps.wikimedia.org, for use in generating a .tsv file of dump download links.
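*
* <p>A minimal usage sketch, mirroring the {@code main} method at the bottom of this class
* (the language, matcher, and dump date are example values):</p>
* <pre>{@code
* DumpLinkGetter getter = new DumpLinkGetter(
*         Language.getByLangCode("en"), Arrays.asList(FileMatcher.ARTICLES), "20130604");
* Multimap<FileMatcher, DumpLinkInfo> dumps = getter.getDumpFiles(getter.getFileLinks());
* }</pre>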
*
* @author Yulun Li
*
*/
public class DumpLinkGetter {
protected static final String BASEURL_STRING = "https://dumps.wikimedia.org";
private Language lang;
private List<FileMatcher> matchers;
private String dumpDate; // date of the dump in yyyyMMdd form, e.g. "20130604"
public DumpLinkGetter(Language lang, List<FileMatcher> matchers, String dumpDate) {
this.lang = lang;
this.matchers = matchers;
this.dumpDate = dumpDate;
}
/**
* Get the URL of the index wiki page of a specified language.
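* <p>For example (both values follow directly from the langCode substitution below):</p>
* <pre>{@code
* // "en"       -> "https://dumps.wikimedia.org/enwiki/"
* // "roa-tara" -> "https://dumps.wikimedia.org/roa_tarawiki/"
* }</pre>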
* @return URL of the dump index page for this language
*/
protected String getLanguageWikiUrl() {
// langCode with dashes like "roa-tara" should be 'roa_tara' in dump links
return BASEURL_STRING + "/" + lang.getLangCode().replace("-", "_") + "wiki/";
}
/**
* Get the links to files marked "done" (plus the MD5 checksum file) on the dump page of the specified language and dumpDate.
* @return site-relative links (hrefs) to the completed dump files, or an empty list if the dump is not yet complete
* @throws IOException if the dump page cannot be fetched
*/
public List<String> getFileLinks() throws IOException {
List<String> links = new ArrayList<String>();
URL dumpPageUrl = new URL(getLanguageWikiUrl() + dumpDate + "/");
String html = IOUtils.toString(dumpPageUrl.openStream(), "UTF-8");
if (!html.contains("Dump complete")) {
return links;
}
Document doc = Jsoup.parse(html);
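// Completed files appear as <li class="file"> items nested under <li class="done"> sections,
// and the link to the md5sums file sits in a <p class="checksum"> paragraph.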
Elements linkElements = doc.select("ul").select("li.done").select("li.file").select("a[href]");
linkElements.addAll(doc.select("p.checksum").select("a[href]"));
for (Element linkElement : linkElements) {
links.add(linkElement.attr("href"));
}
return links;
}
/**
* Return all links of a particular language that fit one of the given patterns.
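* <p>A sketch of consuming the result, given a constructed {@code DumpLinkGetter getter}
* ({@code getDownloadName()} is the same accessor used for the MD5 lookup below):</p>
* <pre>{@code
* Multimap<FileMatcher, DumpLinkInfo> dumps = getter.getDumpFiles(getter.getFileLinks());
* for (DumpLinkInfo info : dumps.values()) {
*     System.out.println(info.getDownloadName());
* }
* }</pre>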
* @param links file links from the dump page (see {@link #getFileLinks()})
* @return multimap from each FileMatcher to the DumpLinkInfo of every link it matched
* @throws IOException if the MD5 checksum file cannot be fetched
*/
public Multimap<FileMatcher, DumpLinkInfo> getDumpFiles(List<String> links) throws IOException {
Multimap<FileMatcher, DumpLinkInfo> dumpLinks = HashMultimap.create();
Map<String, String> md5s = getMd5Sums(links);
for (FileMatcher linkMatcher : matchers) {
    List<String> results = linkMatcher.match(links);
    for (String url : results) {
        URL linkURL = new URL(BASEURL_STRING + url);
        DumpLinkInfo linkInfo = new DumpLinkInfo(lang, dumpDate, linkMatcher, linkURL, linkMatcher.getNumber(url));
        linkInfo.setMd5(md5s.get(linkInfo.getDownloadName()));
        dumpLinks.put(linkMatcher, linkInfo);
    }
}
return dumpLinks;
}
/**
* Get the MD5 checksums of the dump files of the specified language and dumpDate,
* keyed by download file name.
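* <p>Each line of the md5sums file is expected to look like the following
* (the checksum and file name are illustrative):</p>
* <pre>{@code
* d41d8cd98f00b204e9800998ecf8427e  enwiki-20130604-pages-articles.xml.bz2
* }</pre>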
* @param links file links from the dump page; used to locate the md5sums file
* @return map from download file name to its MD5 checksum
* @throws IOException if the md5sums file cannot be fetched
*/
protected Map<String, String> getMd5Sums(List<String> links) throws IOException {
HashMap<String, String> md5s = new HashMap<String, String>();
if (links.isEmpty()) {
return md5s;
}
FileMatcher md5Matcher = FileMatcher.MD5;
List<String> md5Links = md5Matcher.match(links);
if (md5Links.isEmpty()) {
    return md5s;    // no md5sums file is listed for this dump
}
URL md5Url = new URL(BASEURL_STRING + md5Links.get(0));
List<String> lines = IOUtils.readLines(md5Url.openStream(), "UTF-8");
for (String line : lines) {
    // each line is "<md5sum>  <filename>"; the two-character separator is matched by \W{2}
    String[] parsedInfo = line.split("\\W{2}");
    String md5 = parsedInfo[0];
    String fileName = parsedInfo[1];
    md5s.put(fileName, md5);
}
return md5s;
}
public static void main(String[] args) throws IOException {
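// Quick manual smoke test: print the matched article-dump links for the English dump of 2013-06-04.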
DumpLinkGetter testGetter = new DumpLinkGetter(Language.getByLangCode("en"), Arrays.asList(FileMatcher.ARTICLES), "20130604");
// System.out.println(testGetter.getMd5Sums(testGetter.getFileLinks()));
System.out.println(testGetter.getDumpFiles(testGetter.getFileLinks()));
}
}