package io.github.infolis.infolink.querying;
import io.github.infolis.util.RegexUtils;
import io.github.infolis.util.URLParamEncoder;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
//import java.net.URLEncoder;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
* Class for matching dataset reference strings to entries in the dara dataset
* repository.
*
* @author kata
* @version 2014-01-27
*
*/
public class DaraWebMatcher {

    /** Separator between the fields of one line of the query cache file. */
    private static final String CACHE_DELIMITER = "--@--";

    /** Template of a search URL: base URL, encoded title, maximum number of hits. */
    private static final String QUERY_FORMAT = "%s?title=%s&max=%s&lang=de";

    String searchInterface;
    String queryCache;
    String externalDatasetURLs;

    /**
     * Class constructor specifying the repository's search interface base URL,
     * the query cache file and URL list for external datasets to use.
     *
     * If no query cache or URL list shall be used, set corresponding parameter
     * to null.
     *
     * If non-existing query cache is specified, one will be created when the
     * first query result is written to it.
     *
     * @param searchInterface the repository's search interface base URL
     * @param queryCache path to the query cache (or null if no cache shall be
     * used)
     * @param externalDatasetURLs path to the list of URLs for external datasets
     * (or null if no URL list shall be used)
     */
    DaraWebMatcher(String searchInterface, String queryCache, String externalDatasetURLs) {
        this.searchInterface = searchInterface;
        this.queryCache = queryCache;
        this.externalDatasetURLs = externalDatasetURLs;
    }

    /**
     * Returns the hex representation of the UTF-8 bytes of the input string
     * <em>arg</em>.
     *
     * The bytes are interpreted as one non-negative number, so leading zero
     * digits are not printed and the empty string yields {@code "0"}.
     *
     * @param arg string to be represented in hex value
     * @return the hex representation of the input string <em>arg</em>
     * @throws UnsupportedEncodingException if UTF-8 is not supported
     */
    public String toHex(String arg) throws UnsupportedEncodingException {
        return String.format("%x", new BigInteger(1, arg.getBytes("UTF-8")));
    }

    /**
     * Constructs a search URL from the base URL and the query.
     *
     * If the search term cannot be encoded, the stack trace is printed and the
     * unencoded term is used instead.
     *
     * @param searchTerm the query term
     * @param maxNumber the maximum number of hits to be displayed
     * @return the search URL
     * @throws MalformedURLException if the resulting string is not a valid URL
     */
    public URL constructURL(String searchTerm, int maxNumber) throws MalformedURLException {
        String title;
        try {
            // URLEncoder transforms plain text into the application/x-www-form-urlencoded MIME format
            // as described in the HTML specification (GET-style URLs or POST forms)
            // does not work with the new dara search function
            //URLEncoder.encode(searchTerm, "UTF-8"),
            title = URLParamEncoder.encode(searchTerm);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            title = searchTerm;
        }
        return new URL(String.format(QUERY_FORMAT, this.searchInterface, title, String.valueOf(maxNumber)));
    }

    /**
     * Reads and returns the content from the query response page.
     *
     * The response stream is read completely, closed afterwards and decoded
     * as UTF-8 (the charset that is requested via the Accept-Charset header).
     *
     * @param url the URL to read from
     * @return the contents of the page
     * @throws IOException if connecting to or reading from the URL fails
     */
    public String readFromURL(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        connection.setRequestProperty("Accept-Charset", "UTF-8");
        //connection.setConnectTimeout(6000);
        System.out.println("Reading from url...");
        // make sure that all data is read
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        InputStream in = connection.getInputStream();
        try {
            byte[] chunk = new byte[1024];
            int count;
            while ((count = in.read(chunk, 0, chunk.length)) > -1) {
                collected.write(chunk, 0, count);
            }
        } finally {
            closeQuietly(in);
        }
        System.out.println(collected.size() + " bytes read.");
        // decode explicitly instead of relying on the platform default charset
        String content = collected.toString("UTF-8");
        System.out.println("Done reading from url.");
        return content;
    }

    /**
     * Parses the HTML output of the dara search function and returns a map with
     * dataset DOIs (keys) and names (values).
     *
     * Only list items ("li" elements) that provide both a name and a DOI are
     * included in the result.
     *
     * @param html dara HTML output
     * @return a map containing dataset DOIs (keys) and dataset names (values)
     */
    public Map<String, String> parseHTML(String html) {
        Map<String, String> matchingStudyMap = new HashMap<String, String>();
        Document doc = Jsoup.parseBodyFragment(html);
        Elements hitlist = doc.getElementsByTag("li");
        for (Element hit : hitlist) {
            //TODO: search for tag "a" first to limit elements to search by attribute value?
            Elements names = hit.getElementsByAttributeValueMatching("href", "/dara/study/web_show?.*");
            Elements dois = hit.getElementsByAttributeValueContaining("href", "http://dx.doi.org");
            // each entry has exactly one name and one doi element
            //TODO: except for some datasets that are not registered but only referenced in dara!
            // e.g. "OECD Employment Outlook" -> no doi listed here -> ignored
            String studyName = lastText(names);
            String studyDoi = lastText(dois);
            // compare by content: "!=" on strings only compares object identity
            if (!studyName.isEmpty() && !studyDoi.isEmpty()) {
                System.out.println("name: " + studyName);
                System.out.println("doi: " + studyDoi);
                matchingStudyMap.put(studyDoi, studyName);
            }
        }
        return matchingStudyMap;
    }

    /**
     * Returns the trimmed text of the last element in <em>elements</em>.
     *
     * @param elements the elements to read the text from
     * @return the trimmed text of the last element or the empty string if
     * there is no element
     */
    private static String lastText(Elements elements) {
        String text = "";
        for (Element element : elements) {
            text = element.text().trim();
        }
        return text;
    }

    /**
     * Reads this queryCache to find DOIs and names of previously queried
     * dataset names.
     *
     * The first cache line containing <em>url</em> is used. If that line
     * lists no datasets, the returned map consists of a single entry having
     * the empty string as key and value. If the query is not found in the
     * cache, no cache is specified, the cache file does not exist (yet) or
     * cannot be read, an empty map is returned.
     *
     * @param url the query to find the dataset entries
     * @return a map containing dataset DOIs (key) and names (value)
     */
    public Map<String, String> readFromCache(String url) {
        Map<String, String> res = new HashMap<String, String>();
        if (this.queryCache == null) {
            return res;
        }
        File f = new File(this.queryCache);
        // the cache file is created by the first write only - nothing to read before that
        if (!f.isFile()) {
            return res;
        }
        BufferedReader reader = null;
        try {
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
            String text;
            while ((text = reader.readLine()) != null) {
                if (!text.contains(url)) {
                    continue;
                }
                String[] data = text.split(CACHE_DELIMITER);
                // query is in cache but no data can be found in dara - return marker entry
                if (data.length < 3) {
                    res.put("", "");
                    return res;
                }
                // each query has n dataset names with n dois
                // data[0] = the query, therefore start at index 1
                // end at data.length -2 because data[i+1] is accessed in each iteration
                for (int i = 1; i < data.length - 1; i += 2) {
                    // every first entry of pair: study name
                    // every second entry of pair: study doi
                    res.put(data[i + 1], data[i]);
                }
                return res;
            }
            return res;
        } catch (IOException e) {
            e.printStackTrace();
            return new HashMap<String, String>();
        } finally {
            // close the reader on every path, including read errors
            closeQuietly(reader);
        }
    }

    /**
     * Searches for matching (= similar to <em>studyname</em>) dataset names
     * in this <em>externalDatasetURLs</em> listing datasets along with URLs to
     * their landing pages.
     *
     * Each line of the list is expected to have the form "name;url". Lines
     * lacking a name or a URL are ignored.
     *
     * @param studyname name of the dataset to be matched
     * @return string representation of a URL pointing to the matching dataset
     * record or null if no URL list is specified, no entry matches or the
     * list cannot be read
     */
    public String match_external(String studyname) {
        System.out.println(studyname);
        if (this.externalDatasetURLs == null) {
            return null;
        }
        BufferedReader reader = null;
        try {
            File f = new File(this.externalDatasetURLs);
            reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), "UTF-8"));
            String text;
            while ((text = reader.readLine()) != null) {
                String[] nameUrl = text.split(";");
                // skip blank or incomplete lines: an empty name would match every
                // studyname and a missing URL field cannot be returned
                if (nameUrl.length < 2 || nameUrl[0].isEmpty()) {
                    continue;
                }
                // studyname might contain additional info, e.g. year specifications
                // therefore search for listed title inside of studyname instead of checking whether both are equal
                if (studyname.contains(nameUrl[0])) {
                    return nameUrl[1];
                }
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeQuietly(reader);
        }
        return null;
    }

    /**
     * Splits <em>searchTerm</em> into several terms if enumeration markers
     * are present.
     *
     * The result of the first enumeration marker that splits the term into
     * more than one part is returned.
     *
     * @param searchTerm string that might represent an enumeration of different
     * terms
     * @return array of terms if enumeration markers are present or empty array
     * otherwise
     */
    String[] getEnumeratedTerms(String searchTerm) {
        for (String enumerator : RegexUtils.enumeratorList) {
            String[] newTerms = searchTerm.split(enumerator);
            if (newTerms.length > 1) {
                return newTerms;
            }
        }
        return new String[0];
    }

    /**
     * Matches the assumed dataset name <em>searchTerm</em> to records in
     * dara having a similar name. The computation of string similarity is done
     * by dara's search function. Querying dara is carried out using the dara
     * web interface to ensure accessibility from outside of GESIS. However,
     * this leads to high processing times both for querying and parsing the
     * results. If a query cache is specified for this instance, it is
     * consulted before querying and updated afterwards.
     *
     * If dara returns no hit, the search term is split at enumeration markers
     * and each part is matched separately.
     *
     * @param searchTerm assumed dataset name to be matched to dara records
     * @return a map containing matching dataset DOIs (keys) and names (values)
     */
    public Map<String, String> match(String searchTerm) {
        URL url;
        try {
            url = constructURL(searchTerm, 600);
            System.out.println(url);
            System.out.println("\n" + searchTerm);
        } catch (MalformedURLException e) {
            e.printStackTrace();
            return new HashMap<String, String>();
        }
        Map<String, String> res;
        // read file queryCache - use saved results instead of querying
        if (this.queryCache != null) {
            res = readFromCache(url.toString());
            // query was found in cache
            if (!res.isEmpty()) {
                System.out.println("Found query in cache for term: " + searchTerm);
                // query was in cache but no data was specified i.e. study is not registered in dara
                if (res.containsKey("")) {
                    return new HashMap<String, String>();
                }
                System.out.println(res.toString());
                return res;
            }
        }
        // if no cache is used or entry is not found in the cache, query dara
        try {
            res = parseHTML(readFromURL(url));
        } catch (IOException ioe) {
            ioe.printStackTrace();
            return new HashMap<String, String>();
        }
        // if result is empty, check if studytitle maybe is an enumeration and search for parts!
        if (res.isEmpty()) {
            for (String term : getEnumeratedTerms(searchTerm)) {
                String part = term.trim();
                // ignore empty parts (an empty title is no meaningful query)
                // and parts consisting of digits only
                if (!part.isEmpty() && !part.matches("\\d+\\s*")) {
                    res.putAll(match(part));
                }
            }
        }
        // write results to cache
        // empty results in the cache are valuable too -> prevents repeated searching for non-registered studies
        if (this.queryCache != null) {
            writeToCache(url.toString(), res);
        }
        return res;
    }

    /**
     * Appends the results of a dara query to this queryCache. The cache file
     * is created if it does not exist.
     *
     * One line is written per query: the query url followed by the name and
     * the DOI of each dataset, all fields being separated by "--@--".
     *
     * @param url the dara query url
     * @param res the parsed dara response for the specified query url
     */
    private void writeToCache(String url, Map<String, String> res) {
        StringBuilder line = new StringBuilder(url);
        for (Map.Entry<String, String> entry : res.entrySet()) {
            // value = dataset name, key = dataset doi
            line.append(CACHE_DELIMITER).append(entry.getValue());
            line.append(CACHE_DELIMITER).append(entry.getKey());
        }
        String newLine = line.toString();
        BufferedWriter out = null;
        try {
            System.out.println("Writing query to cache: " + newLine);
            File f = new File(this.queryCache);
            out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f, true), "UTF-8"));
            out.write(newLine + System.getProperty("line.separator"));
            // close explicitly so that errors while flushing are reported
            out.close();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // no-op after a successful close, releases the file after a failed write
            closeQuietly(out);
        }
    }

    /**
     * Closes <em>resource</em> ignoring errors raised while closing.
     *
     * @param resource the resource to close (may be null)
     */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // nothing sensible can be done if closing fails
        }
    }

    /**
     * Queries "http://www.da-ra.de/dara/study/web_search_show" for the
     * specified dataset name. Optionally uses cache file and external dataset
     * URL list if specified.
     *
     * @param args args[0]: dataset name(s); args[1]: path of cache file or
     * null; args[2]: path of URL list or null
     */
    /*
    public static void main(String[] args)
    {
    if (args.length == 0) {
    System.out.println("Usage: StudyMatcher <datasetName> [<cacheFile>] [<urlListFile>]");
    System.out.println(" <datasetName> dataset name");
    System.out.println(" <cacheFile> path of cache file");
    System.out.println(" <urlListFile> name of file containing URL list");
    System.exit(1);
    }
    if (args.length == 3) { StudyMatcher matcher = new StudyMatcher("http://www.da-ra.de/dara/study/web_search_show", args[1], args[2]); System.out.println(matcher.match(args[0]));}
    if (args.length == 2) { StudyMatcher matcher = new StudyMatcher("http://www.da-ra.de/dara/study/web_search_show", args[1], null); System.out.println(matcher.match(args[0]));}
    if (args.length == 1) { StudyMatcher matcher = new StudyMatcher("http://www.da-ra.de/dara/study/web_search_show", null, null); System.out.println(matcher.match(args[0]));}
    }*/
}