DaraWebMatcher.java example

Explorer

infoLink-master
- src
  - main
    - java
      - io
        github
        infolis
        InfolisConfig.java
        algorithm
        Algorithm.java
        BaseAlgorithm.java
        BestMatchLinker.java
        BibliographyExtractor.java
        Bootstrapping.java
        ComplexAlgorithm.java
        DoiExtractor.java
        DoiLinker.java
        FederatedSearcher.java
        FrequencyBasedBootstrapping.java
        GoldLinker.java
        IllegalAlgorithmArgumentException.java
        Indexer.java
        InfolisPatternSearcher.java
        KeywordTagger.java
        LearnPatternsAndCreateLinks.java
        LinkImporter.java
        LocalSearcher.java
        LuceneSearcher.java
        MetaDataExtractor.java
        MultiMatchesLinker.java
        OntologyLinker.java
        ReferenceLinker.java
        RegexSearcher.java
        ReliabilityBasedBootstrapping.java
        SearchDoisAndCreateLinks.java
        SearchPatternsAndCreateLinks.java
        SearchResultLinker.java
        SpringerImporter.java
        TagSearcher.java
        TextAndMetaDataExtractor.java
        TextExtractor.java
        Tokenizer.java
        TokenizerOpenNLP.java
        TokenizerStanford.java
        commandLine
        CommandLineExecuter.java
        datastore
        AbstractClient.java
        CentralClient.java
        CentralFileResolver.java
        CentralFileResolverTest.java
        DataStoreClient.java
        DataStoreClientFactory.java
        DataStoreStrategy.java
        FileResolver.java
        FileResolverFactory.java
        LocalClient.java
        LocalFileResolver.java
        OutputFileResolver.java
        TempFileResolver.java
        infolink
        annotations
        Annotation.java
        AnnotationHandler.java
        WebAnnoTsvHandler.java
        lucene
        ContextHighlighter.java
        patternLearner
        BootstrapLearner.java
        Reliability.java
        StandardPatternInducer.java
        querying
        DaraHTMLQueryService.java
        DaraSolrMatcher.java
        DaraSolrQueryService.java
        DaraWebMatcher.java
        DataciteQueryService.java
        QueryService.java
        SearchResultScorer.java
        tagger
        Tagger.java
        model
        BaseModel.java
        BootstrapStrategy.java
        Chunk.java
        EntityType.java
        ErrorResponse.java
        Execution.java
        ExecutionStatus.java
        ExtractionMethod.java
        MetaDataExtractingStrategy.java
        SearchQuery.java
        StudyType.java
        TaggedWord.java
        TextualReference.java
        entity
        Entity.java
        EntityLink.java
        InfolisFile.java
        InfolisPattern.java
        Keyword.java
        SearchResult.java
        scheduler
        ExecutionScheduler.java
        util
        EvaluationUtils.java
        InfolisFileUtils.java
        InformationExtractor.java
        LimitedTimeMatcher.java
        MathUtils.java
        RegexUtils.java
        SerializationUtils.java
        TextCleaningUtils.java
        URLParamEncoder.java
        ws
        server
        ExecutorWebservice.java
        InfolisApplication.java
        UploadWebservice.java
  - test
    - java
      - io
        github
        infolis
        InfolisBaseTest.java
        InfolisConfigTest.java
        algorithm
        BibliographyExtractorTest.java
        BootstrappingTest.java
        CitationMinerTest.java
        DoiExtractorTest.java
        DumpAlgo.java
        ExampleChecker.java
        FederatedSearcherTest.java
        FrequencyBasedBootstrappingTest.java
        GoldLinkerTest.java
        IndexerTest.java
        InfolisPatternSearcherTest.java
        KeywordTaggerTest.java
        LearnPatternsAndCreateLinksTest.java
        LinkImporterTest.java
        LuceneSearcherTest.java
        MetaDataExtractorTest.java
        OntologyLinkerTest.java
        ProgressUpdates.java
        ReferenceLinkerTest.java
        RegexSearcherTest.java
        ReliabilityBasedBootstrappingTest.java
        SearchDoisAndCreateLinksTest.java
        SearchPatternsAndCreateLinksTest.java
        SpringerImporterTest.java
        TagSearcherTest.java
        TextAndMetaDataExtractorTest.java
        TextExtractorTest.java
        TokenizerTest.java
        commandLine
        CommandLineExecuterTest.java
        datastore
        CentralClientTest.java
        LocalClientTest.java
        infolink
        annotations
        WebAnnoTsvHandlerTest.java
        patternLearner
        ReliabilityTest.java
        StandardPatternInducerTest.java
        querying
        DaraHTMLQueryServiceTest.java
        DaraSolrMatcherTest.java
        DaraSolrQueryServiceTest.java
        DataciteQueryServiceTest.java
        QueryServiceTest.java
        SearchResultScorerTest.java
        model
        ExecutionTest.java
        InfolisPatternTest.java
        TextualReferenceTest.java
        entity
        EntityLinkTest.java
        scheduler
        ExecutionSchedulerTest.java
        util
        InformationExtractorTest.java
        LimitedTimeMatcherTest.java
        MathUtilsTest.java
        RegexUtilsTest.java
        ws
        server
        ExecutorWebserviceTest.java

package io.github.infolis.infolink.querying;

import io.github.infolis.util.RegexUtils;
import io.github.infolis.util.URLParamEncoder;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.UnsupportedEncodingException;
import java.math.BigInteger;
//import java.net.URLEncoder;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.HashMap;
import java.util.Map;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Class for matching dataset reference strings to entries in the dara dataset
 * repository.
 *
 * Queries are sent to the repository's web search interface and the returned
 * HTML is parsed for dataset names and DOIs. Results can optionally be stored
 * in and served from a line-based query cache file, and dataset names can
 * optionally be looked up in a separate list of external dataset URLs.
 *
 * Progress and errors are reported on standard output / standard error.
 *
 * @author kata
 * @version 2014-01-27
 *
 */
public class DaraWebMatcher {

    /** Separator between the fields of one line of the query cache file. */
    private static final String CACHE_DELIMITER = "--@--";

    /** Character encoding used for the cache file, the URL list and the query response. */
    private static final String ENCODING = "UTF-8";

    /** Maximum number of hits requested from the search interface by {@link #match(String)}. */
    private static final int MAX_HITS = 600;

    String searchInterface;
    String queryCache;
    String externalDatasetURLs;

    /**
     * Class constructor specifying the repository's search interface base URL,
     * the query cache file and URL list for external datasets to use.
     *
     * If no query cache or URL list shall be used, set corresponding parameter
     * to null.
     *
     * If non-existing query cache is specified, one will be created as soon as
     * the first query result is written to it.
     *
     * @param searchInterface	the repository's search interface base URL
     * @param queryCache	path to the query cache (or null if no cache shall be
     * used)
     * @param externalDatasetURLs	path to the list of URLs for external datasets
     * (or null if no URL list shall be used)
     */
    DaraWebMatcher(String searchInterface, String queryCache, String externalDatasetURLs) {
        this.searchInterface = searchInterface;
        this.queryCache = queryCache;
        this.externalDatasetURLs = externalDatasetURLs;
    }

    /**
     * Returns the hex representation of the UTF-8 bytes of the input string
     * {@code arg}.
     *
     * The bytes are formatted as one unsigned number, so leading zero digits
     * are not printed and the empty string yields {@code "0"}.
     *
     * @param arg	string to be represented in hex value
     * @return	the hex representation of the input string {@code arg}
     * @throws UnsupportedEncodingException	if UTF-8 is not supported by the
     * runtime
     */
    public String toHex(String arg) throws UnsupportedEncodingException {
        return String.format("%x", new BigInteger(1, arg.getBytes(ENCODING)));
    }

    /**
     * Constructs a search URL from the base URL and the query.
     *
     * The search term is encoded with {@link URLParamEncoder}; if that fails,
     * the stack trace is printed and the unencoded term is used instead.
     *
     * @param searchTerm	the query term
     * @param maxNumber	the maximum number of hits to be displayed
     * @return	the search URL
     * @throws MalformedURLException	if the assembled string is not a valid URL
     */
    public URL constructURL(String searchTerm, int maxNumber) throws MalformedURLException {
        String titleParam;
        try {
            // URLEncoder transforms plain text into the application/x-www-form-urlencoded MIME format
            // as described in the HTML specification (GET-style URLs or POST forms)
            // does not work with the new dara search function
            //URLEncoder.encode(searchTerm, "UTF-8"),
            titleParam = URLParamEncoder.encode(searchTerm);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            // fall back to the raw term rather than giving up on the query
            titleParam = searchTerm;
        }
        return new URL(String.format("%s?title=%s&max=%s&lang=de",
                this.searchInterface,
                titleParam,
                String.valueOf(maxNumber)));
    }

    /**
     * Reads and returns the content from the query response page.
     *
     * The response bytes are decoded as UTF-8, which is the charset requested
     * via the {@code Accept-Charset} header. The response stream is always
     * closed, also when reading fails.
     *
     * NOTE(review): no connect or read timeout is configured, so this call can
     * block for as long as the server keeps the connection open.
     *
     * @param url	the URL to read from
     * @return	the contents of the page
     * @throws IOException	if the connection cannot be opened or read
     */
    public String readFromURL(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        connection.setRequestProperty("Accept-Charset", ENCODING);
        //connection.setConnectTimeout(6000);
        System.out.println("Reading from url...");
        // make sure that all data is read: collect every chunk until end of stream
        ByteArrayOutputStream collected = new ByteArrayOutputStream();
        InputStream in = connection.getInputStream();
        try {
            byte[] buff = new byte[1024];
            int k;
            while ((k = in.read(buff, 0, buff.length)) > -1) {
                collected.write(buff, 0, k);
            }
        } finally {
            in.close();
        }
        System.out.println(collected.size() + " bytes read.");
        String content = collected.toString(ENCODING);
        System.out.println("Done reading from url.");
        return content;
    }

    /**
     * Parses the HTML output of the dara search function and returns a map with
     * dataset DOIs (keys) and names (values).
     *
     * Every {@code li} element is treated as one hit. A hit is only added to
     * the result if both a non-empty name and a non-empty DOI were found in it.
     *
     * @param html	dara HTML output
     * @return	a map containing dataset DOIs (keys) and dataset names (values)
     */
    public Map<String, String> parseHTML(String html) {
        Map<String, String> matchingStudyMap = new HashMap<String, String>();
        Document doc = Jsoup.parseBodyFragment(html);
        Elements hitlist = doc.getElementsByTag("li");
        for (Element hit : hitlist) {
            String studyName = "";
            String studyDoi = "";
            //TODO: search for tag "a" first to limit elements to search by attribute value?
            // NOTE(review): the '?' in this pattern is a regex quantifier, not a literal
            // question mark - confirm whether "web_show\\?" was intended
            Elements names = hit.getElementsByAttributeValueMatching("href", "/dara/study/web_show?.*");
            Elements dois = hit.getElementsByAttributeValueContaining("href", "http://dx.doi.org");
            // each entry has exactly one name and one doi element
            //TODO: except for some datasets that are not registered but only referenced in dara!
            // e.g. "OECD Employment Outlook" -> no doi listed here -> ignored
            // if there are several candidates, the last one wins
            for (Element name : names) {
                studyName = name.text().trim();
            }
            for (Element doi : dois) {
                studyDoi = doi.text().trim();
            }
            // compare by content: a reference comparison against "" does not
            // reliably detect empty strings
            if (!studyName.isEmpty() && !studyDoi.isEmpty()) {
                System.out.println("name: " + studyName);
                System.out.println("doi: " + studyDoi);
                matchingStudyMap.put(studyDoi, studyName);
            }
        }
        return matchingStudyMap;
    }

    /**
     * Reads this queryCache to find DOIs and names of previously queried
     * dataset names.
     *
     * The first cache line containing {@code url} is used. Three outcomes are
     * distinguished:
     * <ul>
     * <li>query not in cache (or cache missing / unreadable): empty map</li>
     * <li>query in cache without any dataset: map containing only the entry
     * {@code "" -> ""}</li>
     * <li>query in cache with datasets: map of DOIs (keys) to names (values)</li>
     * </ul>
     *
     * @param url	the query to find the dataset entries
     * @return	a map containing dataset DOIs (key) and names (value)
     */
    public Map<String, String> readFromCache(String url) {
        Map<String, String> res = new HashMap<String, String>();
        if (this.queryCache == null) {
            return res;
        }
        File f = new File(this.queryCache);
        // a cache that has not been created yet simply contains no queries
        if (!f.isFile()) {
            return res;
        }
        try {
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), ENCODING));
            try {
                String text;
                while ((text = reader.readLine()) != null) {
                    // NOTE(review): substring match on the whole line; an exact
                    // comparison with the first field may be what is intended
                    if (!text.contains(url)) {
                        continue;
                    }
                    String[] data = text.split(CACHE_DELIMITER);
                    // query is in cache but no data can be found in dara - signal this with the "" key
                    if (data.length < 3) {
                        res.put("", "");
                        return res;
                    }
                    // each query has n dataset names with n dois
                    // data[0] = the query, therefore start at index 1
                    // stop before the last index because data[i+1] is accessed in each iteration
                    for (int i = 1; i < data.length - 1; i += 2) {
                        // every first entry of pair: study name
                        // every second entry of pair: study doi
                        res.put(data[i + 1], data[i]);
                    }
                    return res;
                }
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
            return new HashMap<String, String>();
        }
        return res;
    }

    /**
     * Searches for matching (= similar to {@code studyname}) dataset names
     * in this {@code externalDatasetURLs} file listing datasets along with
     * URLs to their landing pages.
     *
     * Each line of the list is expected to have the form
     * {@code name;url}. Lines without both fields or with a blank name are
     * skipped. The first line whose name is contained in {@code studyname}
     * determines the result.
     *
     * @param studyname	name of the dataset to be matched
     * @return	string representation of a URL pointing to the matching dataset
     * record, or null if no URL list is configured, nothing matches or the
     * list cannot be read
     */
    public String match_external(String studyname) {
        System.out.println(studyname);
        if (this.externalDatasetURLs == null || studyname == null) {
            return null;
        }
        try {
            File f = new File(this.externalDatasetURLs);
            BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(f), ENCODING));
            try {
                String text;
                while ((text = reader.readLine()) != null) {
                    String[] nameUrl = text.split(";");
                    // blank or malformed lines have no usable name/url pair; a blank
                    // name would also be "contained" in every study name
                    if (nameUrl.length < 2 || nameUrl[0].trim().isEmpty()) {
                        continue;
                    }
                    // studyname might contain additional info, e.g. year specifications
                    // therefore search for listed title inside of studyname instead of checking whether both are equal
                    if (studyname.contains(nameUrl[0])) {
                        return nameUrl[1];
                    }
                }
            } finally {
                reader.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
        return null;
    }

    /**
     * Splits {@code searchTerm} into several terms if enumeration markers
     * are present.
     *
     * The markers in {@code RegexUtils.enumeratorList} are tried in order and
     * the first one that splits the term into more than one part is used.
     *
     * @param searchTerm	string that might represent an enumeration of different
     * terms
     * @return	array of terms if enumeration markers are present or empty array
     * otherwise
     */
    String[] getEnumeratedTerms(String searchTerm) {
        for (String enumerator : RegexUtils.enumeratorList) {
            String[] newTerms = searchTerm.split(enumerator);
            if (newTerms.length > 1) {
                return newTerms;
            }
        }
        return new String[0];
    }

    /**
     * Matches the assumed dataset name {@code searchTerm} to records in
     * dara having a similar name. The computation of string similarity is done
     * by dara's search function. Querying dara is carried out using the dara
     * web interface to ensure accessibility from outside of GESIS. However,
     * this leads to high processing times both for querying and parsing the
     * results. If a query cache is configured for this instance, cached results
     * are used instead of querying and new results are appended to the cache.
     *
     * If the query yields nothing and {@code searchTerm} is an enumeration,
     * its parts are matched individually and the results are merged.
     *
     * @param searchTerm	assumed dataset name to be matched to dara records
     * @return	a map containing matching dataset DOIs (keys) and names (values);
     * empty if nothing matches or the query fails
     */
    public Map<String, String> match(String searchTerm) {
        URL url;
        try {
            url = constructURL(searchTerm, MAX_HITS);
            System.out.println(url);
            System.out.println("\n" + searchTerm);
        } catch (MalformedURLException e) {
            e.printStackTrace();
            return new HashMap<String, String>();
        }
        String cacheKey = url.toString();
        Map<String, String> res;
        // read file queryCache - use saved results instead of querying
        if (this.queryCache != null) {
            res = readFromCache(cacheKey);
            // query was found in cache
            if (!res.isEmpty()) {
                System.out.println("Found query in cache for term: " + searchTerm);
                // query was in cache but no data was specified i.e. study is not registered in dara
                if (res.containsKey("")) {
                    return new HashMap<String, String>();
                }
                System.out.println(res.toString());
                return res;
            }
        }
        // if no cache is used or entry is not found in the cache, query dara
        try {
            res = parseHTML(readFromURL(url));
        } catch (IOException ioe) {
            ioe.printStackTrace();
            return new HashMap<String, String>();
        }

        // if result is empty, check if studytitle maybe is an enumeration and search for parts!
        if (res.isEmpty()) {
            for (String term : getEnumeratedTerms(searchTerm)) {
                String part = term.trim();
                // ignore empty parts (they would query for an empty title)
                // and terms consisting of digits only
                if (part.isEmpty() || part.matches("\\d+\\s*")) {
                    continue;
                }
                res.putAll(match(part));
            }
        }
        // write results to cache
        // empty results in the cache are valuable too -> prevents repeated searching for non-registered studies
        if (this.queryCache != null) {
            writeToCache(cacheKey, res);
        }
        return res;
    }

    /**
     * Appends the results of a dara query as one line to this queryCache file.
     *
     * The line consists of the query url followed by a name and a DOI field
     * for every entry of {@code res}, all separated by the cache delimiter.
     * I/O errors are reported on standard error and otherwise ignored.
     *
     * @param url	the dara query url
     * @param res	the parsed dara response for the specified query url
     */
    private void writeToCache(String url, Map<String, String> res) {
        StringBuilder newLine = new StringBuilder().append(url);
        for (Map.Entry<String, String> entry : res.entrySet()) {
            // name first, doi second - readFromCache relies on this order
            newLine.append(CACHE_DELIMITER).append(entry.getValue())
                    .append(CACHE_DELIMITER).append(entry.getKey());
        }
        try {
            System.out.println("Writing query to cache: " + newLine);
            File f = new File(this.queryCache);
            // open in append mode so earlier cache entries are kept
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(f, true), ENCODING));
            try {
                out.write(newLine.toString());
                out.write(System.getProperty("line.separator"));
            } finally {
                out.close();
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /*
     * Disabled command-line entry point, kept for reference only. It refers to
     * the class as "StudyMatcher" (apparently an earlier name of this class)
     * and would have to be adapted before it can be re-enabled.
     *
     * Queries "http://www.da-ra.de/dara/study/web_search_show" for the
     * specified dataset name. Optionally uses cache file and external dataset
     * URL list if specified.
     *
     * @param args	args[0]: dataset name(s); args[1]: path of cache file or
     * null; args[2]: path of URL list or null
     *
     public static void main(String[] args)
     {
     if (args.length == 0) {
     System.out.println("Usage: StudyMatcher <datasetName> [<cacheFile>] [<urlListFile>]");
     System.out.println("	<datasetName>	dataset name");
     System.out.println("	<cacheFile>	path of cache file");
     System.out.println("	<urlListFile>	name of file containing URL list");
     System.exit(1);
     }
     if (args.length == 3) { StudyMatcher matcher = new StudyMatcher("http://www.da-ra.de/dara/study/web_search_show", args[1], args[2]); System.out.println(matcher.match(args[0]));}
     if (args.length == 2) { StudyMatcher matcher = new StudyMatcher("http://www.da-ra.de/dara/study/web_search_show", args[1], null); System.out.println(matcher.match(args[0]));}
     if (args.length == 1) { StudyMatcher matcher = new StudyMatcher("http://www.da-ra.de/dara/study/web_search_show", null, null); System.out.println(matcher.match(args[0]));}
     }*/
}