// TextCrawler.java example
//
// Explorer
// sealtalk-android-master
package cn.rongcloud.im.ui.widget.linkpreview;

import android.os.AsyncTask;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;

/**
 * Builds link previews: finds the first URL in a text, resolves it to its
 * final address, downloads the page with Jsoup and extracts a title, a
 * description and image URLs into a {@link SourceContent}, which is then
 * handed to a {@link LinkPreviewCallback}.
 */
public class TextCrawler {

    /** Image quantity meaning "collect every image found on the page". */
    public static final int ALL = -1;
    /** Image quantity meaning "do not collect images from the crawled page". */
    public static final int NONE = -2;

    private static final String HTTP_PROTOCOL = "http://";
    private static final String HTTPS_PROTOCOL = "https://";

    /** Upper bound on redirect-resolution rounds performed by unshortenUrl. */
    private static final int MAX_UNSHORTEN_HOPS = 10;

    /** A tag body at least this long is accepted as description right away. */
    private static final int MIN_DESCRIPTION_LENGTH = 120;

    /** Meta keys that are extracted, in the order they are tested per tag. */
    private static final String[] META_KEYS = {
        "url", "title", "description", "image"
    };

    /**
     * Receiver of the preview events. Overwritten by every makePreview call
     * and read by every running task, so it always refers to the callback of
     * the most recent request.
     */
    private LinkPreviewCallback callback;

    public TextCrawler() {
    }

    /**
     * Starts building a preview for the first URL found in {@code url},
     * collecting all images of the page.
     *
     * @param callback receiver of the start and result events; may be null
     * @param url      text containing the link to preview
     */
    public void makePreview(LinkPreviewCallback callback, String url) {
        makePreview(callback, url, ALL);
    }

    /**
     * Starts building a preview for the first URL found in {@code url}.
     *
     * @param callback      receiver of the start and result events; may be null
     * @param url           text containing the link to preview
     * @param imageQuantity maximum number of page images to collect, or
     *                      {@link #ALL} / {@link #NONE}
     */
    public void makePreview(LinkPreviewCallback callback, String url,
                            int imageQuantity) {
        this.callback = callback;
        new GetCode(imageQuantity).execute(url);
    }

    /**
     * Task that resolves the first URL of the input text and fills a
     * {@link SourceContent} with the data needed for the preview.
     */
    public class GetCode extends AsyncTask<String, Void, Void> {

        private SourceContent sourceContent = new SourceContent();
        private int imageQuantity;
        private ArrayList<String> urls;

        /**
         * @param imageQuantity maximum number of page images to collect, or
         *                      {@link TextCrawler#ALL} / {@link TextCrawler#NONE}
         */
        public GetCode(int imageQuantity) {
            this.imageQuantity = imageQuantity;
        }

        /** Notifies the callback, if any, that the crawl is about to start. */
        @Override
        protected void onPreExecute() {
            if (callback != null) {
                callback.onPre();
            }
            super.onPreExecute();
        }

        /** Delivers the collected content and the "nothing retrieved" flag. */
        @Override
        protected void onPostExecute(Void result) {
            if (callback != null) {
                callback.onPos(sourceContent, isNull());
            }
            super.onPostExecute(result);
        }

        /**
         * Resolves the first URL found in {@code params[0]} and extracts the
         * preview data from it. Failures while downloading or parsing the
         * page are reported through {@code sourceContent.setSuccess(false)}.
         *
         * @param params single-element array holding the text to scan
         * @return always null; the result is kept in the task's SourceContent
         */
        @Override
        protected Void doInBackground(String... params) {
            // Don't forget the http:// or https://
            urls = SearchUrls.matches(params[0]);

            if (urls.size() > 0) {
                sourceContent.setFinalUrl(
                    unshortenUrl(extendedTrim(urls.get(0))));
            } else {
                sourceContent.setFinalUrl("");
            }

            String finalUrl = sourceContent.getFinalUrl();
            if (!finalUrl.equals("")) {
                // Links containing "dropbox" are crawled as pages even when
                // they look like image URLs.
                if (isImage(finalUrl) && !finalUrl.contains("dropbox")) {
                    // Direct image link: the link itself is the only image.
                    sourceContent.setSuccess(true);
                    sourceContent.getImages().add(finalUrl);
                    sourceContent.setTitle("");
                    sourceContent.setDescription("");
                } else {
                    try {
                        Document doc = Jsoup.connect(finalUrl)
                                       .userAgent("Mozilla").get();

                        sourceContent.setHtmlCode(extendedTrim(doc.toString()));
                        String htmlCode = sourceContent.getHtmlCode();

                        HashMap<String, String> metaTags = getMetaTags(htmlCode);
                        sourceContent.setMetaTags(metaTags);

                        sourceContent.setTitle(metaTags.get("title"));
                        sourceContent.setDescription(metaTags.get("description"));

                        // No meta title: fall back to the <title> element.
                        if (sourceContent.getTitle().equals("")) {
                            String matchTitle = Regex.pregMatch(htmlCode,
                                                Regex.TITLE_PATTERN, 2);
                            if (!matchTitle.equals("")) {
                                sourceContent.setTitle(htmlDecode(matchTitle));
                            }
                        }

                        // No meta description: derive one from the page body.
                        if (sourceContent.getDescription().equals("")) {
                            sourceContent.setDescription(crawlCode(htmlCode));
                        }

                        sourceContent.setDescription(
                            sourceContent.getDescription().replaceAll(
                                Regex.SCRIPT_PATTERN, ""));

                        if (imageQuantity != NONE) {
                            String metaImage = metaTags.get("image");
                            if (!metaImage.equals("")) {
                                sourceContent.getImages().add(metaImage);
                            } else {
                                sourceContent.setImages(
                                    getImages(doc, imageQuantity));
                            }
                        }

                        sourceContent.setSuccess(true);
                    } catch (Exception e) {
                        // Best effort: any download/parse failure only marks
                        // the preview as unsuccessful.
                        sourceContent.setSuccess(false);
                    }
                }
            }

            // The short url is everything before the first '&'.
            String[] finalLinkSet = sourceContent.getFinalUrl().split("&");
            sourceContent.setUrl(finalLinkSet[0]);

            sourceContent.setCannonicalUrl(
                cannonicalPage(sourceContent.getFinalUrl()));
            sourceContent.setDescription(
                stripTags(sourceContent.getDescription()));

            return null;
        }

        /**
         * Verifies if the content could not be retrieved.
         *
         * @return true when the crawl failed, no html was stored and the
         *         final url is not an image link
         */
        public boolean isNull() {
            return !sourceContent.isSuccess()
                   && extendedTrim(sourceContent.getHtmlCode()).equals("")
                   && !isImage(sourceContent.getFinalUrl());
        }

    }

    /**
     * Gets content from a html tag.
     *
     * Returns the first body of {@code tag} whose tag-stripped text has at
     * least {@link #MIN_DESCRIPTION_LENGTH} characters; when there is none,
     * falls back to the first body of that tag.
     *
     * @param tag     tag name, e.g. "p"
     * @param content html code to search
     * @return the decoded text, possibly empty
     */
    private String getTagContent(String tag, String content) {
        String pattern = "<" + tag + "(.*?)>(.*?)</" + tag + ">";
        String result = "";

        List<String> matches = Regex.pregMatchAll(content, pattern, 2);
        for (String match : matches) {
            String currentMatch = stripTags(match);
            if (currentMatch.length() >= MIN_DESCRIPTION_LENGTH) {
                result = extendedTrim(currentMatch);
                break;
            }
        }

        if (result.equals("")) {
            String matchFinal = Regex.pregMatch(content, pattern, 2);
            result = extendedTrim(matchFinal);
        }

        // Drop non-breaking spaces (the &nbsp; entity or a literal U+00A0)
        // only; ordinary spaces must survive or the words run together.
        result = result.replace("&nbsp;", "").replace("\u00A0", "");

        return htmlDecode(result);
    }

    /**
     * Gets images from the html code.
     *
     * @param document      parsed page
     * @param imageQuantity maximum number of images to return, or
     *                      {@link #ALL} for every image; other negative
     *                      values yield an empty list
     * @return absolute {@code src} urls of the {@code img} elements, in
     *         document order; never more than the page actually contains
     */
    public List<String> getImages(Document document, int imageQuantity) {
        List<String> matches = new ArrayList<String>();

        Elements media = document.select("[src]");
        for (Element srcElement : media) {
            if (srcElement.tagName().equals("img")) {
                matches.add(srcElement.attr("abs:src"));
            }
        }

        if (imageQuantity == ALL) {
            return matches;
        }

        // Clamp so that a page with fewer images than requested (or a
        // negative quantity) cannot make subList throw.
        int limit = Math.max(0, Math.min(imageQuantity, matches.size()));
        return new ArrayList<String>(matches.subList(0, limit));
    }

    /** Transforms from html to normal string. */
    private String htmlDecode(String content) {
        return Jsoup.parse(content).text();
    }

    /**
     * Crawls the code looking for relevant information.
     *
     * Picks the div text when the paragraph text is longer than the span
     * text but shorter than the div text; otherwise picks the paragraph text.
     *
     * NOTE(review): the span text only acts as a threshold and is never
     * returned, even when it is the longest candidate. Kept as-is; confirm
     * whether that is intended.
     *
     * @param content html code of the page
     * @return the decoded description candidate, possibly empty
     */
    private String crawlCode(String content) {
        String resultSpan = getTagContent("span", content);
        String resultParagraph = getTagContent("p", content);
        String resultDiv = getTagContent("div", content);

        String result;
        if (resultParagraph.length() > resultSpan.length()
                && resultParagraph.length() < resultDiv.length()) {
            result = resultDiv;
        } else {
            result = resultParagraph;
        }

        return htmlDecode(result);
    }

    /**
     * Returns the cannonical url: the given url without a leading
     * "http://" or "https://" and cut at the first '/'.
     */
    private String cannonicalPage(String url) {
        String host = url;
        if (host.startsWith(HTTP_PROTOCOL)) {
            host = host.substring(HTTP_PROTOCOL.length());
        } else if (host.startsWith(HTTPS_PROTOCOL)) {
            host = host.substring(HTTPS_PROTOCOL.length());
        }

        int firstSlash = host.indexOf('/');
        return firstSlash < 0 ? host : host.substring(0, firstSlash);
    }

    /** Strips the tags from an element. */
    private String stripTags(String content) {
        return Jsoup.parse(content).text();
    }

    /** Verifies if the url is an image. */
    private boolean isImage(String url) {
        return url.matches(Regex.IMAGE_PATTERN);
    }

    /**
     * Returns meta tags from html code.
     *
     * @param content html code of the page
     * @return map with the keys "url", "title", "description" and "image";
     *         a key maps to "" when no non-empty value was found for it
     */
    private HashMap<String, String> getMetaTags(String content) {
        HashMap<String, String> metaTags = new HashMap<String, String>();
        for (String key : META_KEYS) {
            metaTags.put(key, "");
        }

        List<String> matches = Regex.pregMatchAll(content,
                               Regex.METATAG_PATTERN, 1);

        for (String match : matches) {
            // Locale.ROOT keeps the comparison independent of the device
            // locale (e.g. Turkish lower-casing of 'I').
            String lowerCase = match.toLowerCase(Locale.ROOT);
            for (String key : META_KEYS) {
                if (declaresMeta(lowerCase, key)) {
                    updateMetaTag(metaTags, key, separeMetaTagsContent(match));
                    break; // a tag is attributed to the first key it matches
                }
            }
        }

        return metaTags;
    }

    /**
     * Tells whether a lower-cased meta tag declares {@code key} either as
     * {@code property="og:key"} or as {@code name="key"}, with single or
     * double quotes.
     */
    private boolean declaresMeta(String lowerCaseTag, String key) {
        return lowerCaseTag.contains("property=\"og:" + key + "\"")
               || lowerCaseTag.contains("property='og:" + key + "'")
               || lowerCaseTag.contains("name=\"" + key + "\"")
               || lowerCaseTag.contains("name='" + key + "'");
    }

    /** Stores {@code value} under {@code key} unless it is null or empty. */
    private void updateMetaTag(HashMap<String, String> metaTags, String key, String value) {
        if (value != null && (value.length() > 0)) {
            metaTags.put(key, value);
        }
    }

    /** Gets content from metatag. */
    private String separeMetaTagsContent(String content) {
        String result = Regex.pregMatch(content, Regex.METATAG_CONTENT_PATTERN,
                                        1);
        return htmlDecode(result);
    }

    /**
     * Unshortens a short url.
     *
     * Requests the url repeatedly until the address reported by the
     * connection stops changing, giving up after
     * {@link #MAX_UNSHORTEN_HOPS} rounds so an unstable redirect chain
     * cannot loop forever.
     *
     * @param shortURL url to resolve
     * @return "" when the url does not start with http:// or https://;
     *         otherwise the last address that could be resolved (the input
     *         itself when no connection could be created)
     */
    private String unshortenUrl(String shortURL) {
        if (!shortURL.startsWith(HTTP_PROTOCOL)
                && !shortURL.startsWith(HTTPS_PROTOCOL)) {
            return "";
        }

        String current = shortURL;
        for (int hop = 0; hop < MAX_UNSHORTEN_HOPS; hop++) {
            String resolved = resolveOnce(current);
            if (resolved == null || resolved.equals(current)) {
                break;
            }
            current = resolved;
        }

        return current;
    }

    /**
     * Performs one request and returns the url the connection reports
     * afterwards, or null when no connection could be created.
     */
    private String resolveOnce(String url) {
        URLConnection urlConn = connectURL(url);
        if (urlConn == null) {
            return null;
        }
        try {
            // Reading the headers forces the request; the connection's url
            // is then expected to reflect any redirect that was followed.
            urlConn.getHeaderFields();
            return urlConn.getURL().toString();
        } finally {
            if (urlConn instanceof HttpURLConnection) {
                ((HttpURLConnection) urlConn).disconnect();
            }
        }
    }

    /**
     * Takes a url and returns a URLConnection for that address.
     *
     * @param strURL address to connect to
     * @return the connection, or null when the url is malformed or the
     *         connection cannot be created
     */
    private URLConnection connectURL(String strURL) {
        URLConnection conn = null;
        try {
            URL inputURL = new URL(strURL);
            conn = inputURL.openConnection();
        } catch (MalformedURLException e) {
            System.out.println("Please input a valid URL");
        } catch (IOException ioe) {
            System.out.println("Can not connect to the URL");
        }
        return conn;
    }

    /**
     * Removes extra spaces and trim the string.
     *
     * @param content text to normalise; may be null
     * @return the text with every whitespace run (line breaks included)
     *         collapsed to a single space and trimmed; "" for null
     */
    public static String extendedTrim(String content) {
        if (content == null) {
            return "";
        }
        return content.replaceAll("\\s+", " ").trim();
    }

}