package cn.rongcloud.im.ui.widget.linkpreview;
import android.os.AsyncTask;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
public class TextCrawler {
/** Image quantity meaning "return every image found on the page". */
public static final int ALL = -1;
/** Image quantity meaning "do not collect any images". */
public static final int NONE = -2;
// Scheme prefixes a link must start with to be crawled. They are class-wide
// constants, so they are static rather than copied into every instance.
private static final String HTTP_PROTOCOL = "http://";
private static final String HTTPS_PROTOCOL = "https://";
// Listener notified before and after a crawl; replaced by every makePreview call.
private LinkPreviewCallback callback;
/** Creates a crawler without a callback; one is supplied on each makePreview call. */
public TextCrawler() {
    super();
}
/**
 * Starts a background crawl of the given text, requesting every image ({@link #ALL}).
 *
 * @param callback listener stored on this crawler and notified before and after the crawl
 * @param url      text passed to the background task, which searches it for a link
 */
public void makePreview(LinkPreviewCallback callback, String url) {
    this.callback = callback;
    final GetCode task = new GetCode(ALL);
    task.execute(url);
}
/**
 * Starts a background crawl of the given text with an explicit image limit.
 *
 * @param callback      listener stored on this crawler and notified before and after the crawl
 * @param url           text passed to the background task, which searches it for a link
 * @param imageQuantity {@link #ALL}, {@link #NONE}, or the number of images to keep
 */
public void makePreview(LinkPreviewCallback callback, String url, int imageQuantity) {
    this.callback = callback;
    final GetCode task = new GetCode(imageQuantity);
    task.execute(url);
}
/**
 * Background task that takes the first link found in the input text, downloads the page behind
 * it and fills a {@link SourceContent} with the title, description and images it finds.
 * <p>
 * When the enclosing crawler's callback is non-null it receives {@code onPre()} before the work
 * starts and {@code onPos(...)} after it has finished.
 */
public class GetCode extends AsyncTask<String, Void, Void> {
// Result object populated by doInBackground and handed to the callback in onPostExecute.
private SourceContent sourceContent = new SourceContent();
// Image limit: ALL, NONE, or the count forwarded to getImages.
private int imageQuantity;
// Links extracted from the input text; only the first entry is used.
private ArrayList<String> urls;
/**
 * @param imageQuantity ALL, NONE, or the number of images to keep
 */
public GetCode(int imageQuantity) {
this.imageQuantity = imageQuantity;
}
/** Notifies the callback, if any, that a crawl is about to start. */
@Override
protected void onPreExecute() {
if (callback != null) {
callback.onPre();
}
super.onPreExecute();
}
/** Hands the collected content and the failure flag from isNull() to the callback, if any. */
@Override
protected void onPostExecute(Void result) {
if (callback != null) {
callback.onPos(sourceContent, isNull());
}
super.onPostExecute(result);
}
/**
 * Resolves the first link in {@code params[0]} and populates {@code sourceContent}.
 *
 * @param params only the first element is read; it is the text to search for links
 * @return always {@code null}; results are delivered through {@code sourceContent}
 */
@Override
protected Void doInBackground(String... params) {
// Links that do not start with http:// or https:// resolve to an empty final URL
// (see unshortenUrl), which skips the download below.
urls = SearchUrls.matches(params[0]);
if (urls.size() > 0)
sourceContent
.setFinalUrl(unshortenUrl(extendedTrim(urls.get(0))));
else
sourceContent.setFinalUrl("");
if (!sourceContent.getFinalUrl().equals("")) {
// A direct image link becomes its own single preview image with empty title and
// description. URLs containing "dropbox" are excluded from this shortcut and are
// fetched as pages instead (reason not visible here).
if (isImage(sourceContent.getFinalUrl())
&& !sourceContent.getFinalUrl().contains("dropbox")) {
sourceContent.setSuccess(true);
sourceContent.getImages().add(sourceContent.getFinalUrl());
sourceContent.setTitle("");
sourceContent.setDescription("");
} else {
try {
Document doc = Jsoup
.connect(sourceContent.getFinalUrl())
.userAgent("Mozilla").get();
sourceContent.setHtmlCode(extendedTrim(doc.toString()));
HashMap<String, String> metaTags = getMetaTags(sourceContent
.getHtmlCode());
sourceContent.setMetaTags(metaTags);
// Meta tags are the preferred source for title and description.
sourceContent.setTitle(metaTags.get("title"));
sourceContent.setDescription(metaTags
.get("description"));
// No meta title: fall back to group 2 of Regex.TITLE_PATTERN matched on the HTML.
if (sourceContent.getTitle().equals("")) {
String matchTitle = Regex.pregMatch(
sourceContent.getHtmlCode(),
Regex.TITLE_PATTERN, 2);
if (!matchTitle.equals(""))
sourceContent.setTitle(htmlDecode(matchTitle));
}
// No meta description: fall back to text crawled from the page body.
if (sourceContent.getDescription().equals(""))
sourceContent
.setDescription(crawlCode(sourceContent
.getHtmlCode()));
// Remove whatever Regex.SCRIPT_PATTERN matches from the description.
sourceContent.setDescription(sourceContent
.getDescription().replaceAll(
Regex.SCRIPT_PATTERN, ""));
// The meta image wins; otherwise collect <img> sources from the document.
if (imageQuantity != NONE) {
if (!metaTags.get("image").equals(""))
sourceContent.getImages().add(
metaTags.get("image"));
else {
sourceContent.setImages(getImages(doc,
imageQuantity));
}
}
sourceContent.setSuccess(true);
} catch (Exception e) {
// Any failure while fetching or parsing marks the result as unsuccessful;
// the exception itself is discarded.
sourceContent.setSuccess(false);
}
}
}
// The display URL is the part of the final URL before the first '&'.
String[] finalLinkSet = sourceContent.getFinalUrl().split("&");
sourceContent.setUrl(finalLinkSet[0]);
sourceContent.setCannonicalUrl(cannonicalPage(sourceContent
.getFinalUrl()));
// Runs on every path, including the failure path, so the description is always tag-free.
sourceContent.setDescription(stripTags(sourceContent
.getDescription()));
return null;
}
/**
 * Verifies if the content could not be retrieved.
 *
 * @return {@code true} only when the crawl was not marked successful, the stored HTML is
 *         blank, and the final URL is not an image
 */
public boolean isNull() {
return !sourceContent.isSuccess() &&
extendedTrim(sourceContent.getHtmlCode()).equals("") &&
!isImage(sourceContent.getFinalUrl());
}
}
/**
 * Gets text content from occurrences of an html tag.
 * <p>
 * The first occurrence whose tag-stripped text is at least 120 characters long is used; when
 * there is none, the first occurrence of the tag is used as-is.
 *
 * @param tag     tag name without angle brackets, e.g. {@code "p"}
 * @param content html code to search
 * @return the decoded, whitespace-normalised text, or an empty string when nothing matched
 */
private String getTagContent(String tag, String content) {
    String pattern = "<" + tag + "(.*?)>(.*?)</" + tag + ">";
    String result = "";
    for (String match : Regex.pregMatchAll(content, pattern, 2)) {
        String text = stripTags(match);
        if (text.length() >= 120) {
            result = extendedTrim(text);
            break;
        }
    }
    if (result.equals("")) {
        result = extendedTrim(Regex.pregMatch(content, pattern, 2));
    }
    // Normalise non-breaking spaces to ordinary ones. The previous code removed every
    // space here, which glued all words of the description together.
    result = result.replace("&nbsp;", " ").replace('\u00A0', ' ');
    return htmlDecode(result);
}
/**
 * Gets image urls from the html code.
 *
 * @param document      parsed page to scan for {@code <img>} elements carrying a src attribute
 * @param imageQuantity {@link #ALL} for every image, otherwise the maximum number to return;
 *                      other negative values yield an empty list
 * @return a new list of absolute image urls in document order, never longer than the limit
 */
public List<String> getImages(Document document, int imageQuantity) {
    List<String> matches = new ArrayList<String>();
    for (Element srcElement : document.select("[src]")) {
        if (srcElement.tagName().equals("img")) {
            matches.add(srcElement.attr("abs:src"));
        }
    }
    if (imageQuantity == ALL) {
        return matches;
    }
    // Clamp the limit: subList throws when asked for more elements than exist
    // (a page with fewer images than requested) or for a negative count.
    int limit = Math.max(0, Math.min(imageQuantity, matches.size()));
    // Copy so callers get an independent list rather than a view of the full one.
    return new ArrayList<String>(matches.subList(0, limit));
}
/**
 * Transforms html into a plain string by parsing it with Jsoup and returning its text.
 *
 * @param content html fragment to decode
 * @return the text of the parsed fragment
 */
private String htmlDecode(String content) {
    final Document parsed = Jsoup.parse(content);
    return parsed.text();
}
/**
 * Crawls the html code looking for a description candidate.
 * <p>
 * Text is extracted from span, p and div tags. The div text is chosen only when the paragraph
 * text is longer than the span text but shorter than the div text; in every other case the
 * paragraph text is chosen.
 * NOTE(review): the span text only takes part in the comparison and is never returned —
 * confirm this is intended.
 *
 * @param content html code of the page
 * @return the decoded text of the chosen candidate
 */
private String crawlCode(String content) {
    final String spanText = getTagContent("span", content);
    final String paragraphText = getTagContent("p", content);
    final String divText = getTagContent("div", content);

    final int paragraphLength = paragraphText.length();
    final boolean divWins = paragraphLength > spanText.length()
            && paragraphLength < divText.length();

    return htmlDecode(divWins ? divText : paragraphText);
}
/**
 * Returns the canonical url: the input without its http:// or https:// prefix, cut at the
 * first '/'.
 *
 * @param url address to reduce
 * @return everything between the optional scheme prefix and the first '/', or the whole
 *         remainder when there is no '/'
 */
private String cannonicalPage(String url) {
    if (url.startsWith(HTTP_PROTOCOL)) {
        url = url.substring(HTTP_PROTOCOL.length());
    } else if (url.startsWith(HTTPS_PROTOCOL)) {
        url = url.substring(HTTPS_PROTOCOL.length());
    }
    // A single indexOf/substring replaces the character-by-character string concatenation.
    int firstSlash = url.indexOf('/');
    return firstSlash < 0 ? url : url.substring(0, firstSlash);
}
/**
 * Strips the tags from a piece of html by parsing it with Jsoup and keeping only its text.
 *
 * @param content html fragment
 * @return the text of the parsed fragment
 */
private String stripTags(String content) {
    final Document parsed = Jsoup.parse(content);
    final String textOnly = parsed.text();
    return textOnly;
}
/**
 * Verifies if the url is an image.
 *
 * @param url address to test
 * @return {@code true} when the whole url matches {@code Regex.IMAGE_PATTERN}
 */
private boolean isImage(String url) {
    final boolean matchesImagePattern = url.matches(Regex.IMAGE_PATTERN);
    return matchesImagePattern;
}
/**
 * Returns meta tags from html code.
 * <p>
 * The result always contains the keys "url", "title", "description" and "image", each
 * defaulting to an empty string. A tag is assigned to the first key it declares, checked in
 * that order, and a later non-empty value for the same key replaces an earlier one.
 *
 * @param content html code of the page
 * @return map from the four keys above to their decoded content
 */
private HashMap<String, String> getMetaTags(String content) {
    final String[] keys = {"url", "title", "description", "image"};
    HashMap<String, String> metaTags = new HashMap<String, String>();
    for (String key : keys) {
        metaTags.put(key, "");
    }
    List<String> matches = Regex.pregMatchAll(content,
            Regex.METATAG_PATTERN, 1);
    for (String match : matches) {
        // Locale.ROOT keeps lower-casing independent of the device language; with the default
        // locale an upper-case 'I' becomes a dotless 'ı' in Turkish and the keys never match.
        final String lowerCase = match.toLowerCase(Locale.ROOT);
        for (String key : keys) {
            if (declaresMetaKey(lowerCase, key)) {
                updateMetaTag(metaTags, key, separeMetaTagsContent(match));
                break;
            }
        }
    }
    return metaTags;
}
/** Tells whether a lower-cased meta tag names the key via property="og:key" or name="key". */
private static boolean declaresMetaKey(String lowerCaseTag, String key) {
    return lowerCaseTag.contains("property=\"og:" + key + "\"")
            || lowerCaseTag.contains("property='og:" + key + "'")
            || lowerCaseTag.contains("name=\"" + key + "\"")
            || lowerCaseTag.contains("name='" + key + "'");
}
/**
 * Stores a meta tag value, ignoring null or empty values so an existing entry is kept.
 *
 * @param metaTags map to update
 * @param url      key to store the value under (any meta key, not only "url")
 * @param value    candidate value
 */
private void updateMetaTag(HashMap<String, String> metaTags, String url, String value) {
    if (value == null || value.isEmpty()) {
        return;
    }
    metaTags.put(url, value);
}
/**
 * Gets the content of a meta tag.
 *
 * @param content source text of a single meta tag
 * @return group 1 of {@code Regex.METATAG_CONTENT_PATTERN} matched on the tag, html-decoded
 */
private String separeMetaTagsContent(String content) {
    return htmlDecode(Regex.pregMatch(content, Regex.METATAG_CONTENT_PATTERN, 1));
}
/**
 * Unshortens a short url by repeatedly asking the server where it leads until the address
 * stops changing.
 * <p>
 * The number of rounds is bounded, so a chain that never settles cannot hang the caller, and
 * a connection that cannot be opened ends the resolution instead of throwing.
 *
 * @param shortURL address to resolve
 * @return an empty string when the address does not start with http:// or https://,
 *         otherwise the last address that could be resolved (the input itself when nothing
 *         could be resolved)
 */
private String unshortenUrl(String shortURL) {
    if (!shortURL.startsWith(HTTP_PROTOCOL)
            && !shortURL.startsWith(HTTPS_PROTOCOL)) {
        return "";
    }
    final int maxRounds = 5;
    String current = shortURL;
    for (int round = 0; round < maxRounds; round++) {
        String resolved = resolveOnce(current);
        if (resolved == null || resolved.equals(current)) {
            break; // could not connect, or the address is stable
        }
        current = resolved;
    }
    return current;
}
/** Opens the url once and returns the address the connection reports, or null on failure. */
private String resolveOnce(String url) {
    URLConnection connection = connectURL(url);
    if (connection == null) {
        return null;
    }
    try {
        // Fetching the headers forces the request; getURL() is then expected to report the
        // address reached after redirects — TODO confirm on the target platform.
        connection.getHeaderFields();
        return connection.getURL().toString();
    } finally {
        if (connection instanceof HttpURLConnection) {
            ((HttpURLConnection) connection).disconnect();
        }
    }
}
/**
 * Takes a url string and returns a connection object for that address.
 * <p>
 * Failures are reported on standard output rather than thrown.
 *
 * @param strURL address to open
 * @return the opened {@link URLConnection}, or {@code null} when the address is malformed or
 *         the connection cannot be created
 */
private URLConnection connectURL(String strURL) {
    try {
        return new URL(strURL).openConnection();
    } catch (MalformedURLException e) {
        System.out.println("Please input a valid URL");
    } catch (IOException ioe) {
        System.out.println("Can not connect to the URL");
    }
    return null;
}
/**
 * Removes extra spaces and trims the string.
 * <p>
 * Every run of whitespace (spaces, tabs, line breaks) collapses into a single space.
 *
 * @param content text to normalise; {@code null} is treated as empty
 * @return the normalised text, never {@code null}
 */
public static String extendedTrim(String content) {
    if (content == null) {
        return "";
    }
    // "\\s+" already covers '\n' and '\r', so no separate replacements are needed.
    return content.replaceAll("\\s+", " ").trim();
}
}