package ml.puredark.hviewer.core;
import android.text.TextUtils;
import com.google.gson.Gson;
import com.google.gson.JsonArray;
import com.google.gson.JsonElement;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import com.google.gson.JsonPrimitive;
import com.jayway.jsonpath.JsonPath;
import com.jayway.jsonpath.ReadContext;
import org.jsoup.Jsoup;
import org.jsoup.helper.StringUtil;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import ml.puredark.hviewer.beans.Collection;
import ml.puredark.hviewer.beans.Comment;
import ml.puredark.hviewer.beans.Picture;
import ml.puredark.hviewer.beans.Rule;
import ml.puredark.hviewer.beans.Selector;
import ml.puredark.hviewer.beans.Tag;
import ml.puredark.hviewer.beans.Video;
import ml.puredark.hviewer.helpers.Logger;
import ml.puredark.hviewer.utils.MathUtil;
import ml.puredark.hviewer.utils.RegexValidateUtil;
import ml.puredark.hviewer.utils.StringEscapeUtils;
import static java.util.regex.Pattern.DOTALL;
/**
* Created by PureDark on 2016/8/9.
*/
public class RuleParser {
/** Flat placeholder {@code {name:value}}; neither part may contain a brace. */
private static final Pattern URL_PARAM_PATTERN =
        Pattern.compile("\\{([^{}]*?):([^{}]*?)\\}", Pattern.DOTALL);
/** Placeholder whose value wraps exactly one nested placeholder, e.g. {@code {pageStr:&p={page:1}}}. */
private static final Pattern URL_NESTED_PARAM_PATTERN =
        Pattern.compile("\\{([^{}]*?):([^{}]*?\\{[^{}]*?\\}[^{}]*?)\\}", Pattern.DOTALL);
/** The {@code {json:...}} placeholder; greedy, so the value runs up to the last closing brace of the URL. */
private static final Pattern URL_JSON_PARAM_PATTERN =
        Pattern.compile("\\{(json):(.*)\\}", Pattern.DOTALL);

/**
 * Extracts every {@code {name:value}} placeholder from a rule URL template.
 * <p>
 * Three passes are made over the template, in this order: flat placeholders, placeholders
 * that contain one nested placeholder, and finally the {@code json} placeholder. A later pass
 * overwrites the value stored by an earlier one when both produce the same name.
 *
 * @param url the URL template; may be {@code null} or empty
 * @return a mutable map from placeholder name to raw placeholder value; empty when
 *         {@code url} is {@code null} or empty, never {@code null}
 */
public static Map<String, String> parseUrl(String url) {
    Map<String, String> map = new HashMap<>();
    if (url == null || url.isEmpty())
        return map;
    collectPlaceholders(URL_PARAM_PATTERN, url, map);
    collectPlaceholders(URL_NESTED_PARAM_PATTERN, url, map);
    collectPlaceholders(URL_JSON_PARAM_PATTERN, url, map);
    return map;
}

/** Stores group 1 (name) and group 2 (value) of every match of {@code pattern} in {@code into}. */
private static void collectPlaceholders(Pattern pattern, String url, Map<String, String> into) {
    Matcher matcher = pattern.matcher(url);
    while (matcher.find()) {
        into.put(matcher.group(1), matcher.group(2));
    }
}
/**
 * Expands the placeholders of a URL template into a request URL.
 * <p>
 * Convenience overload: delegates to the six-argument {@code parseUrl} with
 * {@code getJsonParams} set to {@code false}, so the URL itself (not its {@code json}
 * placeholder) is expanded.
 *
 * @param url     the URL template
 * @param page    the requested page number
 * @param idCode  value substituted for the {@code idCode} placeholder
 * @param keyword value substituted for the {@code keyword} placeholder
 * @param objs    already loaded items, consulted for {@code minid}/{@code maxid} paging
 * @return whatever the six-argument overload returns for these arguments
 */
public static String parseUrl(String url, int page, String idCode, String keyword, Object[] objs){
return parseUrl(url, page, idCode, keyword, objs, false);
}
/**
 * Expands the placeholders of a rule URL template into a concrete request string.
 * <p>
 * Placeholders are discovered with {@link #parseUrl(String)} and handled as follows:
 * <ul>
 * <li>{@code {page:start}} or {@code {page:start:step}} - replaced by
 *     {@code start + (page - start) * step}; {@code page} is first raised to {@code start}
 *     when it is smaller.</li>
 * <li>{@code {page:minid}} / {@code {page:maxid}} - when {@code objs} is not {@code null},
 *     {@code page} is replaced by the smallest / largest numeric id found in {@code objs}.</li>
 * <li>{@code {pageStr:...}} - removed when the computed page equals the start page,
 *     otherwise replaced by its own content (whose inner {@code page} placeholder is then
 *     expanded).</li>
 * <li>{@code {keyword:...}} and {@code {idCode:}} - replaced by {@code keyword} and
 *     {@code idCode}; a {@code null} argument is substituted as an empty string.</li>
 * <li>{@code {date:pattern}} / {@code {date:pattern:offsetDays}} and
 *     {@code {time:pattern}} / {@code {time:pattern:offsetSeconds}} - replaced by the current
 *     time formatted with {@link SimpleDateFormat}, optionally shifted by the offset.</li>
 * </ul>
 * All substituted values are inserted literally: {@code $} and {@code \} in them are not
 * interpreted as regex replacement syntax.
 *
 * @param url           the URL template
 * @param page          the requested page number
 * @param idCode        value for the {@code idCode} placeholder, may be {@code null}
 * @param keyword       value for the {@code keyword} placeholder, may be {@code null}
 * @param objs          already loaded items used for {@code minid}/{@code maxid}, may be {@code null}
 * @param getJsonParams when {@code true}, the content of the {@code json} placeholder is
 *                      expanded and returned instead of the URL
 * @return the expanded string; {@code null} when {@code getJsonParams} is set but the template
 *         has no {@code json} placeholder, or when there is nothing to expand ({@code url} is
 *         {@code null})
 * @throws IllegalArgumentException if a date/time placeholder is not a valid
 *                                  {@link SimpleDateFormat} pattern
 */
public static String parseUrl(String url, int page, String idCode, String keyword, Object[] objs, boolean getJsonParams) {
    Map<String, String> matchResult = RuleParser.parseUrl(url);
    if (getJsonParams && !matchResult.containsKey("json"))
        return null;
    String result = (getJsonParams) ? matchResult.get("json") : url;
    if (result == null)
        return null;

    String pageStr = matchResult.get("page");
    int startPage = 0;
    int pageStep = 1;
    try {
        if ("minid".equals(pageStr) && objs != null) {
            page = extremeId(objs, true);
        } else if ("maxid".equals(pageStr) && objs != null) {
            page = extremeId(objs, false);
        } else if (pageStr != null) {
            String[] pageStrs = pageStr.split(":");
            if (pageStrs.length > 1) {
                pageStep = Integer.parseInt(pageStrs[1]);
                startPage = Integer.parseInt(pageStrs[0]);
            } else {
                startPage = Integer.parseInt(pageStr);
            }
        }
    } catch (NumberFormatException ignored) {
        // Best effort: a non-numeric page spec or id keeps whatever values were set so far.
    }
    if (page < startPage)
        page = startPage;
    int realPage = page + (page - startPage) * (pageStep - 1);

    String pageStrTemplate = matchResult.get("pageStr");
    String pageStrReplacement = (realPage == startPage || pageStrTemplate == null)
            ? ""
            : Matcher.quoteReplacement(pageStrTemplate);
    result = result.replaceAll("\\{pageStr:(.*?\\{.*?\\}.*?)\\}", pageStrReplacement)
            .replaceAll("\\{page:.*?\\}", "" + realPage)
            .replaceAll("\\{keyword:.*?\\}", literalOrEmpty(keyword))
            .replaceAll("\\{idCode:\\}", literalOrEmpty(idCode));

    if (matchResult.containsKey("date")) {
        String currDate = formatShiftedNow(matchResult.get("date"), Calendar.DAY_OF_MONTH);
        result = result.replaceAll("\\{date:.*?\\}", Matcher.quoteReplacement(currDate));
    }
    if (matchResult.containsKey("time")) {
        String currTime = formatShiftedNow(matchResult.get("time"), Calendar.SECOND);
        result = result.replaceAll("\\{time:.*?\\}", Matcher.quoteReplacement(currTime));
    }
    return result;
}

/**
 * Returns the smallest or largest numeric id among {@code objs}. Collections contribute the
 * digits of their {@code idCode}, pictures their {@code pid}; other objects and collections
 * without an id code are skipped. With no contributing object the initial sentinel
 * ({@code Integer.MAX_VALUE} or {@code Integer.MIN_VALUE}) is returned.
 *
 * @throws NumberFormatException if a collection id code contains no digits or overflows an int
 */
private static int extremeId(Object[] objs, boolean smallest) {
    int extreme = smallest ? Integer.MAX_VALUE : Integer.MIN_VALUE;
    for (Object obj : objs) {
        int id;
        if (obj instanceof Collection) {
            String code = ((Collection) obj).idCode;
            if (code == null)
                continue;
            id = Integer.parseInt(code.replaceAll("[^0-9]", ""));
        } else if (obj instanceof Picture) {
            id = ((Picture) obj).pid;
        } else {
            continue;
        }
        extreme = smallest ? Math.min(extreme, id) : Math.max(extreme, id);
    }
    return extreme;
}

/** Turns {@code value} into a literal regex replacement string; {@code null} becomes empty. */
private static String literalOrEmpty(String value) {
    return (value == null) ? "" : Matcher.quoteReplacement(value);
}

/**
 * Formats the current time according to {@code spec}, which is either a plain
 * {@link SimpleDateFormat} pattern or {@code pattern:offset}. The offset is added to
 * {@code calendarField}. When the text after the last colon is not an integer (for example
 * the pattern {@code HH:mm:ss}) or the prefix is not a valid pattern, the whole spec is used
 * as the pattern and no offset is applied.
 */
private static String formatShiftedNow(String spec, int calendarField) {
    Calendar calendar = Calendar.getInstance();
    SimpleDateFormat format = null;
    int index = spec.lastIndexOf(':');
    if (index > 0) {
        try {
            int offset = Integer.parseInt(spec.substring(index + 1));
            format = new SimpleDateFormat(spec.substring(0, index));
            calendar.add(calendarField, offset);
        } catch (IllegalArgumentException ignored) {
            // NumberFormatException is an IllegalArgumentException too: fall back to the full spec.
            format = null;
        }
    }
    if (format == null)
        format = new SimpleDateFormat(spec);
    return format.format(calendar.getTime());
}
/**
 * Cheap heuristic telling JSON payloads apart from HTML: the text counts as JSON when its
 * first non-whitespace character opens an object or an array. No parsing is attempted.
 *
 * @param string the text to inspect, may be {@code null}
 * @return {@code true} when the trimmed text starts with '{' or '['
 */
public static boolean isJson(String string) {
    if (string == null) {
        return false;
    }
    String trimmed = string.trim();
    if (trimmed.isEmpty()) {
        return false;
    }
    char first = trimmed.charAt(0);
    return first == '{' || first == '[';
}
/**
 * Evaluates a comma-separated list of JSONPath expressions and flattens the results into one
 * array.
 * <p>
 * Because a comma can also be part of a single expression (for example {@code $.a[0,1]}),
 * a segment that cannot be read is retried joined with the following segments, re-inserting
 * the comma that the split removed, until a read succeeds or the segments run out. Segments
 * consumed by a successful joined read are not evaluated again; if no join succeeds only the
 * first segment is skipped.
 * <p>
 * Results that are {@code null} or JSON null are dropped; array results contribute their
 * elements, any other result is added as a single element.
 *
 * @param ctx       the parsed JSON document
 * @param jsonPaths comma-separated JSONPath expressions; {@code null} or empty yields an
 *                  empty array
 * @return the collected elements, never {@code null}
 */
public static JsonArray getJsonArray(ReadContext ctx, String jsonPaths) {
    JsonArray items = new JsonArray();
    if (jsonPaths == null || jsonPaths.isEmpty())
        return items;
    String[] paths = jsonPaths.split(",");
    int i = 0;
    while (i < paths.length) {
        JsonElement element = null;
        boolean resolved = false;
        int consumed = 1;
        StringBuilder candidate = new StringBuilder(paths[i]);
        for (int end = i; end < paths.length; end++) {
            if (end > i)
                candidate.append(',').append(paths[end]);
            try {
                element = ctx.read(candidate.toString(), JsonElement.class);
                resolved = true;
                consumed = end - i + 1;
                break;
            } catch (Exception e) {
                // Expected for missing data and for expressions cut in half by the split.
                if (end == i)
                    Logger.d("RuleParser", "path[" + i + "]:" + paths[i]);
            }
        }
        i += consumed;
        if (!resolved || element == null || element.isJsonNull())
            continue;
        if (element instanceof JsonArray) {
            items.addAll(element.getAsJsonArray());
        } else {
            items.add(element);
        }
    }
    return items;
}
/**
 * Parses the list items found in {@code text} and appends them to {@code collections}.
 * <p>
 * Convenience overload: delegates to the five-argument {@code getCollections} with
 * {@code noRegex} set to {@code false}, i.e. the item regex of the rule is applied.
 *
 * @param collections list the parsed collections are appended to
 * @param text        the response body
 * @param rule        the parsing rule
 * @param sourceUrl   URL the text was loaded from
 * @return whatever the five-argument overload returns for these arguments
 */
public static List<Collection> getCollections(List<Collection> collections, String text, Rule rule, String sourceUrl) {
return getCollections(collections, text, rule, sourceUrl, false);
}
/**
 * Parses every list item found in {@code text} into a {@link Collection} and appends it to
 * {@code collections}.
 * <p>
 * Text recognised by {@link #isJson(String)} is queried with {@code rule.item.path}; anything
 * else is parsed as HTML and queried with {@code rule.item.selector}. For HTML items the item
 * string is taken according to {@code rule.item.fun} ({@code attr} reads the attribute named
 * by {@code rule.item.param}, {@code html}, {@code text}, otherwise the outer markup).
 * <p>
 * Unless {@code noRegex} is set, items whose string does not match {@code rule.item.regex}
 * are skipped and matching strings are rewritten from the capture groups (see
 * {@code rewriteItem}). If the resulting string is of the other format (JSON inside HTML, or
 * HTML inside JSON) and the rule has a query for that format, the string is parsed
 * recursively with the regex disabled; otherwise the item itself is turned into a collection.
 * <p>
 * Any exception aborts parsing, is printed, and leaves the collections added so far in place.
 *
 * @param collections list the parsed collections are appended to
 * @param text        the response body (HTML or JSON)
 * @param rule        the parsing rule; {@code rule.item} is required
 * @param sourceUrl   URL the text was loaded from
 * @param noRegex     {@code true} to ignore {@code rule.item.regex}
 * @return the list holding the collections parsed so far
 */
public static List<Collection> getCollections(List<Collection> collections, String text, Rule rule, String sourceUrl, boolean noRegex) {
    try {
        if (!isJson(text)) {
            Document doc = Jsoup.parse(text);
            Elements elements = doc.select(rule.item.selector);
            // Compiled once for the whole list rather than once per item.
            Pattern itemPattern = (!noRegex && rule.item.regex != null) ? Pattern.compile(rule.item.regex) : null;
            for (Element element : elements) {
                String itemStr;
                if ("attr".equals(rule.item.fun))
                    itemStr = element.attr(rule.item.param);
                else if ("html".equals(rule.item.fun))
                    itemStr = element.html();
                else if ("text".equals(rule.item.fun))
                    itemStr = element.text();
                else
                    itemStr = element.toString();
                if (itemPattern != null) {
                    Matcher matcher = itemPattern.matcher(itemStr);
                    if (!matcher.find())
                        continue;
                    itemStr = rewriteItem(matcher, itemStr, rule.item.replacement);
                }
                if (rule.item.path != null && isJson(itemStr)) {
                    Logger.d("RuleParser", "isJson : true");
                    collections = getCollections(collections, itemStr, rule, sourceUrl, true);
                } else {
                    Collection collection = new Collection(collections.size() + 1);
                    collection = getCollectionDetail(collection, (Object) element, rule, sourceUrl);
                    collections.add(collection);
                }
            }
        } else {
            ReadContext ctx = JsonPath.parse(text);
            JsonArray elements = getJsonArray(ctx, rule.item.path);
            Logger.d("RuleParser", elements.toString());
            Pattern itemPattern = (!noRegex && rule.item.regex != null) ? Pattern.compile(rule.item.regex) : null;
            for (JsonElement element : elements) {
                String itemStr = element.toString();
                if (itemPattern != null) {
                    Matcher matcher = itemPattern.matcher(itemStr);
                    if (!matcher.find())
                        continue;
                    itemStr = rewriteItem(matcher, itemStr, rule.item.replacement);
                }
                if (rule.item.selector != null && !isJson(itemStr)) {
                    Logger.d("RuleParser", "isJson : false");
                    collections = getCollections(collections, itemStr, rule, sourceUrl, true);
                } else {
                    Collection collection = new Collection(collections.size() + 1);
                    collection = getCollectionDetail(collection, (Object) element, rule, sourceUrl);
                    collections.add(collection);
                }
            }
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return collections;
}

/**
 * Derives the item string from a successful regex match. Without capture groups the string is
 * returned unchanged; without a replacement template the first group is returned (possibly
 * {@code null} if that group did not participate); otherwise every {@code $n} in the template
 * is replaced by the literal text of group {@code n}, or by nothing when the group did not
 * participate.
 */
private static String rewriteItem(Matcher matcher, String itemStr, String replacement) {
    if (matcher.groupCount() < 1)
        return itemStr;
    if (replacement == null)
        return matcher.group(1);
    String rewritten = replacement;
    for (int i = 1; i <= matcher.groupCount(); i++) {
        String group = matcher.group(i);
        // quoteReplacement keeps '$' and '\' inside the captured text from being interpreted.
        rewritten = rewritten.replaceAll("\\$" + i, (group != null) ? Matcher.quoteReplacement(group) : "");
    }
    return rewritten;
}
/**
 * Fills {@code collection} with the details parsed from a raw response body.
 * <p>
 * When the rule defines both a list item selector and a picture item selector, the text is
 * parsed as a one-item list through {@link #getCollections(List, String, Rule, String)} and
 * the first parsed item, if any, replaces the content of {@code collection}. Otherwise the
 * text is parsed as HTML or, if {@link #isJson(String)} says so, as JSON, and handed to the
 * {@code Object} overload of this method.
 * <p>
 * Exceptions are printed and swallowed; the collection is then returned in whatever state it
 * had reached.
 *
 * @param collection the collection to update
 * @param text       the response body (HTML or JSON)
 * @param rule       the parsing rule; when {@code null} nothing is done
 * @param sourceUrl  URL the text was loaded from
 * @return the updated collection
 */
public static Collection getCollectionDetail(Collection collection, String text, Rule rule, String sourceUrl) {
    if (rule == null)
        return collection;
    try {
        if (rule.item != null && rule.pictureRule != null && rule.pictureRule.item != null) {
            List<Collection> parsed = getCollections(new ArrayList<Collection>(), text, rule, sourceUrl);
            // Nothing parsed means nothing to merge; do not rely on an IndexOutOfBoundsException for that.
            if (!parsed.isEmpty())
                collection.replace(parsed.get(0));
        } else if (!isJson(text)) {
            Document document = Jsoup.parse(text);
            collection = getCollectionDetail(collection, (Object) document, rule, sourceUrl);
        } else {
            JsonElement json = new JsonParser().parse(text);
            collection = getCollectionDetail(collection, (Object) json, rule, sourceUrl);
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return collection;
}
/**
 * Parses all details described by {@code rule} out of {@code source} and stores the non-empty
 * ones on {@code collection}.
 * <p>
 * Scalar properties (id code, title, uploader, cover, category, datetime, description,
 * rating) are read with {@link #parseSingleProperty}; cover is resolved as a URL. For HTML
 * sources {@code iframe} and {@code script} elements are stripped from the description.
 * Tags, pictures, videos and comments are then collected in that order. A field of
 * {@code collection} is only overwritten when the parsed value is non-empty (rating: greater
 * than zero), so earlier data survives a partial parse.
 *
 * @param collection the collection to update
 * @param source     a Jsoup {@code Element} or a Gson {@code JsonElement}
 * @param rule       the parsing rule
 * @param sourceUrl  URL the source was loaded from; stored as referer when non-empty
 * @return {@code collection}
 * @throws Exception if one of the property parsers fails
 */
public static Collection getCollectionDetail(Collection collection, Object source, Rule rule, String sourceUrl) throws Exception {
    String idCode = parseSingleProperty(source, rule.idCode, sourceUrl, false);
    String title = parseSingleProperty(source, rule.title, sourceUrl, false);
    String uploader = parseSingleProperty(source, rule.uploader, sourceUrl, false);
    String cover = parseSingleProperty(source, rule.cover, sourceUrl, true);
    String category = parseSingleProperty(source, rule.category, sourceUrl, false);
    String datetime = parseSingleProperty(source, rule.datetime, sourceUrl, false);
    String description = parseSingleProperty(source, rule.description, sourceUrl, false);
    if (source instanceof Element) {
        try {
            Element element = Jsoup.parse(description);
            element.select("iframe").remove();
            element.select("script").remove();
            description = element.select("body").html();
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    float rating = computeRating(parseSingleProperty(source, rule.rating, sourceUrl, false));
    List<Tag> tags = extractTags(source, rule, sourceUrl);
    List<Picture> pictures = extractPictures(source, rule, sourceUrl);
    List<Video> videos = extractVideos(source, rule, sourceUrl, collection, cover);
    List<Comment> comments = extractComments(source, rule, sourceUrl);

    if (!TextUtils.isEmpty(idCode))
        collection.idCode = idCode;
    if (!TextUtils.isEmpty(title))
        collection.title = title;
    if (!TextUtils.isEmpty(uploader))
        collection.uploader = uploader;
    if (!TextUtils.isEmpty(cover))
        collection.cover = cover;
    if (!TextUtils.isEmpty(category))
        collection.category = category;
    if (!TextUtils.isEmpty(datetime))
        collection.datetime = datetime;
    if (!TextUtils.isEmpty(description))
        collection.description = description;
    if (rating > 0)
        collection.rating = rating;
    if (!TextUtils.isEmpty(sourceUrl))
        collection.referer = sourceUrl;
    if (!tags.isEmpty())
        collection.tags = tags;
    if (!pictures.isEmpty())
        collection.pictures = pictures;
    if (!videos.isEmpty())
        collection.videos = videos;
    if (!comments.isEmpty())
        collection.comments = comments;
    return collection;
}

/**
 * Converts the raw rating text to a number: plain integers and decimals are parsed directly,
 * anything else is evaluated with {@code MathUtil.computeString}; if that result is not a
 * number either, the count of non-space characters (capped at 5) is used.
 */
private static float computeRating(String ratingStr) {
    if (ratingStr.matches("\\d+\\.\\d+") || StringUtil.isNumeric(ratingStr))
        return Float.parseFloat(ratingStr);
    String result = MathUtil.computeString(ratingStr);
    try {
        return result.contains("NaN") ? 0 : Float.parseFloat(result);
    } catch (NumberFormatException e) {
        return Math.min(ratingStr.replace(" ", "").length(), 5);
    }
}

/** Keeps only the items whose string form contains a match of {@code regex}; a null regex keeps all. */
private static List<Object> keepMatching(List<Object> items, String regex) {
    if (regex == null || items.isEmpty())
        return items;
    Pattern pattern = Pattern.compile(regex);
    List<Object> kept = new ArrayList<>(items.size());
    for (Object item : items) {
        if (pattern.matcher(item.toString()).find())
            kept.add(item);
    }
    return kept;
}

/** Parses {@code raw} as an int, yielding 0 when it is not a valid number. */
private static int parseIntOrZero(String raw) {
    try {
        return Integer.parseInt(raw);
    } catch (NumberFormatException e) {
        return 0;
    }
}

/** Collects tags either item by item ({@code rule.tagRule}) or from the flat {@code rule.tags} selector. */
private static List<Tag> extractTags(Object source, Rule rule, String sourceUrl) throws Exception {
    List<Tag> tags = new ArrayList<>();
    if (rule.tagRule != null && rule.tagRule.item != null) {
        List<Object> elements = keepMatching(parseItemMatchAll(source, rule.tagRule.item, sourceUrl), rule.tagRule.item.regex);
        for (Object element : elements) {
            String tagTitle = parseSingleProperty(element, rule.tagRule.title, sourceUrl, false);
            String tagUrl = parseSingleProperty(element, rule.tagRule.url, sourceUrl, true);
            if (TextUtils.isEmpty(tagUrl))
                tagUrl = null;
            tags.add(new Tag(tags.size() + 1, tagTitle, tagUrl));
        }
    } else if (rule.tags != null) {
        for (String tagStr : parseSinglePropertyMatchAll(source, rule.tags, sourceUrl, false)) {
            if (!TextUtils.isEmpty(tagStr))
                tags.add(new Tag(tags.size() + 1, tagStr));
        }
    }
    return tags;
}

/**
 * Collects pictures using {@code rule.pictureRule} or, failing that, the flat picture selectors
 * of the rule; both need a url and a thumbnail selector. With an item selector each item is
 * parsed on its own, otherwise the id/url/thumbnail/high-res lists are zipped by index.
 * A picture without a usable id gets the previous picture's id plus one (1 for the first).
 */
private static List<Picture> extractPictures(Object source, Rule rule, String sourceUrl) throws Exception {
    List<Picture> pictures = new ArrayList<>();
    Selector pictureId, pictureItem, pictureThumbnail, pictureUrl, pictureHighRes;
    if (rule.pictureRule != null && rule.pictureRule.url != null && rule.pictureRule.thumbnail != null) {
        pictureId = rule.pictureRule.id;
        pictureItem = rule.pictureRule.item;
        pictureThumbnail = rule.pictureRule.thumbnail;
        pictureUrl = rule.pictureRule.url;
        pictureHighRes = rule.pictureRule.highRes;
    } else if (rule.pictureUrl != null && rule.pictureThumbnail != null) {
        pictureId = rule.pictureId;
        pictureItem = rule.item;
        pictureThumbnail = rule.pictureThumbnail;
        pictureUrl = rule.pictureUrl;
        pictureHighRes = rule.pictureHighRes;
    } else {
        return pictures;
    }
    if (pictureItem != null) {
        List<Object> elements = keepMatching(parseItemMatchAll(source, pictureItem, sourceUrl), pictureItem.regex);
        for (Object element : elements) {
            int pid = parseIntOrZero(parseSingleProperty(element, pictureId, sourceUrl, false));
            if (pid == 0)
                pid = pictures.isEmpty() ? 1 : pictures.get(pictures.size() - 1).pid + 1;
            String pUrl = parseSingleProperty(element, pictureUrl, sourceUrl, true);
            String pHighRes = parseSingleProperty(element, pictureHighRes, sourceUrl, true);
            String pThumbnail = parseSingleProperty(element, pictureThumbnail, sourceUrl, true);
            pictures.add(new Picture(pid, pUrl, pThumbnail, pHighRes, sourceUrl));
        }
    } else {
        List<String> pids = parseSinglePropertyMatchAll(source, pictureId, sourceUrl, false);
        List<String> urls = parseSinglePropertyMatchAll(source, pictureUrl, sourceUrl, true);
        List<String> thumbnails = parseSinglePropertyMatchAll(source, pictureThumbnail, sourceUrl, true);
        List<String> highReses = parseSinglePropertyMatchAll(source, pictureHighRes, sourceUrl, true);
        for (int i = 0; i < urls.size(); i++) {
            int pid = parseIntOrZero((i < pids.size()) ? pids.get(i) : "");
            if (pid == 0)
                pid = pictures.isEmpty() ? 1 : pictures.get(pictures.size() - 1).pid + 1;
            String thumbnail = (i < thumbnails.size()) ? thumbnails.get(i) : "";
            String highRes = (i < highReses.size()) ? highReses.get(i) : "";
            pictures.add(new Picture(pid, urls.get(i), thumbnail, highRes, sourceUrl));
        }
    }
    return pictures;
}

/**
 * Collects videos described by {@code rule.videoRule}. A video without its own thumbnail uses
 * the freshly parsed cover or, when that is empty, the cover already stored on the collection.
 */
private static List<Video> extractVideos(Object source, Rule rule, String sourceUrl, Collection collection, String cover) throws Exception {
    List<Video> videos = new ArrayList<>();
    if (rule.videoRule == null || rule.videoRule.item == null)
        return videos;
    List<Object> elements = keepMatching(parseItemMatchAll(source, rule.videoRule.item, sourceUrl), rule.videoRule.item.regex);
    for (Object element : elements) {
        int vid = parseIntOrZero(parseSingleProperty(element, rule.videoRule.id, sourceUrl, false));
        if (vid == 0)
            vid = videos.isEmpty() ? 1 : videos.get(videos.size() - 1).vid + 1;
        String vThumbnail = parseSingleProperty(element, rule.videoRule.thumbnail, sourceUrl, true);
        if (TextUtils.isEmpty(vThumbnail))
            vThumbnail = (TextUtils.isEmpty(cover)) ? collection.cover : cover;
        String vContent = parseSingleProperty(element, rule.videoRule.content, sourceUrl, true);
        videos.add(new Video(vid, vThumbnail, vContent));
    }
    return videos;
}

/**
 * Collects comments using {@code rule.commentRule} or, failing that, the flat comment selectors
 * of the rule; both need an item and a content selector.
 */
private static List<Comment> extractComments(Object source, Rule rule, String sourceUrl) throws Exception {
    List<Comment> comments = new ArrayList<>();
    Selector commentItem, commentAvatar, commentAuthor, commentDatetime, commentContent;
    if (rule.commentRule != null && rule.commentRule.item != null && rule.commentRule.content != null) {
        commentItem = rule.commentRule.item;
        commentAvatar = rule.commentRule.avatar;
        commentAuthor = rule.commentRule.author;
        commentDatetime = rule.commentRule.datetime;
        commentContent = rule.commentRule.content;
    } else if (rule.commentItem != null && rule.commentContent != null) {
        commentItem = rule.commentItem;
        commentAvatar = rule.commentAvatar;
        commentAuthor = rule.commentAuthor;
        commentDatetime = rule.commentDatetime;
        commentContent = rule.commentContent;
    } else {
        return comments;
    }
    List<Object> elements = keepMatching(parseItemMatchAll(source, commentItem, sourceUrl), commentItem.regex);
    for (Object element : elements) {
        String cAvatar = parseSingleProperty(element, commentAvatar, sourceUrl, false);
        String cAuthor = parseSingleProperty(element, commentAuthor, sourceUrl, false);
        String cDatetime = parseSingleProperty(element, commentDatetime, sourceUrl, false);
        String cContent = parseSingleProperty(element, commentContent, sourceUrl, false);
        comments.add(new Comment(comments.size() + 1, cAvatar, cAuthor, cDatetime, cContent, sourceUrl));
    }
    return comments;
}
/**
 * Finds all items that {@code selector} designates inside {@code source}.
 * <p>
 * For an HTML {@code Element} the CSS selector is applied ({@code "this"} means the element
 * itself). If the selector also has a JSONPath, each element's value (chosen by
 * {@code selector.fun}) is run through {@link #getPropertyAfterRegex}, every resulting string
 * is parsed as JSON and the path results become the items; otherwise the elements themselves
 * are the items, filtered by {@code selector.regex} when present.
 * <p>
 * For a {@code JsonElement} the JSONPath is applied. If the selector also has a CSS selector,
 * each primitive result is parsed as HTML and the selected elements become the items
 * (failures are printed and skipped); otherwise the JSON results are the items, filtered by
 * {@code selector.regex} when present.
 *
 * @param source    a Jsoup {@code Element} or a Gson {@code JsonElement}; other types yield nothing
 * @param selector  the item selector; {@code null} yields nothing
 * @param sourceUrl URL the source was loaded from
 * @return the matched items, never {@code null}
 * @throws Exception if selecting or JSON parsing fails outside the guarded HTML-in-JSON step
 */
public static List<Object> parseItemMatchAll(Object source, Selector selector, String sourceUrl) throws Exception {
    List<Object> matched = new ArrayList<>();
    if (selector == null)
        return matched;

    if (source instanceof Element) {
        Element root = (Element) source;
        Elements candidates = ("this".equals(selector.selector)) ? new Elements(root) : root.select(selector.selector);
        if (candidates == null)
            return matched;
        boolean viaJsonPath = !TextUtils.isEmpty(selector.path);
        for (Element candidate : candidates) {
            if (!viaJsonPath) {
                // Plain HTML item: optional regex filter on the outer markup.
                if (selector.regex != null
                        && !Pattern.compile(selector.regex).matcher(candidate.toString()).find())
                    continue;
                matched.add(candidate);
                continue;
            }
            String raw = itemValueOf(candidate, selector);
            List<String> jsonTexts = getPropertyAfterRegex(new ArrayList<String>(), raw, selector, sourceUrl, false);
            for (String jsonText : jsonTexts) {
                ReadContext ctx = JsonPath.parse(jsonText);
                JsonArray hits = getJsonArray(ctx, selector.path);
                for (JsonElement hit : hits)
                    matched.add(hit);
            }
        }
        return matched;
    }

    if (source instanceof JsonElement) {
        ReadContext ctx = JsonPath.parse(source.toString());
        JsonArray candidates = getJsonArray(ctx, selector.path);
        if (candidates == null)
            return matched;
        boolean viaDocument = !TextUtils.isEmpty(selector.selector);
        for (JsonElement candidate : candidates) {
            if (!viaDocument) {
                // Plain JSON item: optional regex filter on the serialized element.
                if (selector.regex != null
                        && !Pattern.compile(selector.regex).matcher(candidate.toString()).find())
                    continue;
                matched.add(candidate);
                continue;
            }
            if (!(candidate instanceof JsonPrimitive))
                continue;
            try {
                String markup = candidate.getAsString();
                Elements nested = ("this".equals(selector.selector))
                        ? new Elements(Jsoup.parse(markup))
                        : Jsoup.parse(markup).select(selector.selector);
                matched.addAll(nested);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return matched;
}

/** Reads the value of one element as dictated by {@code selector.fun}; unknown functions give the outer markup. */
private static String itemValueOf(Element element, Selector selector) {
    if ("attr".equals(selector.fun))
        return element.attr(selector.param);
    if ("html".equals(selector.fun))
        return element.html();
    if ("text".equals(selector.fun))
        return element.text();
    return element.toString();
}
/**
 * Returns the first value that {@code selector} yields for {@code source}.
 *
 * @param source    a Jsoup {@code Element} or a Gson {@code JsonElement}
 * @param selector  the property selector, may be {@code null}
 * @param sourceUrl URL the source was loaded from
 * @param isUrl     whether the value is to be treated as a URL
 * @return the first entry of {@link #parseSinglePropertyMatchAll}, or an empty string when
 *         there is none
 * @throws Exception if {@link #parseSinglePropertyMatchAll} fails
 */
public static String parseSingleProperty(Object source, Selector selector, String sourceUrl, boolean isUrl) throws Exception {
    List<String> values = parseSinglePropertyMatchAll(source, selector, sourceUrl, isUrl);
    if (values.size() > 0) {
        return values.get(0);
    }
    return "";
}
/**
 * Returns every value that {@code selector} yields for {@code source}.
 * <p>
 * HTML source: the CSS selector is applied ({@code "this"} means the element itself), each
 * element's value is chosen by {@code selector.fun} and passed through
 * {@link #getPropertyAfterRegex}. If the selector also has a JSONPath, every collected value
 * is then parsed as JSON and replaced by the first path result when that result is
 * non-empty; URL resolution is postponed to this second step. A failure in the second step
 * is printed and leaves the remaining values untouched.
 * <p>
 * JSON source: the JSONPath is applied. If the selector also has a CSS selector, every
 * result is parsed as HTML, queried, reduced by {@code selector.fun} and, when non-empty,
 * passed through {@link #getPropertyAfterRegex} (failures are printed and skipped).
 * Otherwise non-empty results other than the text {@code null} are passed through directly.
 *
 * @param source    a Jsoup {@code Element} or a Gson {@code JsonElement}; other types yield nothing
 * @param selector  the property selector; {@code null} yields nothing
 * @param sourceUrl URL the source was loaded from
 * @param isUrl     whether values are to be resolved against {@code sourceUrl}
 * @return the collected values, never {@code null}
 * @throws Exception if selecting or the initial JSON parsing fails
 */
public static List<String> parseSinglePropertyMatchAll(Object source, Selector selector, String sourceUrl, boolean isUrl) throws Exception {
    List<String> props = new ArrayList<>();
    if (selector == null)
        return props;

    if (source instanceof Element) {
        Element root = (Element) source;
        Elements matches = ("this".equals(selector.selector)) ? new Elements(root) : root.select(selector.selector);
        if (matches == null)
            return props;
        boolean viaJsonPath = !TextUtils.isEmpty(selector.path);
        // With a JSONPath step still to come, the raw text is not a URL yet.
        boolean resolveNow = isUrl && !viaJsonPath;
        for (Element match : matches) {
            String raw;
            if ("attr".equals(selector.fun))
                raw = match.attr(selector.param);
            else if ("html".equals(selector.fun))
                raw = match.html();
            else if ("text".equals(selector.fun))
                raw = match.text();
            else
                raw = match.toString();
            props = getPropertyAfterRegex(props, raw, selector, sourceUrl, resolveNow);
        }
        if (!viaJsonPath)
            return props;
        try {
            for (int i = 0; i < props.size(); i++) {
                ReadContext ctx = JsonPath.parse(props.get(i));
                JsonArray hits = getJsonArray(ctx, selector.path);
                if (hits.size() <= 0)
                    continue;
                JsonElement first = hits.get(0);
                String value = (first instanceof JsonPrimitive) ? first.getAsString() : first.toString();
                if (TextUtils.isEmpty(value))
                    continue;
                if (isUrl)
                    value = RegexValidateUtil.getAbsoluteUrlFromRelative(value, sourceUrl);
                props.set(i, value);
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return props;
    }

    if (source instanceof JsonElement) {
        ReadContext ctx = JsonPath.parse(source.toString());
        JsonArray hits = getJsonArray(ctx, selector.path);
        if (hits == null)
            return props;
        boolean viaDocument = !TextUtils.isEmpty(selector.selector);
        for (JsonElement hit : hits) {
            String value = (hit instanceof JsonPrimitive) ? hit.getAsString() : hit.toString();
            if (!viaDocument) {
                if (!TextUtils.isEmpty(value) && !"null".equals(value.trim()))
                    props = getPropertyAfterRegex(props, value, selector, sourceUrl, isUrl);
                continue;
            }
            try {
                Document parsed = Jsoup.parse(value);
                Elements nested = ("this".equals(selector.selector)) ? new Elements(parsed) : parsed.select(selector.selector);
                String extracted;
                if ("attr".equals(selector.fun))
                    extracted = nested.attr(selector.param);
                else if ("html".equals(selector.fun))
                    extracted = nested.html();
                else if ("text".equals(selector.fun))
                    extracted = nested.text();
                else
                    extracted = nested.toString();
                if (!TextUtils.isEmpty(extracted))
                    props = getPropertyAfterRegex(props, extracted, selector, sourceUrl, isUrl);
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return props;
}
/**
 * Post-processes one raw property value and appends the outcome to {@code props}.
 * <p>
 * Without {@code selector.regex} the value itself is appended. With a regex (compiled with
 * {@code DOTALL}) one entry is appended per match, provided the regex has at least one
 * capture group: either the first group, or {@code selector.replacement} with every
 * {@code $n} replaced by the literal text of group {@code n} (nothing for a group that did
 * not participate). A first group that did not participate counts as an empty string.
 * <p>
 * When {@code isUrl} is set, values are resolved against {@code sourceUrl}; an empty value
 * from a regex match stops the matching loop. Every appended value is trimmed and
 * HTML-unescaped.
 *
 * @param props     list the results are appended to
 * @param prop      the raw value
 * @param selector  supplies {@code regex} and {@code replacement}
 * @param sourceUrl base URL for relative URLs
 * @param isUrl     whether the value is a URL
 * @return {@code props}
 */
public static List<String> getPropertyAfterRegex(List<String> props, String prop, Selector selector, String sourceUrl, boolean isUrl) {
    if (selector.regex == null) {
        if (isUrl && !TextUtils.isEmpty(prop)) {
            prop = RegexValidateUtil.getAbsoluteUrlFromRelative(prop, sourceUrl);
        }
        props.add(StringEscapeUtils.unescapeHtml(prop.trim()));
        return props;
    }
    Pattern pattern = Pattern.compile(selector.regex, Pattern.DOTALL);
    Matcher matcher = pattern.matcher(prop);
    while (matcher.find() && matcher.groupCount() >= 1) {
        String value;
        if (selector.replacement != null) {
            value = selector.replacement;
            for (int i = 1; i <= matcher.groupCount(); i++) {
                String group = matcher.group(i);
                // quoteReplacement keeps '$' and '\' inside the captured text from being interpreted.
                value = value.replaceAll("\\$" + i, (group != null) ? Matcher.quoteReplacement(group) : "");
            }
        } else {
            String first = matcher.group(1);
            value = (first != null) ? first : "";
        }
        if (isUrl) {
            if (TextUtils.isEmpty(value))
                break;
            value = RegexValidateUtil.getAbsoluteUrlFromRelative(value, sourceUrl);
        }
        props.add(StringEscapeUtils.unescapeHtml(value.trim()));
    }
    return props;
}
/**
 * Extracts a single picture URL from a response body.
 * <p>
 * The text is parsed as JSON when {@link #isJson(String)} says so and as HTML otherwise, then
 * queried with {@code selector} through {@link #parseSingleProperty} in URL mode. JSON is
 * parsed into a Gson {@code JsonElement} because that is the JSON source type
 * {@link #parseSinglePropertyMatchAll} recognises.
 *
 * @param text      the response body (HTML or JSON)
 * @param selector  selector locating the picture URL
 * @param sourceUrl URL the text was loaded from
 * @return the first matching URL, or an empty string when nothing matches or parsing fails
 *         (the failure is printed)
 */
public static String getPictureUrl(String text, Selector selector, String sourceUrl) {
    try {
        Object source;
        if (isJson(text)) {
            source = new JsonParser().parse(text);
        } else {
            source = Jsoup.parse(text);
        }
        return parseSingleProperty(source, selector, sourceUrl, true);
    } catch (Exception e) {
        e.printStackTrace();
        return "";
    }
}
/** Matches an http(s) URL that contains ".mp4" or ".flv" and no quote or angle bracket; case-insensitive. */
private static final Pattern VIDEO_URL_PATTERN =
        Pattern.compile("https?[^\"'<>]*?[^\"'<>]+?\\.(?:mp4|flv)[^\"'<>]*", Pattern.CASE_INSENSITIVE);

/**
 * Scans raw page text for direct links to mp4/flv video files.
 * <p>
 * Every match is passed through {@code RegexValidateUtil.getAbsoluteUrlFromRelative} with
 * {@code sourceUrl} and appended in order of appearance; duplicates are kept. If that
 * resolution throws, the exception is printed and the URLs collected so far are returned.
 *
 * @param html      the page text to scan; {@code null} yields an empty list
 * @param sourceUrl URL the text was loaded from
 * @return the video URLs found, never {@code null}
 */
public static List<String> getVideoUrl(String html, String sourceUrl) {
    List<String> videoUrls = new ArrayList<>();
    if (html == null)
        return videoUrls;
    try {
        Matcher matcher = VIDEO_URL_PATTERN.matcher(html);
        while (matcher.find()) {
            // A match always starts with "http", so it can never be empty.
            videoUrls.add(RegexValidateUtil.getAbsoluteUrlFromRelative(matcher.group(), sourceUrl));
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
    return videoUrls;
}
}