/******************************************************************************
*
* Copyright 2014 Paphus Solutions Inc.
*
* Licensed under the Eclipse Public License, Version 1.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.eclipse.org/legal/epl-v10.html
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
******************************************************************************/
package org.botlibre.sense.http;
import java.io.ByteArrayInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.StringReader;
import java.io.StringWriter;
import java.io.Writer;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.sql.Timestamp;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Queue;
import java.util.Set;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.logging.Level;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathFactory;
import org.botlibre.Bot;
import org.botlibre.BotException;
import org.botlibre.api.knowledge.Network;
import org.botlibre.api.knowledge.Relationship;
import org.botlibre.api.knowledge.Vertex;
import org.botlibre.knowledge.Primitive;
import org.botlibre.sense.BasicSense;
import org.botlibre.util.TextStream;
import org.botlibre.util.Utils;
import org.htmlcleaner.HtmlCleaner;
import org.htmlcleaner.SimpleXmlSerializer;
import org.htmlcleaner.TagNode;
import org.w3c.dom.Attr;
import org.w3c.dom.CDATASection;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.InputSource;
import net.sf.json.JSON;
import net.sf.json.JSONArray;
import net.sf.json.JSONObject;
import net.sf.json.JSONSerializer;
/**
* Process http requests, gets and puts.
*/
public class Http extends BasicSense {
public static int WORKER_THREADS = 1;
protected ThreadLocal<DocumentBuilder> parser = new ThreadLocal<DocumentBuilder>();
protected ThreadLocal<HtmlCleaner> htmlCleaner = new ThreadLocal<HtmlCleaner>();
protected Map<String, Http> domains;
/**
* Thread implementation to allow multi-threading in URL processing.
*/
class WorkerThread implements Runnable {
Queue<URL> urls;
@Override
public void run() {
Network memory = getBot().memory().newMemory();
while (!urls.isEmpty()) {
URL url = urls.poll();
batchProcessURL(url, memory);
}
}
}
/**
* Process the URL as part of a batch.
*/
public void batchProcessURL(URL url, Network network) {
if (url == null) {
return;
}
log("Input", Level.FINE, url);
Element root = parseURL(url);
if (root != null) {
int attempt = 0;
Exception failure = null;
while (attempt < RETRY) {
attempt++;
try {
processRoot(root, url, network);
network.save();
break;
} catch (Exception failed) {
failure = failed;
log(failed.toString(), Level.WARNING);
log("Retrying", Level.WARNING);
}
}
if (attempt == RETRY) {
log("Retry failed", Level.WARNING);
log(failure);
}
}
}
public Http() {
this.domains = new HashMap<String, Http>();
}
/**
* Convert the HTML input stream into DOM parsable XHTML.
*/
public StringReader convertToXHTML(InputStream input) throws IOException {
StringWriter output = new StringWriter();
/*int next = input.read();
while (next != -1) {
output.write(next);
next = input.read();
}
String result = output.toString();
System.out.println(result);*/
TagNode node = getHtmlCleaner().clean(input, "UTF-8");
//TagNode node = getHtmlCleaner().clean(result);
node.serialize(new SimpleXmlSerializer(getHtmlCleaner().getProperties()), output);
output.flush();
String xhtml = output.toString();
return new StringReader(xhtml);
}
/**
* Convert the HTML input stream into DOM parsable XHTML.
*/
public String convertToXHTML(String html) throws IOException {
StringWriter output = new StringWriter();
TagNode node = getHtmlCleaner().clean(html);
node.serialize(new SimpleXmlSerializer(getHtmlCleaner().getProperties()), output);
output.flush();
return output.toString();
}
/**
* Stop sensing.
*/
@Override
public void shutdown() {
super.shutdown();
disconnect();
}
/**
* Reset state when instance is pooled.
*/
@Override
public void pool() {
disconnect();
}
public void disconnect() {
this.parser.remove();
}
public DocumentBuilder getParser() throws Exception {
if (this.parser.get() == null) {
DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
this.parser.set(factory.newDocumentBuilder());
}
return this.parser.get();
}
public HtmlCleaner getHtmlCleaner() {
if (this.htmlCleaner.get() == null) {
this.htmlCleaner.set(new HtmlCleaner());
}
return this.htmlCleaner.get();
}
/**
* Parse the input HTML into a DOM.
*/
public Element parseHTML(String html) throws Exception {
String xhtml = convertToXHTML(html);
StringReader reader = new StringReader(xhtml);
Document document = getParser().parse(new InputSource(reader));
return document.getDocumentElement();
}
/**
* Parse the input XHTML stream into a DOM.
*/
public Element parseXHTML(StringReader input) throws Exception {
Document document = getParser().parse(new InputSource(input));
return document.getDocumentElement();
}
/**
* Parse the input XML stream into a DOM.
*/
public Element parseXML(InputStream input) throws Exception {
Document document = getParser().parse(input, "UTF-8");
return document.getDocumentElement();
}
/**
* Get and process the URL.
*/
@Override
public void input(Object input, Network network) throws Exception {
if (!isEnabled()) {
return;
}
log("Input", Level.INFO, input);
URL url = null;
if (input instanceof URL) {
url = (URL)input;
} else if (input instanceof URI) {
url = (URL)((URI)input).toURL();
} else {
return;
}
// Redirect to specialization.
String domain = url.getHost();
Http domainSense = getDomains().get(domain);
if (domainSense != null) {
domainSense.input(url);
return;
}
// Parse the HTML as a DOM.
Element root = parseURL(url);
processRoot(root, url, network);
}
/**
* Parse the HTML as a DOM.
*/
public Element parseURL(URL url) {
try {
InputStream stream = Utils.openStream(url);
StringReader reader = convertToXHTML(stream);
return parseXHTML(reader);
} catch (FileNotFoundException notFound) {
log(notFound.toString(), Level.INFO);
return null;
} catch (Exception ioException) {
if (getBot().isDebugFine()) {
log(ioException);
} else {
log(ioException.toString(), Level.WARNING);
}
return null;
}
}
/**
* Self API.
* Return the XML data from the URL.
*/
public Vertex requestXML(Vertex source, Vertex url, Vertex xpath) {
Network network = source.getNetwork();
return requestXML(url.printString(), xpath.printString(), network);
}
/**
* Self API.
* Return the HTML data from the URL.
*/
public Vertex requestHTML(Vertex source, Vertex url, Vertex xpath) {
Network network = source.getNetwork();
return requestHTML(url.printString(), xpath.printString(), network);
}
/**
* Self API.
* Return the XML data from the URL.
*/
public Vertex requestXML(Vertex source, Vertex url) {
Network network = source.getNetwork();
return requestXML(url.printString(), network);
}
/**
* Return the XML data from the URL.
*/
public Vertex requestXML(String url, String xpath, Network network) {
log("GET XML", Level.INFO, url, xpath);
try {
Element element = parseXMLURL(new URL(url));
if (element == null) {
return null;
}
XPathFactory factory = XPathFactory.newInstance();
XPath path = factory.newXPath();
Object node = path.evaluate(xpath, element, XPathConstants.NODE);
if (node instanceof Element) {
return convertElement((Element)node, network);
} else if (node instanceof Attr) {
return network.createVertex(((Attr)node).getValue());
} else if (node instanceof org.w3c.dom.Text) {
return network.createVertex(((org.w3c.dom.Text)node).getTextContent());
}
return null;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Return the HTML data from the URL.
*/
public Vertex requestHTML(String url, String xpath, Network network) {
log("GET HTML", Level.INFO, url, xpath);
try {
Element element = parseURL(new URL(url));
if (element == null) {
return null;
}
XPathFactory factory = XPathFactory.newInstance();
XPath path = factory.newXPath();
Object node = path.evaluate(xpath, element, XPathConstants.NODE);
if (node instanceof Element) {
return convertElement((Element)node, network);
} else if (node instanceof Attr) {
return network.createVertex(((Attr)node).getValue());
} else if (node instanceof org.w3c.dom.Text) {
return network.createVertex(((org.w3c.dom.Text)node).getTextContent());
}
return null;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Return the XML data object from the URL.
*/
public Vertex requestXML(String url, Network network) {
log("GET XML", Level.INFO, url);
try {
Element element = parseXMLURL(new URL(url));
if (element == null) {
return null;
}
Vertex root = convertElement(element, network);
return root;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Self API.
* Return the JSON data object from the URL.
*/
public Vertex requestJSON(Vertex source, Vertex url) {
Network network = source.getNetwork();
return requestJSON(url.printString(), network);
}
/**
* Self API.
* Send a DELTE request to the URL.
*/
public Vertex delete(Vertex source, Vertex url) {
Network network = source.getNetwork();
return delete(url.printString(), network);
}
/**
* Self API.
* Return the CSV data object from the URL.
*/
public Vertex requestCSV(Vertex source, Vertex url) {
Network network = source.getNetwork();
return requestCSV(url.printString(), network);
}
/**
* Self API.
* Return the JSON data object from the URL.
*/
public Vertex requestJSON(Vertex source, Vertex attribute, Vertex url) {
Network network = source.getNetwork();
return requestJSON(url.printString(), attribute.printString(), network);
}
/**
* Return the CSV data object from the URL.
*/
public Vertex requestCSV(String url, Network network) {
log("GET CSV", Level.INFO, url);
try {
String csv = Utils.httpGET(url);
Vertex rows = network.createInstance(Primitive.ARRAY);
TextStream stream = new TextStream(csv);
boolean first = true;
List<Vertex> columns = new ArrayList<Vertex>();
while (!stream.atEnd()) {
String line = stream.nextLine().trim();
// Skip blank lines.
while (line.isEmpty()) {
if (stream.atEnd()) {
return rows;
}
line = stream.nextLine().trim();
}
// Allow either ',' or '","' separators.
boolean quotes = line.contains("\"");
// "questions","answer","topic"
// "What is this? What's this?","This is Open Bot.","Bot"
TextStream lineStream = new TextStream(line);
if (quotes) {
lineStream.skipTo('"');
lineStream.skip();
if (lineStream.atEnd()) {
getBot().log(this, "Expecting \" character", Level.WARNING, line);
continue;
}
}
if (first) {
// Process columns
while (!lineStream.atEnd()) {
String value = null;
if (quotes) {
value = lineStream.upToAll("\",\"").trim();
lineStream.skip("\",\"".length());
} else {
value = lineStream.upTo(',').trim();
lineStream.skip();
}
columns.add(network.createVertex(new Primitive(value)));
}
first = false;
} else {
Vertex object = null;
// Process values
int index = 0;
while (!lineStream.atEnd()) {
String value = null;
if (quotes) {
value = lineStream.upToAll("\",\"").trim();
lineStream.skip("\",\"".length());
} else {
value = lineStream.upTo(',').trim();
lineStream.skip();
}
if (lineStream.atEnd() && !value.isEmpty() && value.charAt(value.length() - 1) == '"') {
value = value.substring(0, value.length() - 1);
}
Vertex column = columns.get(index);
boolean data = false; //column.is(Primitive.DATA);
if (object == null) {
//if (data) {
// object = memory.createVertex(value);
//} else {
object = network.createVertex();
//}
}
if (!data && !value.isEmpty()) {
object.addRelationship((Primitive)column.getData(), network.createVertex(value));
}
index++;
}
if (object != null) {
rows.addRelationship(Primitive.ELEMENT, network.createVertex(object));
}
}
}
return rows;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Return the JSON data object from the URL.
*/
public Vertex requestJSON(String url, Network network) {
log("GET JSON", Level.INFO, url);
try {
String json = Utils.httpGET(url);
log("JSON", Level.FINE, json);
JSON root = (JSON)JSONSerializer.toJSON(json);
if (root == null) {
return null;
}
Vertex object = convertElement(root, network);
return object;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Send a DELETE request to the URL.
*/
public Vertex delete(String url, Network network) {
log("DELETE", Level.INFO, url);
try {
Utils.httpDELETE(url);
return network.createVertex(Primitive.TRUE);
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Return the JSON data object from the URL.
*/
public Vertex requestJSON(String url, String attribute, Network network) {
log("GET JSON", Level.INFO, url, attribute);
try {
String json = Utils.httpGET(url);
log("JSON", Level.FINE, json);
JSONObject root = (JSONObject)JSONSerializer.toJSON(json);
if (root == null) {
return null;
}
Object value = root.get(attribute);
if (value == null) {
return null;
}
Vertex object = convertElement(value, network);
return object;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Self API.
* POST the JSON object and return the XML data from the URL.
*/
public Vertex postJSON(Vertex source, Vertex url, Vertex jsonObject) {
Network network = source.getNetwork();
return postJSON(url.printString(), jsonObject, network);
}
/**
* Self API.
* PUT the JSON object and return the XML data from the URL.
*/
public Vertex putJSON(Vertex source, Vertex url, Vertex jsonObject) {
Network network = source.getNetwork();
return putJSON(url.printString(), jsonObject, network);
}
/**
* Self API.
* Post the JSON object and return the XML data from the URL.
*/
public Vertex postJSONAuth(Vertex source, Vertex url, Vertex user, Vertex password, Vertex jsonObject) {
Network network = source.getNetwork();
return postJSONAuth(url.printString(), user.printString(), password.printString(), jsonObject, network);
}
/**
* Self API.
* POST the XML document object and return the XML data from the URL.
*/
public Vertex postXML(Vertex source, Vertex url, Vertex xmlObject) {
Network network = source.getNetwork();
return postXML(url.printString(), xmlObject, network);
}
/**
* Post the XML document object and return the XML data from the URL.
*/
public Vertex postXML(String url, Vertex xmlObject, Network network) {
log("POST XML", Level.INFO, url);
try {
String data = convertToXML(xmlObject);
log("POST XML", Level.FINE, data);
String xml = Utils.httpPOST(url, "application/xml", data);
log("XML", Level.FINE, xml);
InputStream stream = new ByteArrayInputStream(xml.getBytes("utf-8"));
Element element = parseXML(stream);
if (element == null) {
return null;
}
Vertex root = convertElement(element, network);
return root;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Post the XML document object and return the XML data from the URL.
*/
public Vertex postXMLAuth(String url, String user, String password, Vertex xmlObject, String xpath, Network network) {
log("POST XML Auth", Level.INFO, url);
try {
String data = convertToXML(xmlObject);
log("POST XML", Level.FINE, data);
String xml = Utils.httpAuthPOST(url, user, password, "application/xml", data);
log("XML", Level.FINE, xml);
InputStream stream = new ByteArrayInputStream(xml.getBytes("utf-8"));
Element element = parseXML(stream);
if (element == null) {
return null;
}
XPathFactory factory = XPathFactory.newInstance();
XPath path = factory.newXPath();
Object node = path.evaluate(xpath, element, XPathConstants.NODE);
if (node instanceof Element) {
return convertElement((Element)node, network);
} else if (node instanceof Attr) {
return network.createVertex(((Attr)node).getValue());
} else if (node instanceof org.w3c.dom.Text) {
return network.createVertex(((org.w3c.dom.Text)node).getTextContent());
}
return null;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* POST the JSON object and return the JSON data from the URL.
*/
public Vertex postJSON(String url, Vertex jsonObject, Network network) {
log("POST JSON", Level.INFO, url);
try {
String data = convertToJSON(jsonObject);
log("POST JSON", Level.FINE, data);
String json = Utils.httpPOST(url, "application/json", data);
log("JSON", Level.FINE, json);
JSONObject root = (JSONObject)JSONSerializer.toJSON(json);
if (root == null) {
return null;
}
Vertex object = convertElement(root, network);
return object;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* PUT the JSON object and return the JSON data from the URL.
*/
public Vertex putJSON(String url, Vertex jsonObject, Network network) {
log("PUT JSON", Level.INFO, url);
try {
String data = convertToJSON(jsonObject);
log("PUT JSON", Level.FINE, data);
String json = Utils.httpPUT(url, "application/json", data);
log("JSON", Level.FINE, json);
JSONObject root = (JSONObject)JSONSerializer.toJSON(json);
if (root == null) {
return null;
}
Vertex object = convertElement(root, network);
return object;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Post the JSON object and return the JSON data from the URL.
*/
public Vertex postJSONAuth(String url, String user, String password, Vertex jsonObject, Network network) {
log("POST JSON Auth", Level.INFO, url);
try {
String data = convertToJSON(jsonObject);
log("POST JSON", Level.FINE, data);
String json = Utils.httpAuthPOST(url, user, password, "application/json", data);
log("JSON", Level.FINE, json);
JSONObject root = (JSONObject)JSONSerializer.toJSON(json);
if (root == null) {
return null;
}
Vertex object = convertElement(root, network);
return object;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Self API.
* Post the HTML forms params and return the HTML data from the URL.
*/
public Vertex postHTML(Vertex source, Vertex url, Vertex paramsObject, Vertex xpath) {
Network network = source.getNetwork();
return postHTML(url.printString(), paramsObject, xpath.printString(), network);
}
/**
* Post the HTML forms params and return the HTML data from the URL.
*/
public Vertex postHTML(String url, Vertex paramsObject, String xpath, Network network) {
log("POST HTML", Level.INFO, url, xpath);
try {
Map<String, String> data = convertToMap(paramsObject);
log("POST params", Level.FINE, data);
String html = Utils.httpPOST(url, data);
InputStream stream = new ByteArrayInputStream(html.getBytes("utf-8"));
StringReader reader = convertToXHTML(stream);
Element element = parseXHTML(reader);
if (element == null) {
return null;
}
XPathFactory factory = XPathFactory.newInstance();
XPath path = factory.newXPath();
Object node = path.evaluate(xpath, element, XPathConstants.NODE);
if (node instanceof Element) {
return convertElement((Element)node, network);
} else if (node instanceof Attr) {
return network.createVertex(((Attr)node).getValue());
} else if (node instanceof org.w3c.dom.Text) {
return network.createVertex(((org.w3c.dom.Text)node).getTextContent());
}
return null;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Self API.
* Post the XML document object and return the XML data from the URL.
*/
public Vertex postXML(Vertex source, Vertex url, Vertex xmlObject, Vertex xpath) {
Network network = source.getNetwork();
return postXML(url.printString(), xmlObject, xpath.printString(), network);
}
/**
* Self API.
* Post the XML document object and return the XML data from the URL.
*/
public Vertex postXMLAuth(Vertex source, Vertex url, Vertex user, Vertex password, Vertex xmlObject, Vertex xpath) {
Network network = source.getNetwork();
return postXMLAuth(url.printString(), user.printString(), password.printString(), xmlObject, xpath.printString(), network);
}
/**
* Self API.
* Convert the object to a JSON string.
*/
public Vertex toJSON(Vertex source, Vertex jsonObject) {
try {
Network network = source.getNetwork();
String data = convertToJSON(jsonObject);
return network.createVertex(data);
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Self API.
* Convert the object to an XML string.
*/
public Vertex toXML(Vertex source, Vertex xmlObject) {
try {
Network network = source.getNetwork();
String data = convertToXML(xmlObject);
return network.createVertex(data);
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Self API.
* URL encode the string.
*/
public Vertex encode(Vertex source, Vertex text) {
return text.getNetwork().createVertex(org.botlibre.util.Utils.encodeURL(text.printString()));
}
/**
* Post the XML document object and return the XML data from the URL.
*/
public Vertex postXML(String url, Vertex xmlObject, String xpath, Network network) {
log("POST XML", Level.INFO, url, xpath);
try {
String data = convertToXML(xmlObject);
log("POST XML", Level.FINE, data);
String xml = Utils.httpPOST(url, "application/xml", data);
log("XML", Level.FINE, xml);
InputStream stream = new ByteArrayInputStream(xml.getBytes("utf-8"));
Element element = parseXML(stream);
if (element == null) {
return null;
}
XPathFactory factory = XPathFactory.newInstance();
XPath path = factory.newXPath();
Object node = path.evaluate(xpath, element, XPathConstants.NODE);
if (node instanceof Element) {
return convertElement((Element)node, network);
} else if (node instanceof Attr) {
return network.createVertex(((Attr)node).getValue());
} else if (node instanceof org.w3c.dom.Text) {
return network.createVertex(((org.w3c.dom.Text)node).getTextContent());
}
return null;
} catch (Exception exception) {
log(exception);
return null;
}
}
public String convertToXML(Vertex object) {
if (object.hasData()) {
return object.printString();
}
try {
StringWriter writer = new StringWriter();
Vertex root = object.getRelationship(Primitive.ROOT);
String elementName = "root";
if (root != null) {
elementName = root.printString();
}
convertToXML(object, elementName, writer, 0);
return writer.toString();
} catch (Exception exception) {
log(exception);
return null;
}
}
public String convertToJSON(Vertex object) {
try {
StringWriter writer = new StringWriter();
convertToJSON(object, writer, 0);
return writer.toString();
} catch (Exception exception) {
log(exception);
return null;
}
}
public static String printDate(Date date) {
Calendar calendar = Calendar.getInstance();
calendar.setTime(date);
StringWriter writer = new StringWriter();
writer.write(String.valueOf(calendar.get(Calendar.YEAR)));
writer.write("-");
writer.write(String.valueOf(calendar.get(Calendar.MONTH) + 1));
writer.write("-");
writer.write(String.valueOf(calendar.get(Calendar.DATE)));
writer.write("T");
writer.write(String.valueOf(calendar.get(Calendar.HOUR_OF_DAY)));
writer.write(":");
writer.write(String.valueOf(calendar.get(Calendar.MINUTE)));
writer.write(":");
writer.write(String.valueOf(calendar.get(Calendar.SECOND)));
long offset = calendar.getTimeZone().getOffset(calendar.getTimeInMillis());
int offsetHours = (int)(offset / Utils.HOUR);
int offsetMinutes = Math.abs((int)(offset / Utils.MINUTE) - (offsetHours * 60));
if (offsetHours > 0) {
writer.write("+");
} else {
writer.write("-");
}
if (offsetHours < 10) {
writer.write("0");
}
writer.write(String.valueOf(Math.abs(offsetHours)));
writer.write(":");
if (offsetMinutes < 10) {
writer.write("0");
}
writer.write(String.valueOf(offsetMinutes));
return writer.toString();
}
public void convertToJSON(Vertex object, Writer writer, int depth) throws Exception {
if (depth > 100) {
throw new BotException("Max JSON size exceeded");
}
if (object.hasData()) {
writer.write("\"");
if (object.getData() instanceof Timestamp) {
// Use JSON format.
writer.write(printDate((Timestamp)object.getData()));
} else {
writer.write(object.printString());
}
writer.write("\"");
return;
} else {
boolean first = true;
if (object.isArray()) {
writer.write("[");
for (Relationship relationship : object.orderedRelationships(Primitive.ELEMENT)) {
if (first) {
first = false;
} else {
writer.write(", ");
}
convertToJSON(relationship.getTarget(), writer, depth++);
}
writer.write("]");
} else {
writer.write("{");
for (Iterator<Relationship> iterator = object.orderedAllRelationships(); iterator.hasNext(); ) {
Relationship relationship = iterator.next();
if (relationship.isInverse()) {
continue;
}
String name = relationship.getType().getDataValue();
if (!name.equals("instantiation")) {
if (first) {
first = false;
} else {
writer.write(", ");
}
writer.write("\"");
writer.write(name);
writer.write("\":");
convertToJSON(relationship.getTarget(), writer, depth++);
}
}
writer.write("}");
}
}
}
public void convertToXML(Vertex object, String elementName, Writer writer, int depth) throws Exception {
if (depth > 100) {
throw new BotException("Max XML size exceeded");
}
writer.write("<");
writer.write(elementName);
for (Iterator<Relationship> iterator = object.orderedAllRelationships(); iterator.hasNext(); ) {
Relationship relationship = iterator.next();
if (relationship.isInverse()) {
continue;
}
String name = relationship.getType().getDataValue();
if (name.startsWith("@")) {
writer.write(" ");
writer.write(name.substring(1, name.length()));
writer.write("=\"");
writer.write(relationship.getTarget().printString());
writer.write("\"");
}
}
writer.write(">");
if (object.hasData()) {
writer.write(object.printString());
} else {
for (Iterator<Relationship> iterator = object.orderedAllRelationships(); iterator.hasNext(); ) {
Relationship relationship = iterator.next();
if (relationship.isInverse()) {
continue;
}
String name = relationship.getType().getDataValue();
if (!name.startsWith("@") && !name.equals("root") && !name.equals("instantiation")) {
convertToXML(relationship.getTarget(), name, writer, depth++);
}
}
}
writer.write("</");
writer.write(elementName);
writer.write(">");
}
public Map<String, String> convertToMap(Vertex object) {
Map<String, String> map = new HashMap<String, String>();
for (Iterator<Relationship> iterator = object.orderedAllRelationships(); iterator.hasNext(); ) {
Relationship relationship = iterator.next();
if (relationship.isInverse()) {
continue;
}
String name = relationship.getType().getDataValue();
if (!name.equals("instantiation")) {
map.put(name, relationship.getTarget().printString());
}
}
return map;
}
public Vertex convertElement(Element element, Network network) {
try {
if (element == null) {
return null;
}
NamedNodeMap attributes = element.getAttributes();
NodeList list = element.getChildNodes();
if (attributes.getLength() == 0 && list.getLength() == 0) {
return network.createVertex("");
}
if (list.getLength() == 1) {
Node child = list.item(0);
if (child.getNodeType() == Node.TEXT_NODE) {
return network.createVertex(child.getNodeValue());
}
}
Vertex root = network.createVertex();
for (int index = 0; index < attributes.getLength(); index++) {
Node attribute = attributes.item(index);
Primitive key = new Primitive(attribute.getNodeName());
Vertex value = network.createVertex(attribute.getNodeValue());
root.addRelationship(key, value);
}
for (int index = 0; index < list.getLength(); index++) {
Node child = list.item(index);
String name = child.getNodeName();
Primitive key = new Primitive(name);
Vertex value = null;
if (child.getNodeType() == Node.TEXT_NODE) {
value = network.createVertex(child.getNodeValue());
} else if (child instanceof Element) {
value = convertElement((Element)child, network);
} else if (child instanceof CDATASection) {
value = network.createVertex(((CDATASection)child).getNodeValue());
}
if (value != null) {
root.addRelationship(key, value);
}
}
return root;
} catch (Exception exception) {
log(exception);
return null;
}
}
@SuppressWarnings("rawtypes")
public Vertex convertElement(Object json, Network network) {
try {
if (json == null) {
return null;
}
Vertex object = null;
if (json instanceof JSONObject) {
object = network.createVertex();
for (Iterator iterator = ((JSONObject)json).keys(); iterator.hasNext(); ) {
String name = (String)iterator.next();
Object value = ((JSONObject)json).get(name);
if (value == null) {
continue;
}
Primitive key = new Primitive(name);
Vertex target = convertElement(value, network);
object.addRelationship(key, target);
}
} else if (json instanceof JSONArray) {
object = network.createInstance(Primitive.ARRAY);
JSONArray array = (JSONArray)json;
for (int index = 0; index < array.size(); index++) {
Vertex element = convertElement(array.get(index), network);
object.addRelationship(Primitive.ELEMENT, element, index);
}
} else {
object = network.createVertex(json);
}
return object;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Parse the XML as a DOM.
*/
public Element parseXMLURL(URL url) {
try {
InputStream stream = Utils.openStream(url);
return parseXML(stream);
} catch (FileNotFoundException notFound) {
log(notFound.toString(), Level.WARNING);
return null;
} catch (Exception ioException) {
if (getBot().isDebugFine()) {
log(ioException);
} else {
log(ioException.toString(), Bot.WARNING, url);
}
return null;
}
}
/**
* Self API.
* Return the top RSS feed.
*/
public Vertex rss(Vertex source, Vertex url) {
log("RSS", Level.INFO, url);
try {
Network network = source.getNetwork();
List<Map<String, Object>> result = parseRSSFeed(new URL(url.printString()), 0);
if (result == null) {
return null;
}
for (Map<String, Object> element : result) {
Vertex rss = network.createInstance(Primitive.RSS);
for (Entry<String, Object> entry : element.entrySet()) {
rss.addRelationship(new Primitive(entry.getKey()), network.createVertex(entry.getValue()));
}
return rss;
}
return null;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Self API.
* Return the entire RSS feed.
*/
public Vertex rssFeed(Vertex source, Vertex url) {
log("RSS feed", Level.INFO, url);
try {
Network network = source.getNetwork();
List<Map<String, Object>> result = parseRSSFeed(new URL(url.printString()), 0);
Vertex list = network.createInstance(Primitive.ARRAY);
int index = 0;
for (Map<String, Object> element : result) {
Vertex rss = network.createInstance(Primitive.RSS);
for (Entry<String, Object> entry : element.entrySet()) {
rss.addRelationship(new Primitive(entry.getKey()), network.createVertex(entry.getValue()));
}
list.addRelationship(Primitive.ELEMENT, rss, index);
index++;
}
return list;
} catch (Exception exception) {
log(exception);
return null;
}
}
/**
* Parse RSS feed.
*/
public List<Map<String, Object>> parseRSSFeed(URL url, long fromTime) {
try {
Element root = parseXMLURL(url);
List<Map<String, Object>> feed = new ArrayList<Map<String, Object>>();
if (root == null) {
return null;
}
NodeList list = root.getElementsByTagName("entry");
// There are several RSS feed formats.
if ((list != null) && (list.getLength() > 0)) {
// Blogger feed.
for (int index = 0; index < list.getLength(); index++) {
Element entry = (Element)list.item(index);
Map<String, Object> map = new HashMap<String, Object>(4);
NodeList children = entry.getElementsByTagName("published");
if ((children != null) && (children.getLength() > 0)) {
String date = children.item(0).getTextContent();
long time = System.currentTimeMillis();
try {
time = Utils.parseDate(date, "yyyy-MM-dd'T'HH:mm:ss.SSS").getTimeInMillis();
} catch (Exception exception) {
try {
time = Utils.parseDate(date, "yyyy-MM-dd'T'HH:mm:ssX").getTimeInMillis();
} catch (Exception exception2) {
try {
time = Utils.parseDate(date, "EEE, dd MMM yyyy HH:mm:ss X").getTimeInMillis();
} catch (Exception exception3) {
try {
time = Utils.parseDate(date, "EEE, dd MMM yyyy").getTimeInMillis();
} catch (Exception exception4) {
log(exception);
}
}
}
}
if (time <= fromTime) {
break;
}
map.put("published", time);
} else {
continue;
}
children = entry.getElementsByTagName("title");
if ((children != null) && (children.getLength() > 0)) {
map.put("title", children.item(0).getTextContent());
} else {
continue;
}
children = entry.getElementsByTagName("content");
if ((children != null) && (children.getLength() > 0)) {
map.put("content", children.item(0).getTextContent());
} else {
continue;
}
NodeList links = entry.getElementsByTagName("link");
for (int index2 = 0; index2 < links.getLength(); index2++) {
Element link = (Element)links.item(index2);
String rel = link.getAttribute("rel");
if ((rel != null) && rel.equals("alternate")) {
map.put("link", link.getAttribute("href"));
}
}
feed.add(map);
}
} else {
list = root.getElementsByTagName("channel");
if ((list != null) && (list.getLength() > 0)) {
list = ((Element)list.item(0)).getElementsByTagName("item");
if ((list != null) && (list.getLength() > 0)) {
// Standard feed.
for (int index = 0; index < list.getLength(); index++) {
Element entry = (Element)list.item(index);
Map<String, Object> map = new HashMap<String, Object>(3);
NodeList children = entry.getElementsByTagName("pubDate");
if ((children != null) && (children.getLength() > 0)) {
String date = children.item(0).getTextContent();
long time = System.currentTimeMillis();
try {
time = Utils.parseDate(date, "yyyy-MM-dd'T'HH:mm:ss.SSS").getTimeInMillis();
} catch (Exception exception) {
try {
time = Utils.parseDate(date, "yyyy-MM-dd'T'HH:mm:ssX").getTimeInMillis();
} catch (Exception exception2) {
try {
time = Utils.parseDate(date, "EEE, dd MMM yyyy HH:mm:ss X").getTimeInMillis();
} catch (Exception exception3) {
try {
time = Utils.parseDate(date, "EEE, dd MMM yyyy").getTimeInMillis();
} catch (Exception exception4) {
log(exception);
}
}
}
}
if (time <= fromTime) {
break;
}
map.put("published", time);
} else {
continue;
}
children = entry.getElementsByTagName("title");
if ((children != null) && (children.getLength() > 0)) {
map.put("title", children.item(0).getTextContent());
} else {
continue;
}
children = entry.getElementsByTagName("link");
if ((children != null) && (children.getLength() > 0)) {
map.put("link", children.item(0).getTextContent());
} else {
continue;
}
feed.add(map);
}
}
}
}
return feed;
} catch (Exception ioException) {
log(ioException.getMessage(), Level.WARNING, url);
return null;
}
}
/**
* Process the list of URLs as a batch using multi threading.
*/
public void input(Collection<URL> input) {
Queue<URL> urls = new ConcurrentLinkedQueue<URL>();
urls.addAll(input);
WorkerThread worker = new WorkerThread();
worker.urls = urls;
Thread threads[] = new Thread[WORKER_THREADS];
// Process a couple to avoid contention on common data.
Network memory = getBot().memory().newMemory();
URL url = urls.poll();
batchProcessURL(url, memory);
url = urls.poll();
batchProcessURL(url, memory);
for (int index = 0; index < WORKER_THREADS; index++) {
threads[index] = new Thread(worker);
threads[index].start();
}
boolean alive = true;
while (alive) {
// Check if all of the workers are done.
for (int index = 0; index < WORKER_THREADS; index++) {
if (threads[index].isAlive()) {
try {
Thread.sleep(10);
} catch (InterruptedException exception) {
log(exception);
}
break;
} else if (index == (WORKER_THREADS - 1)) {
alive = false;
}
}
}
}
/**
* Process the XHTML DOM.
* This should extract the useful context from the page.
* This should normally be overridden by a subclass to process a specific type of page, i.e. Wikipedia entry.
*/
public void processRoot(Node node, URL url, Network network) {
Vertex vertex = createURL(url, network);
processHeaders(node, vertex, network);
}
/**
* Process the header nodes and associate their topics with the URL.
*/
public void processHeaders(Node node, Vertex url, Network network) {
// Find the main header, and associate the url with it.
Set<String> headers = new HashSet<String>(3);
headers.add("h1");
headers.add("h2");
headers.add("h3");
headers.add("h4");
Node header = findTag(headers, null, node);
Vertex urlType = network.createVertex(Primitive.URL);
Vertex topicType = network.createVertex(Primitive.TOPIC);
Vertex instantiationType = network.createVertex(Primitive.INSTANTIATION);
Vertex sentenceType = network.createVertex(Primitive.SENTENCE);
Vertex contentType = network.createVertex(Primitive.CONTENT);
Vertex h1 = null;
Vertex h2 = null;
while (header != null) {
log("Header", Bot.FINE, header);
Vertex topic = network.createVertex();
topic.addRelationship(instantiationType, topicType);
Vertex sentence = getSentence(header, network);
topic.setName(sentence.getDataValue());
log("Topic", Bot.FINE, sentence);
topic.addRelationship(sentenceType, sentence);
sentence.addRelationship(topicType, topic);
if (header.getNodeName().equals("h1")) {
topic.addRelationship(urlType, url);
url.addRelationship(topicType, topic);
h1 = topic;
} else if (header.getNodeName().equals("h2")) {
if (h1 != null) {
h1.addRelationship(contentType, topic);
topic.addRelationship(topicType, h1);
}
h2 = topic;
} else if (header.getNodeName().equals("h3")) {
if (h2 != null) {
h2.addRelationship(contentType, topic);
topic.addRelationship(topicType, h2);
} else if (h1 != null) {
h1.addRelationship(contentType, topic);
topic.addRelationship(topicType, h1);
}
}
// Need to walk back up to parent if no more siblings.
header = findNextTag(headers, null, header, node);
}
network.save();
getBot().memory().addActiveMemory(url);
}
/**
* Return the next sibling or parent sibling node.
* Only walk up to root at most.
*/
public Node nextNode(Node node, Node root) {
if (node == null) {
return null;
}
// Need to walk back up to parent if no more siblings.
Node nextNode = node.getNextSibling();
Node parent = node.getParentNode();
while ((nextNode == null) && (parent != null)) {
if (parent == root) {
return null;
}
nextNode = parent.getNextSibling();
parent = parent.getParentNode();
}
return nextNode;
}
/**
* Find the next node for the tag, search children, siblings and cousins.
* Only walk up to root at most.
*/
public Node findNextTag(Set<String> tags, String value, Node node, Node root) {
Node header = node;
Node nextNode = nextNode(header, root);
header = findTag(tags, null, nextNode);
while ((header == null) && (nextNode != null)) {
nextNode = nextNode(nextNode, root);
header = findTag(tags, null, nextNode);
}
return header;
}
/**
* Find the next node for the tag.
*/
public Node findTag(String tag, Node node) {
return findTag(tag, null, node);
}
/**
* Find the next node for the tag.
*/
public Node findTag(String tag, String value, Node node) {
Set<String> tags = new HashSet<String>(1);
tags.add(tag);
return findTag(tags, value, node);
}
/**
* Find the next node for any of the tags whose text contains the value.
*/
public Node findTag(Set<String> tags, String value, Node node) {
if (node == null) {
return null;
}
Node nextNode = node;
while (!tags.contains(nextNode.getNodeName())
|| ((value != null) && (nextNode.getTextContent().indexOf(value) == -1))) {
NodeList nodes = nextNode.getChildNodes();
// Only need to process first child, as it will process siblings.
if (nodes.getLength() > 0) {
Node child = findTag(tags, value, nodes.item(0));
if (child != null) {
return child;
}
}
nextNode = nextNode.getNextSibling();
if (nextNode == null) {
return null;
}
}
return nextNode;
}
/**
* Return the complete node text.
*/
public String getText(Node node) {
String text = "";
NodeList nodes = node.getChildNodes();
for (int index = 0; index < nodes.getLength(); index++) {
Node child = nodes.item(index);
text = text + child.getTextContent();
}
return text.trim();
}
/**
* Return a sentence of all the words, or a word is a single word.
*/
public Vertex getSentence(Node node, Network network) {
String text = getText(node);
return network.createSentence(text);
}
public String stripBrackets(String text) {
TextStream stream = new TextStream(text);
text = stream.upTo('(').trim();
while (text.isEmpty() && !stream.atEnd()) {
stream.skipTo(')');
if (stream.peek() == ':') {
stream.skip();
}
text = stream.upTo('(').trim();
}
return text;
}
/**
* Parse the text values from the next paragrpah.
*/
public String getNextParagraph(Node node) {
Node p = findTag("p", node);
if (p == null) {
return "";
}
return p.getTextContent();
}
/**
* Parse the text values from the next bullet list.
*/
public List<String> getNextBulletList(Node node) {
List<String> words = new ArrayList<String>();
Node ul = findTag("ul", node);
if (ul != null) {
NodeList nodes = ul.getChildNodes();
for (int index = 0; index < nodes.getLength(); index++) {
Node child = nodes.item(index);
if (child.getNodeName().equals("li")) {
String text = child.getTextContent().trim();
words.add(text);
}
}
}
return words;
}
/**
* Parse the text values from the next numbered list.
*/
public List<String> getNextNumberedList(Node node) {
List<String> words = new ArrayList<String>();
Node ul = findTag("ol", node);
if (ul != null) {
NodeList nodes = ul.getChildNodes();
for (int index = 0; index < nodes.getLength(); index++) {
Node child = nodes.item(index);
if (child.getNodeName().equals("li")) {
String text = child.getTextContent().trim();
words.add(text);
}
}
}
return words;
}
/**
* Parse the text values from the next bullet list.
*/
public List<String> getAllBullets(Node node) {
List<String> words = new ArrayList<String>();
Set<String> tags = new HashSet<String>(1);
tags.add("ul");
Node ul = findNextTag(tags, null, node, node.getParentNode());
while (ul != null) {
NodeList nodes = ul.getChildNodes();
for (int index = 0; index < nodes.getLength(); index++) {
Node child = nodes.item(index);
String text = child.getTextContent().trim();
words.add(text);
}
ul = findNextTag(tags, null, ul, node.getParentNode());
}
return words;
}
/**
* Parse the text values from the next bullet list.
*/
public List<String> getAllURLBullets(Node node) {
List<String> urls = new ArrayList<String>();
Set<String> tags = new HashSet<String>(1);
tags.add("ul");
Node ul = findNextTag(tags, null, node, node.getParentNode());
while (ul != null) {
NodeList nodes = ul.getChildNodes();
for (int index = 0; index < nodes.getLength(); index++) {
Node child = nodes.item(index);
NodeList chilren = child.getChildNodes();
for (int childIndex = 0; childIndex < chilren.getLength(); childIndex++) {
Node url = chilren.item(childIndex);
if (url.getNodeName().equals("a")) {
urls.add(url.getAttributes().getNamedItem("href").getTextContent());
}
}
}
ul = findNextTag(tags, null, ul, node.getParentNode());
}
return urls;
}
/**
* Create the URL vertex.
*/
protected Vertex createURL(URL url, Network network) {
try {
Vertex vertex = network.createVertex(url.toURI());
vertex.addRelationship(Primitive.INSTANTIATION, Primitive.URL);
return vertex;
} catch (URISyntaxException exception) {
throw new RuntimeException(exception);
}
}
/**
* Post, process the post request.
*/
@Override
public void output(Vertex output) {
}
/**
* Return the map of registered domain processing senses,
* keyed by their URL domains they accept.
*/
public Map<String, Http> getDomains() {
return domains;
}
}