/**
*
*/
package net.yacy.document.parser.rdfa.impl;
import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashSet;
import java.util.Set;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.rdfa.IRDFaTriple;
/**
 * Parser that runs the regular {@link htmlParser} on a source and, for a
 * hard-coded selection of URLs, additionally extracts RDFa triples from the
 * same source and appends them as one extra {@link Document}.
 *
 * @author fgandon
 *
 */
public class RDFaParser extends AbstractParser implements Parser {

    /** Application name used for all log entries written by this parser. */
    private static final String LOG_NAME = "RDFA PARSER";

    /** HTML parser that the plain HTML part of the work is delegated to. */
    private final htmlParser hp;

    /**
     * Creates the parser and registers the file extensions and mime types it
     * declares support for.
     */
    public RDFaParser() {
        super("RDFa Parser");
        this.hp = new htmlParser();

        this.SUPPORTED_EXTENSIONS.add("html");
        this.SUPPORTED_EXTENSIONS.add("htm");
        this.SUPPORTED_EXTENSIONS.add("xhtml");
        this.SUPPORTED_EXTENSIONS.add("php");

        this.SUPPORTED_MIME_TYPES.add("text/html");
        this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
    }

    /**
     * Parses the source with the HTML parser and, when the normalized URL
     * contains ".yacy" or "experiments", appends one more document built from
     * the RDFa triples found in the same source.
     *
     * The source has to be read twice in the RDFa case. It is therefore marked
     * before the HTML pass (and wrapped in a buffering stream when it does not
     * support mark/reset itself) and rewound afterwards. The stream is never
     * closed here; it stays owned by the caller.
     *
     * @param url            location of the source
     * @param mimeType       mime type handed through to the HTML parser and the documents
     * @param charset        charset name handed through to the HTML parser; also used to
     *                       decode the source for RDFa extraction when it is supported
     * @param scraper        handed through to the HTML parser
     * @param timezoneOffset handed through to the HTML parser
     * @param source         stream with the content to parse
     * @return the documents of the HTML parser, followed by the RDFa document
     *         when RDFa extraction applies to the URL
     * @throws Failure              when the HTML parser fails
     * @throws InterruptedException when the HTML parser is interrupted
     */
    @Override
    public Document[] parse(
            final DigestURL url,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source) throws Failure,
            InterruptedException {

        // TODO: current hardcoded restriction: apply rdfa parser only on selected sources.
        final String normalform = url.toNormalform(true);
        final boolean applyRdfa = normalform.contains(".yacy") || normalform.contains("experiments");
        if (!applyRdfa) {
            return parseHtml(url, mimeType, charset, scraper, timezoneOffset, source);
        }

        // Two passes over the same bytes are needed: make sure the stream can be rewound.
        final InputStream replayable = source.markSupported() ? source : new BufferedInputStream(source);
        replayable.mark(Integer.MAX_VALUE);

        final Document[] htmlDocs = parseHtml(url, mimeType, charset, scraper, timezoneOffset, replayable);

        try {
            replayable.reset();
        } catch (final IOException e) {
            // Best effort: the RDFa pass below will most likely find nothing to read.
            ConcurrentLog.warn(LOG_NAME, "Could not rewind source for RDFa extraction: " + e.getMessage());
        }
        final Document rdfaDoc = parseRDFa(url, mimeType, charset, replayable);

        // Guard against a null result of the HTML parser instead of failing on .length
        final int htmlCount = htmlDocs == null ? 0 : htmlDocs.length;
        final Document[] retDocs = new Document[htmlCount + 1];
        if (htmlCount > 0) {
            System.arraycopy(htmlDocs, 0, retDocs, 0, htmlCount);
        }
        retDocs[htmlCount] = rdfaDoc;
        return retDocs;
    }

    /**
     * Extracts RDFa triples from the source and converts them into a document.
     * Failures are logged and never propagated; in that case, and when no
     * triple is found, an empty placeholder document is returned.
     *
     * @param url      location of the source, also given to the triple extractor as a string
     * @param mimeType mime type stored in the resulting document
     * @param charset  charset name stored in the resulting document and used for decoding
     * @param source   stream positioned at the start of the content
     * @return the document built from the triples, or the empty placeholder document
     */
    private static Document parseRDFa(DigestURL url, String mimeType,
            String charset, InputStream source) {
        IRDFaTriple[] allTriples = null;
        try {
            final RDFaTripleImpl triple = new RDFaTripleImpl(openReader(source, charset), url.toString());
            allTriples = triple.parse();
        } catch (final Exception e) {
            ConcurrentLog.warn(LOG_NAME, "Triple extraction failed: " + e);
        }

        Document doc = new Document(url, mimeType, charset, null, null, null, singleList(""), "",
                "", null, new ArrayList<String>(0), 0, 0, null, null, null, null, false, new Date());

        // Explicit check: a failed extraction leaves allTriples null and is already logged above.
        if (allTriples == null || allTriples.length == 0) {
            return doc;
        }

        try {
            doc = convertAllTriplesToDocument(url, mimeType, charset, allTriples);
        } catch (final Exception e) {
            ConcurrentLog.warn(LOG_NAME, "Conversion triple to document failed: " + e);
        }
        return doc;
    }

    /**
     * Opens a reader on the source using the given charset name when it is
     * present and supported, and the platform default charset otherwise.
     *
     * @param source  stream to decode
     * @param charset charset name, may be null or empty
     * @return a reader over the source
     */
    private static Reader openReader(final InputStream source, final String charset) {
        if (charset != null && charset.length() > 0) {
            try {
                return new InputStreamReader(source, charset);
            } catch (final IOException e) {
                // UnsupportedEncodingException: fall through to the platform default below
                ConcurrentLog.warn(LOG_NAME, "Unsupported charset '" + charset + "', using platform default");
            }
        }
        return new InputStreamReader(source);
    }

    /**
     * Runs the wrapped HTML parser on the source.
     *
     * @return whatever the HTML parser returns
     * @throws Failure              when the HTML parser fails
     * @throws InterruptedException when the HTML parser is interrupted
     */
    private Document[] parseHtml(
            final DigestURL url,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source) throws Failure,
            InterruptedException {
        return this.hp.parse(url, mimeType, charset, scraper, timezoneOffset, source);
    }

    /**
     * Builds a document from the given triples. For every triple the string
     * {@code propertyURI + "Z" + value} is collected into a set (so duplicates
     * collapse); each entry is then encoded by replacing ':' with 'X' and
     * '_', ' ', '.', ',' with 'Y', and all encoded entries are joined with a
     * trailing ',' each. The joined string is handed to the Document constructor.
     *
     * @param url        location stored in the document
     * @param mimeType   mime type stored in the document
     * @param charset    charset name stored in the document
     * @param allTriples triples to convert, must not be null
     * @return the document carrying the encoded triples
     */
    private static Document convertAllTriplesToDocument(DigestURL url,
            String mimeType, String charset, IRDFaTriple[] allTriples) {

        final Set<String> keywords = new HashSet<String>(allTriples.length);
        for (final IRDFaTriple triple : allTriples) {
            addNotEmptyValuesToSet(keywords, triple.getPropertyURI() + "Z" + triple.getValue());
        }

        final StringBuilder all = new StringBuilder();
        for (final String keyword : keywords) {
            final String encoded = keyword
                    .replace(':', 'X')
                    .replace('_', 'Y')
                    .replace(' ', 'Y')
                    .replace('.', 'Y')
                    .replace(',', 'Y');
            all.append(encoded).append(',');
        }

        return new Document(url, mimeType, charset, null, null, null, singleList(""), "",
                "", null, new ArrayList<String>(0), 0, 0, all.toString(), null, null, null, false, new Date());
    }

    /**
     * Adds the value to the set unless it is null. Note that, despite the
     * method name, empty strings are added as well.
     *
     * @param set   target set
     * @param value value to add, ignored when null
     */
    private static void addNotEmptyValuesToSet(Set<String> set, String value) {
        if (value != null) {
            set.add(value);
        }
    }

    /**
     * Command line entry point: parses the file or URL given as first argument.
     * An argument naming an existing file is read from that file; anything
     * else is interpreted as a URL.
     *
     * @param args first element is a file path or a URL
     */
    public static void main(String[] args) {
        if (args.length < 1) {
            System.out.println("Usage: one and only one argument giving a file path or a URL.");
            return;
        }

        InputStream aStream = null;
        try {
            final File aFile = new File(args[0]);
            // An existing file is opened through its file: URL so that both cases share one code path.
            final URL aURL = aFile.exists() ? aFile.toURI().toURL() : new URL(args[0]);
            aStream = aURL.openStream();
        } catch (final MalformedURLException e) {
            // neither an existing file nor a well-formed URL: reported below
        } catch (final IOException e) {
            e.printStackTrace();
        }

        if (aStream == null) {
            System.out.println("File or URL not recognized.");
            return;
        }

        try {
            final RDFaParser aParser = new RDFaParser();
            aParser.parse(new DigestURL(args[0]), "", "", new VocabularyScraper(), 0, aStream);
        } catch (final IOException e) {
            e.printStackTrace();
        } catch (final Failure e) {
            e.printStackTrace();
        } catch (final InterruptedException e) {
            Thread.currentThread().interrupt();
            e.printStackTrace();
        } finally {
            try {
                aStream.close();
            } catch (final IOException e) {
                // nothing useful left to do when closing fails right before exit
            }
        }
    }
}