package eu.dnetlib.iis.wf.ingest.html;

import java.io.IOException;

import org.apache.avro.mapred.AvroKey;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.safety.Whitelist;

import eu.dnetlib.iis.metadataextraction.schemas.DocumentText;

/**
 * Mapper ingesting plain text from HTML documents.
 * <p>
 * Each input {@link DocumentText} record is expected to carry HTML in its text field. The mapper
 * emits a new {@link DocumentText} with the same identifier and the text with all markup removed.
 * Records whose text cannot be processed are logged and skipped; failures of the output writer
 * are propagated to the framework.
 *
 * @author mhorst
 */
public class HtmlToPlaintextIngester extends Mapper<AvroKey<DocumentText>, NullWritable, AvroKey<DocumentText>, NullWritable> {

    private static final Logger log = Logger.getLogger(HtmlToPlaintextIngester.class);

    /** Pretty-printing is switched off so that jsoup does not re-indent the output (preserving newlines). */
    private static final Document.OutputSettings outputSettings = new Document.OutputSettings().prettyPrint(false);

    /** Textual form in which jsoup serializes a non-breaking space under its default escape mode. */
    private static final String NBSP_ENTITY = "&nbsp;";

    /**
     * Strips markup from the text of a single document and writes the result to the output.
     *
     * @param key record holding the document identifier and its HTML text
     * @param value unused
     * @param context job context receiving the plain text record
     * @throws IOException when writing the output record fails
     * @throws InterruptedException when writing the output record is interrupted
     */
    @Override
    protected void map(AvroKey<DocumentText> key, NullWritable value, Context context)
            throws IOException, InterruptedException {
        final DocumentText htmlText = key.datum();
        final DocumentText.Builder output = DocumentText.newBuilder();
        output.setId(htmlText.getId());

        final DocumentText plainText;
        try {
            // Best effort: a record with missing or unparsable text is reported and skipped,
            // it must not fail the whole task.
            output.setText(cleanNoMarkup(htmlText.getText().toString()));
            plainText = output.build();
        } catch (RuntimeException e) {
            log.error("exception thrown when trying to extract text representation "
                    + "from html document identified with: " + htmlText.getId(), e);
            return;
        }

        // Deliberately outside of the try block: an I/O failure or interruption while writing is
        // not an extraction problem and has to reach the framework instead of being swallowed.
        context.write(new AvroKey<DocumentText>(plainText), NullWritable.get());
    }

    /**
     * Removes all HTML markup from the given input.
     *
     * @param input HTML content
     * @return input with every tag removed and {@code &nbsp;} entities dropped
     */
    private static String cleanNoMarkup(String input) {
        final String withoutTags = Jsoup.clean(input, "", Whitelist.none(), outputSettings);
        // Non-breaking spaces come back from jsoup as the literal "&nbsp;" entity, so the entity
        // text itself is what has to be removed; ordinary spaces are left untouched.
        return withoutTags.replace(NBSP_ENTITY, "");
    }
}