// htmlFilterOutputStream.java // --------------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de // Frankfurt, Germany, 2004, 2005 // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // This program is free software; you can redistribute it and/or modify // it under the terms of the GNU General Public License as published by // the Free Software Foundation; either version 2 of the License, or // (at your option) any later version. // // This program is distributed in the hope that it will be useful, // but WITHOUT ANY WARRANTY; without even the implied warranty of // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the // GNU General Public License for more details. // // You should have received a copy of the GNU General Public License // along with this program; if not, write to the Free Software // Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA /* This class implements an output stream. Any data written to that output is automatically parsed. After finishing with writing, the htmlFilter can be read out. */ package net.yacy.document.parser.html; import java.io.IOException; import java.io.OutputStream; import java.io.OutputStreamWriter; import java.io.Writer; import java.nio.charset.Charset; import java.util.Enumeration; import java.util.Properties; import java.util.Stack; import net.yacy.document.parser.html.ContentScraper.TagName; import net.yacy.kelondro.io.CharBuffer; public final class TransformerWriter extends Writer { public static final char lb = '<'; public static final char rb = '>'; public static final char dash = '-'; public static final char excl = '!'; public static final char singlequote = '\''; public static final char doublequote = '"'; private final OutputStream outStream; private OutputStreamWriter out; private CharBuffer buffer; private Stack<ContentScraper.Tag> tagStack; private final Scraper scraper; private final Transformer transformer; private boolean inSingleQuote; private boolean inDoubleQuote; private boolean inComment; private boolean binaryUnsuspect; private final boolean passbyIfBinarySuspect; public TransformerWriter( final OutputStream outStream, final Charset charSet, final Scraper scraper, final Transformer transformer, final boolean passbyIfBinarySuspect ) { this(outStream, charSet, scraper, transformer, passbyIfBinarySuspect, 64); } public TransformerWriter( final OutputStream outStream, final Charset charSet, final Scraper scraper, final Transformer transformer, final boolean passbyIfBinarySuspect, final int initialBufferSize ) { this.outStream = outStream; this.scraper = scraper; this.transformer = transformer; this.buffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, initialBufferSize); this.tagStack = new Stack<ContentScraper.Tag>(); this.inSingleQuote = false; this.inDoubleQuote = false; this.inComment = false; this.binaryUnsuspect = true; this.passbyIfBinarySuspect = passbyIfBinarySuspect; if (this.outStream != null) { this.out = new OutputStreamWriter(this.outStream,(charSet == null)?Charset.defaultCharset():charSet); } } public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) { final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + tagopts.length + 3); bb.append('<'); if (!opening) { bb.append('/'); } bb.append(tagname); if (tagopts.length > 0) { // if (tagopts[0] == (byte) 32) bb.append(tagopts); // else bb.append((byte) 32).append(tagopts); } bb.append('>'); final char[] result = bb.getChars(); bb.close(); return result; } public static char[] genTag1raw(final String tagname, final char[] tagopts, final char[] text) { final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, 2 * tagname.length() + tagopts.length + text.length + 5); bb.append('<').append(tagname); if (tagopts.length > 0) { // if (tagopts[0] == (byte) 32) bb.append(tagopts); // else bb.append((byte) 32).append(tagopts); } bb.append('>'); bb.append(text); bb.append('<').append('/').append(tagname).append('>'); final char[] result = bb.getChars(); bb.close(); return result; } public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) { final char[] tagoptsx = (tagopts.isEmpty()) ? null : genOpts(tagopts, quotechar); final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2); bb.append('<').append(tagname); if (tagoptsx != null) { bb.appendSpace(); bb.append(tagoptsx); } bb.append('>'); final char[] result = bb.getChars(); bb.close(); return result; } public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) { final char[] gt0 = genTag0(tagname, tagopts, quotechar); final CharBuffer cb = new CharBuffer(ContentScraper.MAX_DOCSIZE, gt0, gt0.length + text.length + tagname.length() + 3); cb.append(text).append('<').append('/').append(tagname).append('>'); final char[] result = cb.getChars(); cb.close(); return result; } // a helper method for pretty-printing of properties for html tags public static char[] genOpts(final Properties prop, final char quotechar) { final Enumeration<?> e = prop.propertyNames(); final CharBuffer bb = new CharBuffer(ContentScraper.MAX_DOCSIZE, prop.size() * 40); String key; while (e.hasMoreElements()) { key = (String) e.nextElement(); bb.appendSpace().append(key).append('=').append(quotechar); bb.append(prop.getProperty(key)); bb.append(quotechar); } final char[] result; if (bb.length() > 0) result = bb.getChars(1); else result = bb.getChars(); bb.close(); return result; } /** * the token processor distinguishes three different types of input: opening tag, closing tag, text content * @param in - the token to be processed * @param quotechar * @return a processed version of the token */ private char[] tokenProcessor(final char[] in, final char quotechar) { if (in.length == 0) return in; // scan the string and parse structure if (in.length <= 2 || in[0] != lb) return filterTag(in); // this is a text // this is a tag String tag; int tagend; if (in[1] == '/') { // a closing tag tagend = tagEnd(in, 2); tag = new String(in, 2, tagend - 2).toLowerCase(); final char[] text = new char[in.length - tagend - 1]; System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); return filterTag(text, quotechar, tag, false); } // don't add text from within <script> section, here e.g. a "if 1<a" expression could confuse tag detection if (this.tagStack.size()>0 && this.tagStack.lastElement().name.equals(TagName.script.name())) { return new char[0]; } // an opening tag tagend = tagEnd(in, 1); tag = new String(in, 1, tagend - 1).toLowerCase(); final char[] text = new char[in.length - tagend - 1]; System.arraycopy(in, tagend, text, 0, in.length - tagend - 1); return filterTag(text, quotechar, tag, true); } // distinguish the following cases: // - (1) not collecting data for a tag and getting no tag (not opener and not close) // - (2) not collecting data for a tag and getting a tag opener // - (3) not collecting data for a tag and getting a tag close // - (4) collecting data for a tag and getting no tag (not opener and not close) // - (5) collecting data for a tag and getting a new/different tag opener without closing the previous tag // - (6) collecting data for a tag and getting a tag close for the wrong tag (a different than the opener) // - (7) collecting data for a tag and getting the correct close tag for that collecting tag /** * * @param content * @return content or empty array */ private char[] filterTag(final char[] content) { if (this.tagStack.size() == 0) { // we are not collection tag text -> case (1) - (3) // case (1): this is not a tag opener/closer if (this.scraper != null && content.length > 0) this.scraper.scrapeText(content, null); if (this.transformer != null) return this.transformer.transformText(content); return content; } // we are collection tag text for the tag 'filterTag' -> case (4) - (7) // case (4): getting no tag, go on collecting content if (this.scraper != null) { this.scraper.scrapeText(content, this.tagStack.lastElement().name); } if (this.transformer != null) { this.tagStack.lastElement().content.append(this.transformer.transformText(content)); } else { this.tagStack.lastElement().content.append(content); } return new char[0]; } private char[] filterTag(final char[] content, final char quotechar, final String tagname, final boolean opening) { assert tagname != null; if (this.tagStack.size() == 0) { // we are not collection tag text -> case (1) - (3) // we have a new tag if (opening) { // case (2): return filterTagOpening(tagname, content, quotechar); } // its a close tag where no should be // case (3): we ignore that thing and return it again return genTag0raw(tagname, false, content); } // we are collection tag text for the tag 'filterTag' -> case (4) - (7) if (tagname.equals("!")) filterTag(content); // it's a tag! which one? if (opening) { // case (5): the opening should not be here. But we keep the order anyway this.tagStack.lastElement().content.append(filterTagOpening(tagname, content, quotechar)); return new char[0]; } if (!tagname.equalsIgnoreCase(this.tagStack.lastElement().name)) { // case (6): its a closing tag, but the wrong one. just add it. this.tagStack.lastElement().content.append(genTag0raw(tagname, opening, content)); return new char[0]; } // it's our closing tag! return complete result. return filterTagCloseing(quotechar); } private char[] filterTagOpening(final String tagname, final char[] content, final char quotechar) { final CharBuffer charBuffer = new CharBuffer(ContentScraper.MAX_DOCSIZE, content); ContentScraper.Tag tag = new ContentScraper.Tag(tagname, charBuffer.propParser()); charBuffer.close(); if (this.scraper != null && this.scraper.isTag0(tagname)) { // this single tag is collected at once here this.scraper.scrapeTag0(tag); } if (this.transformer != null && this.transformer.isTag0(tagname)) { // this single tag is collected at once here char[] b = this.transformer.transformTag0(tag, quotechar); return b; } else if ((this.scraper != null && this.scraper.isTag1(tagname)) || (this.transformer != null && this.transformer.isTag1(tagname))) { // ok, start collecting; we don't push this here to the scraper or transformer; we do that when the tag is closed. this.tagStack.push(tag); return new char[0]; } else { // we ignore that thing and return it again return genTag0raw(tagname, true, content); } } private char[] filterTagCloseing(final char quotechar) { char[] ret; ContentScraper.Tag tag = this.tagStack.lastElement(); if (this.scraper != null) this.scraper.scrapeTag1(tag); if (this.transformer != null) { ret = this.transformer.transformTag1(tag, quotechar); } else { ret = genTag1(tag.name, tag.opts, tag.content.getChars(), quotechar); } if ((this.scraper != null && this.scraper.isTag1(tag.name)) || (this.transformer != null && this.transformer.isTag1(tag.name))) { // remove the tag from the stack as soon as the tag is processed this.tagStack.pop(); // at this point the characters from the recently processed tag must be attached to the previous tag if (this.tagStack.size() > 0) this.tagStack.lastElement().content.append(ret); } return ret; } private char[] filterFinalize(final char quotechar) { if (this.tagStack.size() == 0) { return new char[0]; } // it's our closing tag! return complete result. char[] ret; if (this.scraper != null) this.scraper.scrapeTag1(this.tagStack.lastElement()); if (this.transformer != null) { ret = this.transformer.transformTag1(this.tagStack.lastElement(), quotechar); } else { ret = genTag1(this.tagStack.lastElement().name, this.tagStack.lastElement().opts, this.tagStack.lastElement().content.getChars(), quotechar); } this.tagStack.pop(); return ret; } private static int tagEnd(final char[] tag, final int start) { char c; for (int i = start; i < tag.length; i++) { c = tag[i]; if (c != '!' && c != '-' && (c < '0' || c > '9') && (c < 'a' || c > 'z') && (c < 'A' || c > 'Z') ) return i; } return tag.length - 1; } /** * this is the tokenizer of the parser: it splits the input into pieces which are * - quoted text parts * - commented text parts * - tags (opening and closing) * - text content between all these parts * The tokens are then parsed with the filterSentence method */ @Override public void write(final int c) throws IOException { //System.out.println((char) c); if ((this.binaryUnsuspect) && (binaryHint((char)c))) { this.binaryUnsuspect = false; if (this.passbyIfBinarySuspect) close(); } if (this.binaryUnsuspect || !this.passbyIfBinarySuspect) { char[] filtered; if (this.inSingleQuote) { this.buffer.append(c); if (c == singlequote) this.inSingleQuote = false; // check error cases if ((c == rb) && (this.buffer.length() > 0 && this.buffer.charAt(0) == lb)) { this.inSingleQuote = false; // the tag ends here. after filtering: pass on filtered = tokenProcessor(this.buffer.getChars(), singlequote); if (this.out != null) { this.out.write(filtered); } // this.buffer = new serverByteBuffer(); this.buffer.reset(); } } else if (this.inDoubleQuote) { this.buffer.append(c); if (c == doublequote) this.inDoubleQuote = false; // check error cases if (c == rb && this.buffer.length() > 0 && this.buffer.charAt(0) == lb) { this.inDoubleQuote = false; // the tag ends here. after filtering: pass on filtered = tokenProcessor(this.buffer.getChars(), doublequote); if (this.out != null) this.out.write(filtered); // this.buffer = new serverByteBuffer(); this.buffer.reset(); } } else if (this.inComment) { this.buffer.append(c); if (c == rb && this.buffer.length() > 6 && this.buffer.charAt(this.buffer.length() - 3) == dash) { // comment is at end this.inComment = false; final char[] comment = this.buffer.getChars(); if (this.scraper != null) this.scraper.scrapeComment(comment); if (this.out != null) this.out.write(comment); // this.buffer = new serverByteBuffer(); this.buffer.reset(); } } else { if (this.buffer.isEmpty()) { if (c == rb) { // very strange error case; we just let it pass if (this.out != null) this.out.write(c); } else { this.buffer.append(c); } } else if (this.buffer.length() > 0 && this.buffer.charAt(0) == lb) { if (c == singlequote) this.inSingleQuote = true; if (c == doublequote) this.inDoubleQuote = true; // fill in tag text if ((this.buffer.length() >= 3) && (this.buffer.charAt(1) == excl) && (this.buffer.charAt(2) == dash) && (c == dash)) { // this is the start of a comment this.inComment = true; this.buffer.append(c); } else if (c == rb) { this.buffer.append(c); // the tag ends here. after filtering: pass on filtered = tokenProcessor(this.buffer.getChars(), doublequote); if (this.out != null) this.out.write(filtered); // this.buffer = new serverByteBuffer(); this.buffer.reset(); } else if (c == lb) { // this is an error case // we consider that there is one rb missing if (this.buffer.length() > 0) { filtered = tokenProcessor(this.buffer.getChars(), doublequote); if (this.out != null) this.out.write(filtered); } // this.buffer = new serverByteBuffer(); this.buffer.reset(); this.buffer.append(c); } else { this.buffer.append(c); } } else { // fill in plain text if (c == lb) { // the text ends here if (this.buffer.length() > 0) { filtered = tokenProcessor(this.buffer.getChars(), doublequote); if (this.out != null) this.out.write(filtered); } // this.buffer = new serverByteBuffer(); this.buffer.reset(); this.buffer.append(c); } else { // simply append this.buffer.append(c); } } } } else { this.out.write(c); } } @Override public void write(final char b[]) throws IOException { write(b, 0, b.length); } @Override public void write(final char b[], final int off, final int len) throws IOException { // System.out.println(UTF8.String(b, off, len)); if ((off | len | (b.length - (len + off)) | (off + len)) < 0) throw new IndexOutOfBoundsException(); for (int i = off ; i < (len - off) ; i++) this.write(b[i]); } @Override public void flush() throws IOException { // we cannot flush the current string this.buffer to prevent that // the filter process is messed up // instead, we simply flush the underlying output stream if (this.out != null) this.out.flush(); if (this.scraper != null) this.scraper.finish(); // if you want to flush all, call close() at end of writing; } @Override public void close() throws IOException { flush(); final char quotechar = (this.inSingleQuote) ? singlequote : doublequote; if (this.buffer != null) { if (this.buffer.length() > 0) { final char[] filtered = tokenProcessor(this.buffer.getChars(), quotechar); if (this.out != null) this.out.write(filtered); } this.buffer.close(); this.buffer = null; } final char[] finalized = filterFinalize(quotechar); if (this.out != null) { if (finalized != null) this.out.write(finalized); this.out.flush(); this.out.close(); } this.tagStack.clear(); this.tagStack = null; if (this.scraper != null) this.scraper.finish(); } private static boolean binaryHint(final char c) { // space, punctiation and symbols, letters and digits (ASCII/latin) //if (c >= 31 && c < 128) return false; if(c > 31) return false; // 8 = backspace // 9 = horizontal tab // 10 = new line (line feed) // 11 = vertical tab // 12 = new page (form feed) // 13 = carriage return if (c > 7 && c <= 13) return false; //if (Character.isLetterOrDigit(c)) return false; // return false; // System.err.println("BINARY HINT: " + (int) c); return true; } public boolean binarySuspect() { return !this.binaryUnsuspect; } }