// AbstractScraper.java // --------------------------- // (C) by Michael Peter Christen; mc@yacy.net // first published on http://www.anomic.de // Frankfurt, Germany, 2004 // // $LastChangedDate$ // $LastChangedRevision$ // $LastChangedBy$ // // You agree that the Author(s) is (are) not responsible for cost, // loss of data or any harm that may be caused by usage of this softare or // this documentation. The usage of this software is on your own risk. The // installation and usage (starting/running) of this software may allow other // people or application to access your computer and any attached devices and // is highly dependent on the configuration of the software which must be // done by the user of the software;the author(s) is (are) also // not responsible for proper configuration and usage of the software, even // if provoked by documentation provided together with the software. // // THE SOFTWARE THAT FOLLOWS AS ART OF PROGRAMMING BELOW THIS SECTION // IS PUBLISHED UNDER THE GPL AS DOCUMENTED IN THE FILE gpl.txt ASIDE THIS // FILE AND AS IN http://www.gnu.org/licenses/gpl.txt // ANY CHANGES TO THIS FILE ACCORDING TO THE GPL CAN BE DONE TO THE // LINES THAT FOLLOWS THIS COPYRIGHT NOTICE HERE, BUT CHANGES MUST NOT // BE DONE ABOVE OR INSIDE THE COPYRIGHT NOTICE. A RE-DISTRIBUTION // MUST CONTAIN THE INTACT AND UNCHANGED COPYRIGHT NOTICE. // CONTRIBUTIONS AND CHANGES TO THE PROGRAM CODE SHOULD BE MARKED AS SUCH. package net.yacy.document.parser.html; import java.util.Set; import net.yacy.kelondro.util.MemoryControl; public abstract class AbstractScraper implements Scraper { protected static final String EMPTY_STRING = new String(); public static final char sp = ' '; public static final char lb = '<'; public static final char rb = '>'; public static final char sl = '/'; private Set<String> tags0; private Set<String> tags1; /** * create a scraper. the tag sets must contain tags in lowercase! * @param tags0 * @param tags1 */ public AbstractScraper(final Set<String> tags0, final Set<String> tags1) { this.tags0 = tags0; this.tags1 = tags1; } @Override public boolean isTag0(final String tag) { return (this.tags0 != null) && (this.tags0.contains(tag.toLowerCase())); } @Override public boolean isTag1(final String tag) { return (this.tags1 != null) && (this.tags1.contains(tag.toLowerCase())); } //the 'missing' method that shall be implemented: @Override public abstract void scrapeText(char[] text, String insideTag); // the other methods must take into account to construct the return value correctly @Override public abstract void scrapeTag0(ContentScraper.Tag tag); @Override public abstract void scrapeTag1(ContentScraper.Tag tag); public static String stripAllTags(final char[] s) { if (s.length > 80 && !MemoryControl.request(s.length * 2, false)) return ""; final StringBuilder r = new StringBuilder(s.length); int bc = 0; for (final char c : s) { if (c == lb) { bc++; if (r.length() > 0 && r.charAt(r.length() - 1) != sp) r.append(sp); } else if (c == rb) { bc--; } else if (bc <= 0) { r.append(c); } } return r.toString().trim(); } protected final static String cleanLine(final String s) { if (!MemoryControl.request(s.length() * 2, false)) return EMPTY_STRING; final StringBuilder sb = new StringBuilder(s.length()); char l = ' '; char c; for (int i = 0; i < s.length(); i++) { c = s.charAt(i); if (c < ' ') c = ' '; if (c == ' ') { if (l != ' ') sb.append(c); } else { sb.append(c); } l = c; } // return result return sb.toString().trim(); } @Override public void close() { // free resources this.tags0 = null; this.tags1 = null; } public static void main(String[] args) { String t = "<script src=\"navigation.js\" type=\"text/javascript\"></script>\\n <script src=\"../js/prototype.js\" type=\"text/javascript\"></script>"; System.out.println("'" + stripAllTags(t.toCharArray()) + "'"); } }