/* * File : $Source: /alkacon/cvs/alkacon/com.alkacon.opencms.v8.formgenerator/src/com/alkacon/opencms/v8/formgenerator/CmsHtmlToTextConverter.java,v $ * Date : $Date: 2010/05/21 13:49:15 $ * Version: $Revision: 1.1 $ * * This file is part of the Alkacon OpenCms Add-On Module Package * * Copyright (c) 2010 Alkacon Software GmbH (http://www.alkacon.com) * * The Alkacon OpenCms Add-On Module Package is free software: * you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation, either version 3 of the License, or * (at your option) any later version. * * The Alkacon OpenCms Add-On Module Package is distributed * in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with the Alkacon OpenCms Add-On Module Package. * If not, see http://www.gnu.org/licenses/. * * For further information about Alkacon Software GmbH, please see the * company website: http://www.alkacon.com. * * For further information about OpenCms, please see the * project website: http://www.opencms.org. */ package com.alkacon.opencms.v8.formgenerator; import org.opencms.util.CmsHtmlParser; import org.opencms.util.CmsStringUtil; import java.util.Iterator; import java.util.List; import org.htmlparser.Tag; import org.htmlparser.Text; import org.htmlparser.util.Translate; /** * Removes HTML tags and replaces them by line breaks or blanks.<p> * * @author Andreas Zahner */ public class CmsHtmlToTextConverter extends CmsHtmlParser { /** Indicated to append or store the next line breaks. */ private boolean m_appendBr; /** The last appended line break count. */ private int m_brCount; /** The current indentation. */ private int m_indent; /** The current line length. */ private int m_lineLength; /** The marker String (for headlines, bullets etc.). */ private String m_marker; /** The maximum line length. */ private int m_maxLineLength; /** The last stored, but not appended line break count. */ private int m_storedBrCount; /** Indicates if blanks should be added instead of line breaks. */ private boolean m_useBlankForLinebreak; /** * Creates a new instance of the html converter.<p> */ public CmsHtmlToTextConverter() { m_result = new StringBuffer(512); m_maxLineLength = 100; } /** * Extracts the text from the given html content, assuming the given html encoding.<p> * * @param html the content to extract the plain text from * @param encoding the encoding to use * * @return the text extracted from the given html content * * @throws Exception if something goes wrong */ public static String htmlToText(String html, String encoding) throws Exception { // create the converter instance CmsHtmlToTextConverter visitor = new CmsHtmlToTextConverter(); return visitor.process(html, encoding); } /** * Extracts the text from the given html content, assuming the given html encoding.<p> * * @param html the content to extract the plain text from * @param encoding the encoding to use * @param useBlankForLinebreak indicates if blanks should be added instead of line breaks * * @return the text extracted from the given html content * * @throws Exception if something goes wrong */ public static String htmlToText(String html, String encoding, boolean useBlankForLinebreak) throws Exception { // create the converter instance CmsHtmlToTextConverter visitor = new CmsHtmlToTextConverter(); visitor.setUseBlankForLinebreak(useBlankForLinebreak); return visitor.process(html, encoding); } /** * @see org.htmlparser.visitors.NodeVisitor#visitEndTag(org.htmlparser.Tag) */ @Override public void visitEndTag(Tag tag) { m_appendBr = false; appendLinebreaks(tag, false); } /** * @see org.htmlparser.visitors.NodeVisitor#visitStringNode(org.htmlparser.Text) */ @Override public void visitStringNode(Text text) { appendText(text.toPlainTextString()); } /** * @see org.htmlparser.visitors.NodeVisitor#visitTag(org.htmlparser.Tag) */ @Override public void visitTag(Tag tag) { m_appendBr = true; appendLinebreaks(tag, true); } /** * Sets if blanks should be added instead of line breaks.<p> * * @param useBlankForLinebreak <code>true</code> if blanks should be added instead of line breaks */ protected void setUseBlankForLinebreak(boolean useBlankForLinebreak) { m_useBlankForLinebreak = useBlankForLinebreak; } /** * Appends an indentation to the result.<p> */ private void appendIndentation() { if (m_lineLength <= m_indent) { int len = (m_marker != null) ? m_indent - (m_marker.length() + 1) : m_indent; for (int i = 0; i < len; i++) { m_result.append(' '); } if (m_marker != null) { m_result.append(m_marker); m_result.append(' '); m_marker = null; } } } /** * Appends line breaks to the result.<p> * * @param count the number of line breaks to append */ private void appendLinebreak(int count) { appendLinebreak(count, false); } /** * Appends line breaks to the result.<p> * * @param count the number of line breaks to append * @param force if the number of line breaks should be forced */ private void appendLinebreak(int count, boolean force) { if (m_appendBr) { if (m_storedBrCount > count) { count = m_storedBrCount; } m_storedBrCount = 0; if (force) { m_brCount = 0; } String brStr = "\r\n"; if (m_useBlankForLinebreak) { brStr = " "; } while (m_brCount < count) { m_result.append(brStr); m_brCount++; } m_lineLength = m_indent; } else { while (m_storedBrCount < count) { m_storedBrCount++; } } } /** * Appends line breaks using the specified tag.<p> * * @param tag the tag name * @param open the open flag */ private void appendLinebreaks(Tag tag, boolean open) { String name = tag.getTagName(); int pos = TAG_LIST.indexOf(name); switch (pos) { case 0: // H1 case 1: // H2 case 2: // H3 case 3: // H4 case 4: // H5 case 5: // H6 appendLinebreak(2); break; case 6: // P case 7: // DIV appendLinebreak(2); break; case 8: // SPAN break; case 9: // BR appendLinebreak(1, true); break; case 10: // OL case 11: // UL appendLinebreak(2); break; case 12: // LI setMarker("*", open); setIndentation(5, open); appendLinebreak(1); break; case 13: // TABLE setIndentation(5, open); appendLinebreak(2); if (open) { appendLinebreak(1); appendText("-----"); appendLinebreak(1); } break; case 14: // TD setMarker("--", open); appendLinebreak(2); break; case 15: // TR if (!open) { appendLinebreak(1); appendText("-----"); appendLinebreak(1); } break; case 16: // TH case 17: // THEAD case 18: // TBODY case 19: // TFOOT appendLinebreak(1); break; default: // unknown tag (ignore) } } /** * Appends text.<p> * * @param text the text */ private void appendText(String text) { if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) { text = Translate.decode(text); text = collapse(text); } if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(text)) { if (m_storedBrCount > 0) { m_appendBr = true; appendLinebreak(m_storedBrCount); } appendIndentation(); m_brCount = 0; List<String> wordList = CmsStringUtil.splitAsList(text, ' '); Iterator<String> i = wordList.iterator(); while (i.hasNext()) { String word = i.next(); boolean hasNbsp = ((word.charAt(0) == 160) || (word.charAt(word.length() - 1) == 160)); if ((word.length() + 1 + m_lineLength) > m_maxLineLength) { m_appendBr = true; appendLinebreak(1); appendIndentation(); m_brCount = 0; } else { if (!hasNbsp && (m_lineLength > m_indent) && (m_result.charAt(m_result.length() - 1) != 160) && (m_result.charAt(m_result.length() - 1) != 32)) { m_result.append(' '); m_lineLength++; } } m_result.append(word); m_lineLength += word.length(); } } } /** * Sets the indentation.<p> * * @param length the indentation length * @param open if the indentation should be increased or reduced */ private void setIndentation(int length, boolean open) { if (open) { m_indent += length; } else { m_indent -= length; if (m_indent < 0) { m_indent = 0; } } } /** * Sets a marker.<p> * * @param marker the marker * @param open if the marker should be set */ private void setMarker(String marker, boolean open) { if (open) { m_marker = marker; } } }