/*
* File : $Source: /alkacon/cvs/alkacon/com.alkacon.opencms.htmlcleaner/src/com/alkacon/opencms/htmlcleaner/CmsHtmlCleanerConfiguration.java,v $
* Date : $Date: 2011/04/01 10:08:02 $
* Version: $Revision: 1.1 $
*
* This file is part of the Alkacon OpenCms Add-On Module Package
*
* Copyright (c) 2011 Alkacon Software GmbH (http://www.alkacon.com)
*
* The Alkacon OpenCms Add-On Module Package is free software:
* you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* The Alkacon OpenCms Add-On Module Package is distributed
* in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with the Alkacon OpenCms Add-On Module Package.
* If not, see http://www.gnu.org/licenses/.
*
* For further information about Alkacon Software GmbH, please see the
* company website: http://www.alkacon.com.
*
* For further information about OpenCms, please see the
* project website: http://www.opencms.org.
*/
package com.alkacon.opencms.htmlcleaner;
import org.opencms.file.CmsObject;
import org.opencms.main.CmsLog;
import org.opencms.util.CmsStringUtil;
import org.opencms.xml.content.CmsXmlContent;
import org.opencms.xml.types.I_CmsXmlContentValue;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import org.apache.commons.logging.Log;
/**
* The configuration for the HTML cleaner.<p>
*
* @author Andreas Zahner
*/
public class CmsHtmlCleanerConfiguration {
/** Value for tag name indicating that all tags should be investigated. */
protected static final String TAGS_ALL = "*";
/** The log object for this class. */
private static final Log LOG = CmsLog.getLog(CmsHtmlCleanerConfiguration.class);
/** The node name of the "AttributeName" node. */
private static final String N_ATTRIBUTENAME = "AttributeName";
/** The node name of the "AttributeValue" node. */
private static final String N_ATTRIBUTEVALUE = "AttributeValue";
/** The node name of the "ExcludeTag" node. */
private static final String N_EXCLUDETAG = "ExcludeTag";
/** The node name of the "IgnoreTag" node. */
private static final String N_IGNORETAG = "IgnoreTag";
/** The node name of the "KeepTag" node. */
private static final String N_KEEPTAG = "KeepTag";
/** The node name of the "KeepTags" node. */
private static final String N_KEEPTAGS = "KeepTags";
/** The node name of the "RemoveAttribute" node. */
private static final String N_REMOVEATTRIBUTE = "RemoveAttribute";
/** The node name of the "RemoveAttributes" node. */
private static final String N_REMOVEATTRIBUTES = "RemoveAttributes";
/** The node name of the "ReplaceTag" node. */
private static final String N_REPLACETAG = "ReplaceTag";
/** The node name of the "ReplaceTags" node. */
private static final String N_REPLACETAGS = "ReplaceTags";
/** The node name of the "TagName" node. */
private static final String N_TAGNAME = "TagName";
/** The node name of the "TagReplace" node. */
private static final String N_TAGREPLACE = "TagReplace";
/** The map of attributes with exclusion definitions as values to remove from the tags. */
private Map<String, Map<String, Pattern>> m_invalidAttributes;
/** The list of elements on which to keep the invalid attributes. */
private List<String> m_keepAttributeElements;
/** The map of elements to replace. */
private Map<String, String> m_replaceElements;
/** The list of element names to keep in the result. */
private List<String> m_validElementNames;
/**
* Empty constructor, the {@link #init(CmsObject, CmsXmlContent)} method has to be triggered manually after generating an instance.<p>
*/
public CmsHtmlCleanerConfiguration() {
// nothing to do here
}
/**
* Constructor, with parameters, that initializes the cleaner configuration.<p>
*
* @param cms the current users context
* @param content the configuration as XML content
*/
public CmsHtmlCleanerConfiguration(CmsObject cms, CmsXmlContent content) {
init(cms, content);
}
/**
* Returns the map of attributes to remove from the tags.<p>
*
* The keys are the names of the attributes,
* values are maps of eventual exclusion definitions or <code>null</code> if there are no exclusions defined.<p>
*
* @return the list of attributes to remove from the tags
*/
public Map<String, Map<String, Pattern>> getInvalidAttributes() {
return m_invalidAttributes;
}
/**
* Returns the list of elements on which to keep the invalid attributes.<p>
*
* @return the list of elements on which to keep the invalid attributes
*/
public List<String> getKeepAttributeElements() {
return m_keepAttributeElements;
}
/**
* Returns the map of elements to replace.<p>
*
* @return the map of elements to replace
*/
public Map<String, String> getReplaceElements() {
return m_replaceElements;
}
/**
* Returns the list of element names to keep in the result.<p>
*
* @return the list of element names to keep in the result
*/
public List<String> getValidElementNames() {
return m_validElementNames;
}
/**
* Initializes the cleaner configuration using the given XML content.<p>
*
* @param cms the current users context
* @param content the configuration as XML content
*/
public void init(CmsObject cms, CmsXmlContent content) {
// initialize members
m_validElementNames = new ArrayList<String>();
m_replaceElements = new HashMap<String, String>();
m_invalidAttributes = new HashMap<String, Map<String, Pattern>>();
m_keepAttributeElements = new ArrayList<String>();
// use first locale found in content for configuration
Locale locale = content.getLocales().get(0);
// get the tags to keep
List<I_CmsXmlContentValue> keepTags = content.getValues(N_KEEPTAGS + "[1]/" + N_KEEPTAG, locale);
for (Iterator<I_CmsXmlContentValue> i = keepTags.iterator(); i.hasNext();) {
I_CmsXmlContentValue value = i.next();
String tagName = value.getStringValue(cms);
if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagName)) {
m_validElementNames.add(tagName.toLowerCase());
}
}
// get the tags to replace
List<I_CmsXmlContentValue> replaceTags = content.getValues(N_REPLACETAGS + "[1]/" + N_REPLACETAG, locale);
for (Iterator<I_CmsXmlContentValue> i = replaceTags.iterator(); i.hasNext();) {
I_CmsXmlContentValue value = i.next();
String path = value.getPath() + "/";
String tagName = content.getStringValue(cms, path + N_TAGNAME, locale);
String tagReplace = content.getStringValue(cms, path + N_TAGREPLACE, locale);
if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagName)
&& CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagReplace)) {
m_replaceElements.put(tagName.toLowerCase(), tagReplace.toLowerCase());
}
}
// get the attributes to remove
List<I_CmsXmlContentValue> removeAttributes = content.getValues(
N_REMOVEATTRIBUTES + "[1]/" + N_REMOVEATTRIBUTE,
locale);
for (Iterator<I_CmsXmlContentValue> i = removeAttributes.iterator(); i.hasNext();) {
I_CmsXmlContentValue value = i.next();
String path = value.getPath() + "/";
String attrName = content.getStringValue(cms, path + N_ATTRIBUTENAME, locale);
if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(attrName)) {
Map<String, Pattern> exclusions = null;
// check if there are individual exclusion sub elements
List<I_CmsXmlContentValue> excludeTags = content.getValues(path + N_EXCLUDETAG, locale);
if (excludeTags.size() > 0) {
// found at least one exclusion, create map
exclusions = new HashMap<String, Pattern>(excludeTags.size());
for (Iterator<I_CmsXmlContentValue> k = excludeTags.iterator(); k.hasNext();) {
// get exclusions for the attribute
I_CmsXmlContentValue excludeValue = k.next();
path = excludeValue.getPath() + "/";
String tagName = content.getStringValue(cms, path + N_TAGNAME, locale);
String attrValue = content.getStringValue(cms, path + N_ATTRIBUTEVALUE, locale);
if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(tagName)
&& CmsStringUtil.isNotEmptyOrWhitespaceOnly(attrValue)) {
try {
// build pattern from regular expression
Pattern regExPattern = Pattern.compile(attrValue);
exclusions.put(tagName.toLowerCase(), regExPattern);
} catch (PatternSyntaxException e) {
// invalid regular expression found
LOG.error(
"Invalid regular expression \"" + attrValue + "\" defined for HTML cleaner",
e);
}
}
}
}
m_invalidAttributes.put(attrName.toLowerCase(), exclusions);
}
}
// get the tags where to keep the attributes
List<I_CmsXmlContentValue> ignoreTags = content.getValues(N_REMOVEATTRIBUTES + "[1]/" + N_IGNORETAG, locale);
for (Iterator<I_CmsXmlContentValue> i = ignoreTags.iterator(); i.hasNext();) {
I_CmsXmlContentValue value = i.next();
String strValue = value.getStringValue(cms);
if (CmsStringUtil.isNotEmptyOrWhitespaceOnly(strValue)) {
m_keepAttributeElements.add(strValue.toLowerCase());
}
}
}
/**
* Determines if the given attribute should be kept or not on the given tag.<p>
*
* @param tagName the name of the tag containing the attribute
* @param attrName the attribute name
* @param attrValue the attribute value
*
* @return <code>true</code> if the attribute should be kept according to the configuration, otherwise <code>false</code>
*/
public boolean isKeepAttributeOnTag(String tagName, String attrName, String attrValue) {
if (getKeepAttributeElements().contains(tagName)) {
// the attribute has to be kept because the tag should be ignored
return true;
}
if (getInvalidAttributes().containsKey(attrName)) {
// this is an invalid attribute, check exclusions
Map<String, Pattern> exclusions = getInvalidAttributes().get(attrName);
if ((exclusions != null) && CmsStringUtil.isNotEmptyOrWhitespaceOnly(attrValue)) {
// found exclusions, check if the attribute matches one
if (exclusions.containsKey(tagName)) {
Pattern regExPattern = exclusions.get(tagName);
if (regExPattern.matcher(attrValue).matches()) {
// attribute value matches regular expression, keep tag
return true;
}
}
// also check if an exclusion is defined for every tag
if (exclusions.containsKey(TAGS_ALL)) {
Pattern regExPattern = exclusions.get(TAGS_ALL);
if (regExPattern.matcher(attrValue).matches()) {
// attribute value matches regular expression, keep tag
return true;
}
}
}
// attribute found in invalid attribute definitions, no exclusions are matching, remove attribute
return false;
}
// attribute not found in invalid attribute definitions, keep it
return true;
}
}