/***************************************************************************
* The Lecture2Go software is based on the liferay portal 6.1.1
* <http://www.liferay.com>
*
* Lecture2Go <http://lecture2go.uni-hamburg.de> is an open source
* platform for media management and distribution. Our goal is to
* support the free access to knowledge because this is a component
* of each democratic society. The open source software is aimed at
* academic institutions and has to strengthen the blended learning.
*
* All Lecture2Go plugins are continuously being developed and improved.
* For more details please visit <http://lecture2go-open-source.rrz.uni-hamburg.de>
*
* @Author Lecture2Go Team
* @Version 1.0
* @Contact lecture2go-open-source@uni-hamburg.de
*
* Copyright (c) 2013 University of Hamburg / Computer and Data Center (RRZ)
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program. If not, see <http://www.gnu.org/licenses/>.
***************************************************************************/
package com.josephoconnell.html;
/**
*
* HTML filtering utility for protecting against XSS (Cross Site Scripting).
*
* This code is licensed under a Creative Commons Attribution-ShareAlike 2.5 License
* http://creativecommons.org/licenses/by-sa/2.5/
*
* This code is a Java port of the original work in PHP by Cal Henderson.
* http://code.iamcal.com/php/lib_filter/
*
* The trickiest part of the translation was handling the differences in regex handling
* between PHP and Java. These resources were helpful in the process:
*
* http://java.sun.com/j2se/1.4.2/docs/api/java/util/regex/Pattern.html
* http://us2.php.net/manual/en/reference.pcre.pattern.modifiers.php
* http://www.regular-expressions.info/modifiers.html
*
* A note on naming conventions: instance variables are prefixed with a "v"; global
* constants are in all caps.
*
* Sample use:
* String input = ...
* String clean = new HTMLInputFilter().filter( input );
*
* If you find bugs or have suggestions on improvement (especially regarding
* performance), please contact me at the email below. The latest version of this
* source can be found at
*
* http://josephoconnell.com/java/xss-html-filter/
*
* @author Joseph O'Connell <joe.oconnell at gmail dot com>
* @version 1.0
*/
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import junit.framework.Assert;
import junit.framework.TestCase;
/**
* The Class HTMLInputFilter.
*/
public class HTMLInputFilter
{
/** The Constant ALWAYS_MAKE_TAGS. */
protected static final boolean ALWAYS_MAKE_TAGS = true;
/** The Constant STRIP_COMMENTS. */
protected static final boolean STRIP_COMMENTS = true;
/** The Constant REGEX_FLAGS_SI. */
protected static final int REGEX_FLAGS_SI = Pattern.CASE_INSENSITIVE | Pattern.DOTALL;
/** The v allowed. */
protected Map<String,List<String>> vAllowed;
/** The v tag counts. */
protected Map<String,Integer> vTagCounts;
/** The v self closing tags. */
protected String[] vSelfClosingTags;
/** The v need closing tags. */
protected String[] vNeedClosingTags;
/** The v protocol atts. */
protected String[] vProtocolAtts;
/** The v allowed protocols. */
protected String[] vAllowedProtocols;
/** The v remove blanks. */
protected String[] vRemoveBlanks;
/** The v allowed entities. */
protected String[] vAllowedEntities;
/** The v debug. */
protected boolean vDebug;
/**
* Instantiates a new hTML input filter.
*/
public HTMLInputFilter()
{
this(false);
}
/**
* Instantiates a new hTML input filter.
*
* @param debug the debug
*/
public HTMLInputFilter( boolean debug )
{
vDebug = debug;
vAllowed = new HashMap<String,List<String>>();
vTagCounts = new HashMap<String,Integer>();
ArrayList<String> a_atts = new ArrayList<String>();
a_atts.add( "href" );
a_atts.add( "target" );
vAllowed.put( "a", a_atts );
ArrayList<String> img_atts = new ArrayList<String>();
img_atts.add( "src" );
img_atts.add( "width" );
img_atts.add( "height" );
img_atts.add( "alt" );
vAllowed.put( "img", img_atts );
ArrayList<String> no_atts = new ArrayList<String>();
vAllowed.put( "b", no_atts );
vAllowed.put( "strong", no_atts );
vAllowed.put( "i", no_atts );
vAllowed.put( "em", no_atts );
vSelfClosingTags = new String[] { "img" };
vNeedClosingTags = new String[] { "a", "b", "strong", "i", "em" };
vAllowedProtocols = new String[] { "http", "mailto" }; // no ftp.
vProtocolAtts = new String[] { "src", "href" };
vRemoveBlanks = new String[] { "a", "b", "strong", "i", "em" };
vAllowedEntities = new String[] { "amp", "gt", "lt", "quot" };
}
/**
* Reset.
*/
protected void reset()
{
vTagCounts = new HashMap<String,Integer>();
}
/**
* Debug.
*
* @param msg the msg
*/
protected void debug( String msg )
{
if (vDebug)
System.out.println( msg );
}
//---------------------------------------------------------------
// my versions of some PHP library functions
/**
* Chr.
*
* @param decimal the decimal
* @return the string
*/
public static String chr( int decimal )
{
return String.valueOf( (char) decimal );
}
/**
* Html special chars.
*
* @param s the s
* @return the string
*/
public static String htmlSpecialChars( String s )
{
s = s.replaceAll( "&", "&" );
s = s.replaceAll( "\"", """ );
s = s.replaceAll( "<", "<" );
s = s.replaceAll( ">", ">" );
return s;
}
//---------------------------------------------------------------
/**
* Filter.
*
* @param input the input
* @return the string
*/
public synchronized String filter( String input )
{
reset();
String s = input;
debug( "************************************************" );
debug( " INPUT: " + input );
s = escapeComments(s);
debug( " escapeComments: " + s );
s = balanceHTML(s);
debug( " balanceHTML: " + s );
s = checkTags(s);
debug( " checkTags: " + s );
s = processRemoveBlanks(s);
debug( "processRemoveBlanks: " + s );
s = validateEntities(s);
debug( " validateEntites: " + s );
debug( "************************************************\n\n" );
return s;
}
/**
* Escape comments.
*
* @param s the s
* @return the string
*/
protected String escapeComments( String s )
{
Pattern p = Pattern.compile( "<!--(.*?)-->", Pattern.DOTALL );
Matcher m = p.matcher( s );
StringBuffer buf = new StringBuffer();
if (m.find()) {
String match = m.group( 1 ); //(.*?)
m.appendReplacement( buf, "<!--" + htmlSpecialChars( match ) + "-->" );
}
m.appendTail( buf );
return buf.toString();
}
/**
* Balance html.
*
* @param s the s
* @return the string
*/
protected String balanceHTML( String s )
{
if (ALWAYS_MAKE_TAGS)
{
//
// try and form html
//
s = regexReplace("^>", "", s);
s = regexReplace("<([^>]*?)(?=<|$)", "<$1>", s);
s = regexReplace("(^|>)([^<]*?)(?=>)", "$1<$2", s);
}
else
{
//
// escape stray brackets
//
s = regexReplace("<([^>]*?)(?=<|$)", "<$1", s);
s = regexReplace("(^|>)([^<]*?)(?=>)", "$1$2><", s);
//
// the last regexp causes '<>' entities to appear
// (we need to do a lookahead assertion so that the last bracket can
// be used in the next pass of the regexp)
//
s = s.replaceAll("<>", "");
}
return s;
}
/**
* Check tags.
*
* @param s the s
* @return the string
*/
protected String checkTags( String s )
{
Pattern p = Pattern.compile( "<(.*?)>", Pattern.DOTALL );
Matcher m = p.matcher( s );
StringBuffer buf = new StringBuffer();
while (m.find()) {
String replaceStr = m.group( 1 );
replaceStr = processTag( replaceStr );
m.appendReplacement(buf, replaceStr);
}
m.appendTail(buf);
s = buf.toString();
// these get tallied in processTag
// (remember to reset before subsequent calls to filter method)
for( String key : vTagCounts.keySet())
{
for(int ii=0; ii<vTagCounts.get(key); ii++) {
s += "</" + key + ">";
}
}
return s;
}
/**
* Process remove blanks.
*
* @param s the s
* @return the string
*/
protected String processRemoveBlanks( String s )
{
for( String tag : vRemoveBlanks )
{
s = regexReplace( "<" + tag + "(\\s[^>]*)?></" + tag + ">", "", s );
s = regexReplace( "<" + tag + "(\\s[^>]*)?/>", "", s );
}
return s;
}
/**
* Regex replace.
*
* @param regex_pattern the regex_pattern
* @param replacement the replacement
* @param s the s
* @return the string
*/
protected String regexReplace( String regex_pattern, String replacement, String s )
{
Pattern p = Pattern.compile( regex_pattern );
Matcher m = p.matcher( s );
return m.replaceAll( replacement );
}
/**
* Process tag.
*
* @param s the s
* @return the string
*/
protected String processTag( String s )
{
// ending tags
Pattern p = Pattern.compile( "^/([a-z0-9]+)", REGEX_FLAGS_SI );
Matcher m = p.matcher( s );
if (m.find()) {
String name = m.group(1).toLowerCase();
if (vAllowed.containsKey( name )) {
if (!inArray(name, vSelfClosingTags)) {
if (vTagCounts.containsKey( name )) {
vTagCounts.put(name, vTagCounts.get(name)-1);
return "</" + name + ">";
}
}
}
}
// starting tags
p = Pattern.compile("^([a-z0-9]+)(.*?)(/?)$", REGEX_FLAGS_SI);
m = p.matcher( s );
if (m.find()) {
String name = m.group(1).toLowerCase();
String body = m.group(2);
String ending = m.group(3);
//debug( "in a starting tag, name='" + name + "'; body='" + body + "'; ending='" + ending + "'" );
if (vAllowed.containsKey( name )) {
String params = "";
Pattern p2 = Pattern.compile("([a-z0-9]+)=([\"'])(.*?)\\2", REGEX_FLAGS_SI);
Pattern p3 = Pattern.compile("([a-z0-9]+)(=)([^\"\\s']+)", REGEX_FLAGS_SI);
Matcher m2 = p2.matcher( body );
Matcher m3 = p3.matcher( body );
List<String> paramNames = new ArrayList<String>();
List<String> paramValues = new ArrayList<String>();
while (m2.find()) {
paramNames.add(m2.group(1)); //([a-z0-9]+)
paramValues.add(m2.group(3)); //(.*?)
}
while (m3.find()) {
paramNames.add(m3.group(1)); //([a-z0-9]+)
paramValues.add(m3.group(3)); //([^\"\\s']+)
}
String paramName, paramValue;
for( int ii=0; ii<paramNames.size(); ii++ ) {
paramName = paramNames.get(ii).toLowerCase();
paramValue = paramValues.get(ii);
//debug( "paramName='" + paramName + "'" );
//debug( "paramValue='" + paramValue + "'" );
//debug( "allowed? " + vAllowed.get( name ).contains( paramName ) );
if (vAllowed.get( name ).contains( paramName )) {
if (inArray( paramName, vProtocolAtts )) {
paramValue = processParamProtocol( paramValue );
}
params += " " + paramName + "=\"" + paramValue + "\"";
}
}
if (inArray( name, vSelfClosingTags )) {
ending = " /";
}
if (inArray( name, vNeedClosingTags )) {
ending = "";
}
if (ending == null || ending.length() < 1) {
if (vTagCounts.containsKey( name )) {
vTagCounts.put( name, vTagCounts.get(name)+1 );
} else {
vTagCounts.put( name, 1 );
}
} else {
ending = " /";
}
return "<" + name + params + ending + ">";
} else {
return "";
}
}
// comments
p = Pattern.compile( "^!--(.*)--$", REGEX_FLAGS_SI );
m = p.matcher( s );
if (m.find()) {
String comment = m.group();
if (STRIP_COMMENTS) {
return "";
} else {
return "<" + comment + ">";
}
}
return "";
}
/**
* Process param protocol.
*
* @param s the s
* @return the string
*/
protected String processParamProtocol( String s )
{
s = decodeEntities( s );
Pattern p = Pattern.compile( "^([^:]+):", REGEX_FLAGS_SI );
Matcher m = p.matcher( s );
if (m.find()) {
String protocol = m.group(1);
if (!inArray( protocol, vAllowedProtocols )) {
// bad protocol, turn into local anchor link instead
s = "#" + s.substring( protocol.length()+1, s.length() );
if (s.startsWith("#//")) s = "#" + s.substring( 3, s.length() );
}
}
return s;
}
/**
* Decode entities.
*
* @param s the s
* @return the string
*/
protected String decodeEntities( String s )
{
StringBuffer buf = new StringBuffer();
Pattern p = Pattern.compile( "(\\d+);?" );
Matcher m = p.matcher( s );
while (m.find()) {
String match = m.group( 1 );
int decimal = Integer.decode( match ).intValue();
m.appendReplacement( buf, chr( decimal ) );
}
m.appendTail( buf );
s = buf.toString();
buf = new StringBuffer();
p = Pattern.compile( "([0-9a-f]+);?");
m = p.matcher( s );
while (m.find()) {
String match = m.group( 1 );
int decimal = Integer.decode( match ).intValue();
m.appendReplacement( buf, chr( decimal ) );
}
m.appendTail( buf );
s = buf.toString();
buf = new StringBuffer();
p = Pattern.compile( "%([0-9a-f]{2});?");
m = p.matcher( s );
while (m.find()) {
String match = m.group( 1 );
int decimal = Integer.decode( match ).intValue();
m.appendReplacement( buf, chr( decimal ) );
}
m.appendTail( buf );
s = buf.toString();
s = validateEntities( s );
return s;
}
/**
* Validate entities.
*
* @param s the s
* @return the string
*/
protected String validateEntities( String s )
{
// validate entities throughout the string
Pattern p = Pattern.compile( "&([^&;]*)(?=(;|&|$))" );
Matcher m = p.matcher( s );
if (m.find()) {
String one = m.group( 1 ); //([^&;]*)
String two = m.group( 2 ); //(?=(;|&|$))
s = checkEntity( one, two );
}
// validate quotes outside of tags
p = Pattern.compile( "(>|^)([^<]+?)(<|$)", Pattern.DOTALL );
m = p.matcher( s );
StringBuffer buf = new StringBuffer();
if (m.find()) {
String one = m.group( 1 ); //(>|^)
String two = m.group( 2 ); //([^<]+?)
String three = m.group( 3 ); //(<|$)
m.appendReplacement( buf, one + two.replaceAll( "\"", """ ) + three);
}
m.appendTail( buf );
return s;
}
/**
* Check entity.
*
* @param preamble the preamble
* @param term the term
* @return the string
*/
protected String checkEntity( String preamble, String term )
{
if (!term.equals(";")) {
return "&" + preamble;
}
if ( isValidEntity( preamble ) ) {
return "&" + preamble;
}
return "&" + preamble;
}
/**
* Checks if is valid entity.
*
* @param entity the entity
* @return true, if is valid entity
*/
protected boolean isValidEntity( String entity )
{
return inArray( entity, vAllowedEntities );
}
/**
* In array.
*
* @param s the s
* @param array the array
* @return true, if successful
*/
private boolean inArray( String s, String[] array )
{
for (String item : array)
if (item != null && item.equals(s))
return true;
return false;
}
// ============================================ START-UNIT-TEST =========================================
/**
* The Class Test.
*/
public static class Test extends TestCase
{
/** The v filter. */
protected HTMLInputFilter vFilter;
/* (non-Javadoc)
* @see junit.framework.TestCase#setUp()
*/
/**
* Sets the up.
*/
protected void setUp()
{
vFilter = new HTMLInputFilter( true );
}
/* (non-Javadoc)
* @see junit.framework.TestCase#tearDown()
*/
/**
* Tear down.
*/
protected void tearDown()
{
vFilter = null;
}
/**
* T.
*
* @param input the input
* @param result the result
*/
private void t( String input, String result )
{
Assert.assertEquals( result, vFilter.filter(input) );
}
/**
* Test_basics.
*/
public void test_basics()
{
t( "", "" );
t( "hello", "hello" );
}
/**
* Test_balancing_tags.
*/
public void test_balancing_tags()
{
t( "<b>hello", "<b>hello</b>" );
t( "<b>hello", "<b>hello</b>" );
t( "hello<b>", "hello" );
t( "hello</b>", "hello" );
t( "hello<b/>", "hello" );
t( "<b><b><b>hello", "<b><b><b>hello</b></b></b>" );
t( "</b><b>", "" );
}
/**
* Test_end_slashes.
*/
public void test_end_slashes()
{
t("<img>","<img />");
t("<img/>","<img />");
t("<b/></b>","");
}
/**
* Test_balancing_angle_brackets.
*/
public void test_balancing_angle_brackets()
{
if (ALWAYS_MAKE_TAGS) {
t("<img src=\"foo\"","<img src=\"foo\" />");
t("i>","");
t("<img src=\"foo\"/","<img src=\"foo\" />");
t(">","");
t("foo<b","foo");
t("b>foo","<b>foo</b>");
t("><b","");
t("b><","");
t("><b>","");
} else {
t("<img src=\"foo\"","<img src=\"foo\"");
t("b>","b>");
t("<img src=\"foo\"/","<img src=\"foo\"/");
t(">",">");
t("foo<b","foo<b");
t("b>foo","b>foo");
t("><b","><b");
t("b><","b><");
t("><b>",">");
}
}
/**
* Test_attributes.
*/
public void test_attributes()
{
t("<img src=foo>","<img src=\"foo\" />");
t("<img asrc=foo>","<img />");
t("<img src=test test>","<img src=\"test\" />");
}
/**
* Test_disallow_script_tags.
*/
public void test_disallow_script_tags()
{
t("<script>","");
if (ALWAYS_MAKE_TAGS) { t("<script",""); } else { t("<script","<script"); }
t("<script/>","");
t("</script>","");
t("<script woo=yay>","");
t("<script woo=\"yay\">","");
t("<script woo=\"yay>","");
t("<script woo=\"yay<b>","");
t("<script<script>>","");
t("<<script>script<script>>","script");
t("<<script><script>>","");
t("<<script>script>>","");
t("<<script<script>>","");
}
/**
* Test_protocols.
*/
public void test_protocols()
{
t("<a href=\"http://foo\">bar</a>", "<a href=\"http://foo\">bar</a>");
// we don't allow ftp. t("<a href=\"ftp://foo\">bar</a>", "<a href=\"ftp://foo\">bar</a>");
t("<a href=\"mailto:foo\">bar</a>", "<a href=\"mailto:foo\">bar</a>");
t("<a href=\"javascript:foo\">bar</a>", "<a href=\"#foo\">bar</a>");
t("<a href=\"java script:foo\">bar</a>", "<a href=\"#foo\">bar</a>");
t("<a href=\"java\tscript:foo\">bar</a>", "<a href=\"#foo\">bar</a>");
t("<a href=\"java\nscript:foo\">bar</a>", "<a href=\"#foo\">bar</a>");
t("<a href=\"java" + chr(1) + "script:foo\">bar</a>", "<a href=\"#foo\">bar</a>");
t("<a href=\"jscript:foo\">bar</a>", "<a href=\"#foo\">bar</a>");
t("<a href=\"vbscript:foo\">bar</a>", "<a href=\"#foo\">bar</a>");
t("<a href=\"view-source:foo\">bar</a>", "<a href=\"#foo\">bar</a>");
}
/**
* Test_self_closing_tags.
*/
public void test_self_closing_tags()
{
t("<img src=\"a\">","<img src=\"a\" />");
t("<img src=\"a\">foo</img>", "<img src=\"a\" />foo");
t("</img>", "");
}
/**
* Test_comments.
*/
public void test_comments()
{
if (STRIP_COMMENTS) {
t("<!-- a<b --->", "");
} else {
t("<!-- a<b --->", "<!-- a<b --->");
}
}
}
// ============================================ END-UNIT-TEST ===========================================
}