/*
* Copyright 2010 Alibaba Group Holding Limited.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* @(#)Clean.java 1.11 2000/08/16
*
*/
package org.w3c.tidy;
/**
* Clean up misuse of presentation markup (c) 1998-2000 (W3C) MIT, INRIA, Keio
* University See Tidy.java for the copyright notice. Derived from <a
* href="http://www.w3.org/People/Raggett/tidy"> HTML Tidy Release 4 Aug
* 2000</a>
*
* @author Dave Raggett <dsr@w3.org>
* @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
* @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
*/
/*
* Filters from other formats such as Microsoft Word often make excessive use of
* presentation markup such as font tags, B, I, and the align attribute. By
* applying a set of production rules, it is straightforward to transform this
* to use CSS. Some rules replace some of the children of an element by style
* properties on the element, e.g. <p><b>...</b></p> -> <p
* style="font-weight: bold">...</p>. Such rules are applied to the element's
* content and then to the element itself until no more rules apply.
* Having applied all the rules to an element, it will have a style attribute
* with one or more properties. Other rules strip the element they apply to,
* replacing it by style properties on the contents, e.g.
* <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">... These rules are
* applied to an element before processing its content and replace the current
* element by the first element in the exposed content. After applying both sets
* of rules, you can replace the style attribute by a class value and style rule
* in the document head. To support this, an association of styles and class
* names is built. A naive approach is to rely on string matching to test when
* two property lists are the same. A better approach would be to first sort the
* properties before matching.
*/
public class Clean {
/* Serial number used by gensymClass to build the next generated class name. */
private int classNum = 1;
/* Tag dictionary the rules compare node tags against; never reassigned. */
private final TagTable tt;
/*
 * Creates a cleaner that recognises elements through the given tag table.
 */
public Clean(TagTable tt) {
    this.tt = tt;
}
/*
 * Inserts a name/value pair into a property list that is kept ordered by
 * name (String.compareTo order). If a property with the same name is
 * already present the list is left untouched, i.e. the first value wins.
 * Returns the (possibly new) head of the list.
 */
private StyleProp insertProperty(StyleProp props, String name, String value) {
    StyleProp head = props;
    StyleProp before = null;
    for (StyleProp cur = props; cur != null; before = cur, cur = cur.next) {
        int order = cur.name.compareTo(name);
        if (order == 0) {
            /* already defined: keep the existing value */
            return head;
        }
        if (order > 0) {
            /* cur sorts after the new name, so the new entry goes in front of it */
            StyleProp inserted = new StyleProp(name, value, cur);
            if (before == null) {
                return inserted;
            }
            before.next = inserted;
            return head;
        }
    }
    /* ran off the end: append (or start a new list) */
    StyleProp tail = new StyleProp(name, value);
    if (before == null) {
        return tail;
    }
    before.next = tail;
    return head;
}
/*
 * Parses a style string of the form "name: value; name: value" and inserts
 * each declaration into the sorted property list "prop" via insertProperty.
 * Spaces before a name and before a value are skipped; the name ends at the
 * first ':' and the value ends at the next ';' (or at the end of the
 * string). Parsing stops at the first declaration that has no ':'.
 *
 * A null style (for example a style attribute that carries no value) adds
 * nothing and returns "prop" unchanged.
 *
 * Returns the head of the resulting list.
 */
private StyleProp createProps(StyleProp prop, String style) {
    if (style == null) {
        return prop;
    }
    final int length = style.length();
    int nameStart = 0;
    while (nameStart < length) {
        while (nameStart < length && style.charAt(nameStart) == ' ') {
            ++nameStart;
        }
        int nameEnd = style.indexOf(':', nameStart);
        if (nameEnd < 0) {
            /* no further "name:" part, nothing more to parse */
            break;
        }
        int valueStart = nameEnd + 1;
        while (valueStart < length && style.charAt(valueStart) == ' ') {
            ++valueStart;
        }
        int valueEnd = style.indexOf(';', valueStart);
        boolean more = valueEnd >= 0;
        if (!more) {
            /* last declaration: the value runs to the end of the string */
            valueEnd = length;
        }
        prop = insertProperty(prop, style.substring(nameStart, nameEnd), style.substring(valueStart, valueEnd));
        if (!more) {
            break;
        }
        nameStart = valueEnd + 1;
    }
    return prop;
}
/*
 * Serialises a property list as "name: value; name: value" (no trailing
 * separator). An empty list yields the empty string.
 */
private String createPropString(StyleProp props) {
    StringBuilder style = new StringBuilder();
    for (StyleProp prop = props; prop != null; prop = prop.next) {
        if (prop != props) {
            /* separator goes between declarations only */
            style.append("; ");
        }
        style.append(prop.name).append(": ").append(prop.value);
    }
    return style.toString();
}
/*
 * Returns a style string holding the declarations of "style" plus those of
 * "property". Declarations are parsed into one sorted list (style first,
 * so its values take precedence on a name clash) and then serialised.
 */
private String addProperty(String style, String property) {
    StyleProp fromStyle = createProps(null, style);
    StyleProp combined = createProps(fromStyle, property);
    return createPropString(combined);
}
/*
 * Generates a fresh class name of the form "c<N>" and advances the
 * counter. The tag argument is currently not used in the name.
 */
private String gensymClass(String tag) {
    final int serial = classNum++;
    return "c" + serial;
}
/*
 * Looks up the class name registered in lexer.styles for the given tag and
 * property string. When no entry matches exactly, a new one with a
 * generated class name is pushed on the front of lexer.styles.
 * Returns the class name of the matching or newly created entry.
 */
private String findStyle(Lexer lexer, String tag, String properties) {
    Style known = lexer.styles;
    while (known != null) {
        boolean sameTag = known.tag.equals(tag);
        if (sameTag && known.properties.equals(properties)) {
            return known.tagClass;
        }
        known = known.next;
    }
    Style added = new Style(tag, gensymClass(tag), properties, lexer.styles);
    lexer.styles = added;
    return added.tagClass;
}
/*
 * Replaces the node's style attribute by a class attribute. The class name
 * is looked up in (or added to) the style dictionary through findStyle.
 * If the node already has a class attribute, the name is appended to it
 * after a space and the style attribute is removed; otherwise the style
 * attribute itself is turned into the class attribute. Nodes without a
 * style attribute are left alone.
 */
private void style2Rule(Lexer lexer, Node node) {
    AttVal inlineStyle = node.getAttrByName("style");
    if (inlineStyle == null) {
        return;
    }
    String generated = findStyle(lexer, node.element, inlineStyle.value);
    AttVal existingClass = node.getAttrByName("class");
    if (existingClass == null) {
        /* recycle the style attribute object as the class attribute */
        inlineStyle.attribute = "class";
        inlineStyle.value = generated;
        return;
    }
    existingClass.value = existingClass.value + " " + generated;
    node.removeAttribute(inlineStyle);
}
/*
 * Appends the rule "<selector> { color: <color> }" followed by a newline to
 * the lexer buffer. Does nothing when color is null.
 */
private void addColorRule(Lexer lexer, String selector, String color) {
    if (color == null) {
        return;
    }
    lexer.addStringLiteral(selector);
    lexer.addStringLiteral(" { color: ");
    lexer.addStringLiteral(color);
    lexer.addStringLiteral(" }\n");
}
/*
 * Moves presentation attributes from the body element into style rules
 * written to the lexer buffer, removing each attribute from the body:
 *   background="foo" -> body { background-image: url(foo) }
 *   bgcolor="foo"    -> body { background-color: foo }
 *   text="foo"       -> body { color: foo }
 *   link="foo"       -> :link { color: foo }
 *   vlink="foo"      -> :visited { color: foo }
 *   alink="foo"      -> :active { color: foo }
 */
private void cleanBodyAttrs(Lexer lexer, Node body) {
    /* slots: 0 = background, 1 = bgcolor, 2 = text */
    final String[] bodyAttrs = { "background", "bgcolor", "text" };
    final String[] taken = new String[bodyAttrs.length];
    for (int i = 0; i < bodyAttrs.length; i++) {
        AttVal found = body.getAttrByName(bodyAttrs[i]);
        if (found != null) {
            taken[i] = found.value;
            found.value = null;
            body.removeAttribute(found);
        }
    }
    final String image = taken[0];
    final String fill = taken[1];
    final String ink = taken[2];
    if (!(image == null && fill == null && ink == null)) {
        lexer.addStringLiteral(" body {\n");
        if (image != null) {
            lexer.addStringLiteral(" background-image: url(");
            lexer.addStringLiteral(image);
            lexer.addStringLiteral(");\n");
        }
        if (fill != null) {
            lexer.addStringLiteral(" background-color: ");
            lexer.addStringLiteral(fill);
            lexer.addStringLiteral(";\n");
        }
        if (ink != null) {
            lexer.addStringLiteral(" color: ");
            lexer.addStringLiteral(ink);
            lexer.addStringLiteral(";\n");
        }
        lexer.addStringLiteral(" }\n");
    }
    /* link colours become separate pseudo-class rules, in this order */
    final String[][] linkRules = { { "link", " :link" }, { "vlink", " :visited" }, { "alink", " :active" } };
    for (int i = 0; i < linkRules.length; i++) {
        AttVal found = body.getAttrByName(linkRules[i][0]);
        if (found != null) {
            addColorRule(lexer, linkRules[i][1], found.value);
            body.removeAttribute(found);
        }
    }
}
/*
 * Returns true when the document has no body or the body carries none of
 * the presentation attributes background, bgcolor, text, link, vlink and
 * alink. Otherwise flags Report.USING_BODY in lexer.badLayout and returns
 * false.
 */
private boolean niceBody(Lexer lexer, Node doc) {
    Node body = doc.findBody(lexer.configuration.tt);
    if (body == null) {
        return true;
    }
    final String[] presentational = { "background", "bgcolor", "text", "link", "vlink", "alink" };
    for (int i = 0; i < presentational.length; i++) {
        if (body.getAttrByName(presentational[i]) != null) {
            lexer.badLayout |= Report.USING_BODY;
            return false;
        }
    }
    return true;
}
/*
 * Creates a <style type="text/css"> element holding the rules from the
 * style dictionary (lexer.styles) plus the rules derived from the body's
 * presentation attributes, and appends it to the document head.
 * Nothing is created when the dictionary is empty and niceBody() reports
 * no presentation attributes on the body.
 */
private void createStyleElement(Lexer lexer, Node doc) {
Node node, head, body;
Style style;
AttVal av;
if (lexer.styles == null && niceBody(lexer, doc)) {
return;
}
node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
node.implicit = true;
/* insert type attribute */
av = new AttVal(null, null, '"', "type", "text/css");
av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
node.attributes = av;
body = doc.findBody(lexer.configuration.tt);
/* everything appended to the lexer buffer from here on becomes the style text */
lexer.txtstart = lexer.lexsize;
if (body != null) {
cleanBodyAttrs(lexer, body);
}
/* one rule per dictionary entry, written as " tag.class {properties}" plus newline */
for (style = lexer.styles; style != null; style = style.next) {
lexer.addCharToLexer(' ');
lexer.addStringLiteral(style.tag);
lexer.addCharToLexer('.');
lexer.addStringLiteral(style.tagClass);
lexer.addCharToLexer(' ');
lexer.addCharToLexer('{');
lexer.addStringLiteral(style.properties);
lexer.addCharToLexer('}');
lexer.addCharToLexer('\n');
}
lexer.txtend = lexer.lexsize;
/* wrap the buffer range [txtstart, txtend) in a text node inside the style element */
Node.insertNodeAtEnd(node, lexer.newNode(Node.TextNode, lexer.lexbuf, lexer.txtstart, lexer.txtend));
/*
* now insert style element into document head doc is root node. search
* its children for html node the head node should be first child of
* html node
*/
head = doc.findHEAD(lexer.configuration.tt);
/* NOTE(review): when no head is found the style element is silently dropped */
if (head != null) {
Node.insertNodeAtEnd(head, node);
}
}
/*
 * Makes the neighbours, parent and children of "node" point back at it:
 * the previous sibling's next (or the parent's first child), the next
 * sibling's prev (or the parent's last child) and every child's parent.
 */
private void fixNodeLinks(Node node) {
    Node before = node.prev;
    Node after = node.next;
    if (before == null) {
        node.parent.content = node;
    } else {
        before.next = node;
    }
    if (after == null) {
        node.parent.last = node;
    } else {
        after.prev = node;
    }
    Node kid = node.content;
    while (kid != null) {
        kid.parent = node;
        kid = kid.next;
    }
}
/*
 * Removes the single child of "node" and promotes that child's children
 * to be direct children of "node". The caller must ensure that the node
 * has exactly one child.
 */
private void stripOnlyChild(Node node) {
    Node only = node.content;
    Node adoptedFirst = only.content;
    node.content = adoptedFirst;
    node.last = only.last;
    only.content = null;
    Node moved = adoptedFirst;
    while (moved != null) {
        moved.parent = node;
        moved = moved.next;
    }
}
/*
 * Removes "element" from the tree while keeping its children, which are
 * spliced into the parent's child list at the element's position (used to
 * strip font start and end tags). pnode receives the node that now stands
 * where the element was: its first child, or its next sibling when it had
 * no children.
 */
private void discardContainer(Node element, MutableObject pnode) {
    final Node parent = element.parent;
    final Node before = element.prev;
    final Node after = element.next;
    final Node first = element.content;
    if (first == null) {
        /* empty container: just unlink it */
        if (after == null) {
            parent.last = before;
        } else {
            after.prev = before;
        }
        if (before == null) {
            parent.content = after;
        } else {
            before.next = after;
        }
        pnode.setObject(after);
    } else {
        final Node lastChild = element.last;
        lastChild.next = after;
        if (after == null) {
            parent.last = lastChild;
        } else {
            after.prev = lastChild;
        }
        if (before == null) {
            parent.content = first;
        } else {
            first.prev = before;
            before.next = first;
        }
        /* walks on through the following siblings as well, which already have this parent */
        for (Node moved = first; moved != null; moved = moved.next) {
            moved.parent = parent;
        }
        pnode.setObject(first);
    }
    element.next = null;
    element.content = null;
}
/*
 * Adds a style declaration ("name: value") to the node. When the node
 * already has a style attribute the declaration is merged into its value
 * through addProperty; otherwise a new style attribute is created and put
 * at the front of the attribute list.
 */
private void addStyleProperty(Node node, String property) {
    AttVal styleAttr = node.attributes;
    while (styleAttr != null && !styleAttr.attribute.equals("style")) {
        styleAttr = styleAttr.next;
    }
    if (styleAttr == null) {
        AttVal created = new AttVal(node.attributes, null, '"', "style", property);
        created.dict = AttributeTable.getDefaultAttributeTable().findAttribute(created);
        node.attributes = created;
        return;
    }
    styleAttr.value = addProperty(styleAttr.value, property);
}
/*
 * Builds a new style string from the declarations in s1 and s2. Both are
 * parsed into one name-sorted list; when the same property name occurs
 * more than once, the first value encountered (s1 before s2) is kept.
 */
private String mergeProperties(String s1, String s2) {
    StyleProp merged = createProps(createProps(null, s1), s2);
    return createPropString(merged);
}
/*
 * Folds the child's style attribute value into the node's style attribute.
 * If both have a value they are merged with mergeProperties (the node's
 * values take precedence); if only the child has one, a new style
 * attribute with that value is prepended to the node's attributes.
 */
private void mergeStyles(Node node, Node child) {
    AttVal childStyle = child.attributes;
    while (childStyle != null && !childStyle.attribute.equals("style")) {
        childStyle = childStyle.next;
    }
    AttVal ownStyle = node.attributes;
    while (ownStyle != null && !ownStyle.attribute.equals("style")) {
        ownStyle = ownStyle.next;
    }
    String inherited = (childStyle == null) ? null : childStyle.value;
    String own = (ownStyle == null) ? null : ownStyle.value;
    if (own != null) {
        if (inherited != null) {
            ownStyle.value = mergeProperties(own, inherited);
        }
        return;
    }
    if (inherited == null) {
        return;
    }
    /* node has no usable style value: copy the child's */
    AttVal created = new AttVal(node.attributes, null, '"', "style", inherited);
    created.dict = AttributeTable.getDefaultAttributeTable().findAttribute(created);
    node.attributes = created;
}
/*
 * Maps the value of a font "size" attribute to a CSS font-size value.
 *   - a leading digit 0..6 selects from a fixed percentage table (the entry
 *     for 3 is null, meaning no font-size is needed);
 *   - "-N" (N = 0..6) yields 0.8^N as a percentage, any other "-..." yields
 *     "smaller";
 *   - otherwise, if the second character is a digit 0..6 (e.g. "+2") the
 *     result is 1.2^N as a percentage, else "larger".
 */
private String fontSize2Name(String size) {
    final String[] absolute = { "60%", "70%", "80%", null, "120%", "150%", "200%" };
    final int length = size.length();
    if (length == 0) {
        return "larger"; /* "140%" */
    }
    final char lead = size.charAt(0);
    if (lead >= '0' && lead <= '6') {
        return absolute[lead - '0'];
    }
    final boolean shrinking = lead == '-';
    if (length > 1) {
        final char step = size.charAt(1);
        if (step >= '0' && step <= '6') {
            final double factor = shrinking ? 0.8 : 1.2;
            double scale = 1.0;
            for (int k = step - '0'; k > 0; --k) {
                scale *= factor;
            }
            scale *= 100.0;
            return "" + (int) scale + "%";
        }
    }
    return shrinking ? "smaller" /* "70%" */ : "larger" /* "140%" */;
}
/* Adds "font-family: <face>" to the node's style. */
private void addFontFace(Node node, String face) {
    final String declaration = "font-family: " + face;
    addStyleProperty(node, declaration);
}
/*
 * Applies a font size to the node. A paragraph with size 6, 5 or 4 is
 * turned into an h1, h2 or h3 respectively; in every other case the size
 * is converted with fontSize2Name and, unless that yields null, added as a
 * font-size style property.
 */
private void addFontSize(Node node, String size) {
    String heading = size.equals("6") ? "h1" : size.equals("5") ? "h2" : size.equals("4") ? "h3" : null;
    if (heading != null && node.tag == tt.tagP) {
        node.element = heading;
        tt.findTag(node);
        return;
    }
    String cssSize = fontSize2Name(size);
    if (cssSize == null) {
        return;
    }
    addStyleProperty(node, "font-size: " + cssSize);
}
/* Adds "color: <color>" to the node's style. */
private void addFontColor(Node node, String color) {
    final String declaration = "color: " + color;
    addStyleProperty(node, declaration);
}
/*
 * Adds "text-align: <align>" to the node's style, with the value forced to
 * lower case. A fixed locale is used for the conversion so the result does
 * not depend on the JVM default locale (under a Turkish default locale
 * "RIGHT" would otherwise become "r\u0131ght").
 */
private void addAlign(Node node, String align) {
    addStyleProperty(node, "text-align: " + align.toLowerCase(java.util.Locale.ENGLISH));
}
/*
 * Walks the attribute list starting at "av" and adds the style properties
 * corresponding to each face, size and color attribute to the node. Other
 * attributes are ignored.
 */
private void addFontStyles(Node node, AttVal av) {
    for (AttVal cur = av; cur != null; cur = cur.next) {
        String name = cur.attribute;
        if (name.equals("face")) {
            addFontFace(node, cur.value);
        } else if (name.equals("size")) {
            addFontSize(node, cur.value);
        } else if (name.equals("color")) {
            addFontColor(node, cur.value);
        }
    }
}
/*
 * Symptom: <p align=center> Action: <p style="text-align: center">
 * The first align attribute is unlinked from the node; if it has a value,
 * that value is added as a text-align style property.
 */
private void textAlign(Lexer lexer, Node node) {
    AttVal before = null;
    AttVal cur = node.attributes;
    while (cur != null && !cur.attribute.equals("align")) {
        before = cur;
        cur = cur.next;
    }
    if (cur == null) {
        return;
    }
    /* unlink first, since addAlign may prepend a new style attribute */
    if (before == null) {
        node.attributes = cur.next;
    } else {
        before.next = cur.next;
    }
    if (cur.value != null) {
        addAlign(node, cur.value);
    }
}
/*
* The clean up rules use the pnode argument to return the next node when
* the original node has been deleted
*/
/*
 * Symptom: <dir>, <ul> or <ol> whose only child is an implicit <li>.
 * Action: coerce the list to a <div> with "margin-left: 2em" and strip
 * the <li>, leaving its content directly inside the div.
 * Returns true when the rule was applied. pnode is not modified.
 */
private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) {
    boolean listLike = node.tag == tt.tagDir || node.tag == tt.tagUl || node.tag == tt.tagOl;
    if (!listLike) {
        return false;
    }
    Node only = node.content;
    /* need exactly one child, and it must be an <li> that was inferred */
    if (only == null || only.next != null) {
        return false;
    }
    if (only.tag != tt.tagLi || !only.implicit) {
        return false;
    }
    node.tag = tt.tagDiv;
    node.element = "div";
    addStyleProperty(node, "margin-left: 2em");
    stripOnlyChild(node);
    return true;
}
/*
* Symptom: <center> Action: replace <center> by <div
* style="text-align: center">
*
* When lexer.configuration.DropFontTags is set the element is discarded
* instead: its content (if any) stays in place and an inferred <br> is
* inserted after it, or in place of the element when it was empty.
* Returns true whenever the node is a <center>; in the DropFontTags case
* pnode is set by discardContainer to the node that replaced the element.
*/
private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) {
if (node.tag == tt.tagCenter) {
if (lexer.configuration.DropFontTags) {
if (node.content != null) {
/* remember the last child: the <br> is linked in right after it */
Node last = node.last;
Node parent = node.parent;
discardContainer(node, pnode);
node = lexer.inferredTag("br");
if (last.next != null) {
last.next.prev = node;
}
node.next = last.next;
last.next = node;
node.prev = last;
if (parent.last == last) {
parent.last = node;
}
node.parent = parent;
} else {
/* empty <center>: the <br> takes over the element's position */
Node prev = node.prev;
Node next = node.next;
Node parent = node.parent;
discardContainer(node, pnode);
node = lexer.inferredTag("br");
node.next = next;
node.prev = prev;
node.parent = parent;
if (next != null) {
next.prev = node;
} else {
parent.last = node;
}
if (prev != null) {
prev.next = node;
} else {
parent.content = node;
}
}
/* NOTE(review): pnode may now hold null (empty <center> without a next sibling) */
return true;
}
node.tag = tt.tagDiv;
node.element = "div";
addStyleProperty(node, "text-align: center");
return true;
}
return false;
}
/*
 * Symptom: <div><div>...</div></div> Action: merge the two divs. This is
 * useful after nested <dir>s used by Word for indenting have been
 * converted to <div>s. The rule applies only when the inner div is the
 * sole child; its style is merged into the outer div before it is
 * stripped. Returns true when the rule was applied.
 */
private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) {
    if (node.tag != tt.tagDiv) {
        return false;
    }
    Node inner = node.content;
    if (inner == null || inner.tag != tt.tagDiv || inner.next != null) {
        return false;
    }
    mergeStyles(node, inner);
    stripOnlyChild(node);
    return true;
}
/*
* Symptom: <ul><li><ul>...</ul></li></ul> Action: discard outer list
*
* Applies to a <ul>/<ol> whose single child has, as its first child, a list
* of the same kind. The inner list is moved into the outer list's position.
* pnode is set to the outer list's former next sibling. Returns true when
* the rule was applied.
*/
private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) {
Node child, list;
if (node.tag == tt.tagUl || node.tag == tt.tagOl) {
child = node.content;
if (child == null) {
return false;
}
/* check child has no peers */
if (child.next != null) {
return false;
}
list = child.content;
if (list == null) {
return false;
}
/* inner list must be of the same kind as the outer one */
if (list.tag != node.tag) {
return false;
}
pnode.setObject(node.next);
/* move inner list node into position of outer node */
list.prev = node.prev;
list.next = node.next;
list.parent = node.parent;
fixNodeLinks(list);
/* get rid of outer ul and its li */
child.content = null;
node.content = null;
node.next = null;
/*
* If prev node was a list the chances are this node should be
* appended to that list. Word has no way of recognizing nested
* lists and just uses indents
*/
if (list.prev != null) {
/* from here on "node" is the moved inner list and "list" its previous sibling */
node = list;
list = node.prev;
if (list.tag == tt.tagUl || list.tag == tt.tagOl) {
/* unlink node from the sibling chain ... */
list.next = node.next;
if (list.next != null) {
list.next.prev = list;
}
/* ... and append it to the last item of the previous list */
/* NOTE(review): assumes the previous list has at least one child; list.last is not null-checked */
child = list.last; /* <li> */
node.parent = child;
node.next = null;
node.prev = child.last;
fixNodeLinks(node);
}
}
cleanNode(lexer, node);
return true;
}
return false;
}
/*
 * Symptom: the only child of a block-level element is a presentation
 * element such as B, I or FONT. Action: move the presentation onto the
 * block as style properties and strip the child, leaving its children.
 * Example:
 *   <p><b><font face="Arial">Draft Recommended Practice</font></b></p>
 * becomes (after repeated application):
 *   <p style="font-family: Arial; font-weight: bold">Draft Recommended
 *   Practice</p>
 * The rule is skipped for table, tr and li. It also converts the align
 * attribute into a text-align property, except on caption; this happens
 * even when the rule itself then does not apply.
 * Returns true when a child was stripped.
 */
private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) {
    final int blockModels = Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE;
    if ((node.tag.model & blockModels) == 0) {
        return false;
    }
    if (node.tag == tt.tagTable || node.tag == tt.tagTr || node.tag == tt.tagLi) {
        return false;
    }
    if (node.tag != tt.tagCaption) {
        textAlign(lexer, node);
    }
    Node only = node.content;
    /* the child must exist and have no peers */
    if (only == null || only.next != null) {
        return false;
    }
    if (only.tag == tt.tagB) {
        mergeStyles(node, only);
        addStyleProperty(node, "font-weight: bold");
    } else if (only.tag == tt.tagI) {
        mergeStyles(node, only);
        addStyleProperty(node, "font-style: italic");
    } else if (only.tag == tt.tagFont) {
        mergeStyles(node, only);
        addFontStyles(node, only.attributes);
    } else {
        return false;
    }
    stripOnlyChild(node);
    return true;
}
/*
 * Counterpart of blockStyle for table cells and inline elements such as
 * em (any element with the CM_INLINE or CM_ROW model, except font): when
 * the only child is a B, I or FONT element, its presentation is moved onto
 * the node as style properties and the child is stripped. B and I are only
 * handled when lexer.configuration.LogicalEmphasis is set.
 * Returns true when a child was stripped.
 */
private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) {
    if (node.tag == tt.tagFont) {
        return false;
    }
    if ((node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) == 0) {
        return false;
    }
    Node only = node.content;
    /* the child must exist and have no peers */
    if (only == null || only.next != null) {
        return false;
    }
    if (only.tag == tt.tagB && lexer.configuration.LogicalEmphasis) {
        mergeStyles(node, only);
        addStyleProperty(node, "font-weight: bold");
    } else if (only.tag == tt.tagI && lexer.configuration.LogicalEmphasis) {
        mergeStyles(node, only);
        addStyleProperty(node, "font-style: italic");
    } else if (only.tag == tt.tagFont) {
        mergeStyles(node, only);
        addFontStyles(node, only.attributes);
    } else {
        return false;
    }
    stripOnlyChild(node);
    return true;
}
/*
 * Replaces a font element by a span element whose only attribute is a
 * style attribute derived from the font's face, size and color.
 * With lexer.configuration.DropFontTags set, the font element is discarded
 * instead (its content stays, pnode receives the replacement node) and
 * false is returned. A font that is the only child of its parent is left
 * alone. Returns true when the element was converted to a span.
 */
private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) {
    if (node.tag != tt.tagFont) {
        return false;
    }
    if (lexer.configuration.DropFontTags) {
        discardContainer(node, pnode);
        return false;
    }
    boolean soleChild = node.parent.content == node && node.next == null;
    if (soleChild) {
        return false;
    }
    addFontStyles(node, node.attributes);
    /* keep only the style attribute (the last one, should there be several) */
    AttVal kept = null;
    AttVal following;
    for (AttVal cur = node.attributes; cur != null; cur = following) {
        following = cur.next;
        if (cur.attribute.equals("style")) {
            cur.next = null;
            kept = cur;
        }
    }
    node.attributes = kept;
    node.tag = tt.tagSpan;
    node.element = "span";
    return true;
}
/*
 * Applies all matching rules to a node, repeatedly, until none of them
 * fires. The rules are tried in this order: dir2Div, nestedList,
 * center2Div, mergeDivs, blockStyle, inlineStyle, font2Span. A rule may
 * replace or delete the node and report the node to continue with through
 * the MutableObject; processing then carries on with that node.
 *
 * Returns the node that ended up in the original node's position, which
 * may be null when the node was deleted and nothing followed it. The loop
 * stops as soon as the current node is null or not an element, so a rule
 * that reports "no successor" no longer causes a NullPointerException.
 */
private Node cleanNode(Lexer lexer, Node node) {
    MutableObject o = new MutableObject();
    Node next = node;
    while (node != null && node.isElement()) {
        o.setObject(next);
        /* short-circuit: the first rule that fires restarts the cycle */
        boolean applied = dir2Div(lexer, node, o)
            || nestedList(lexer, node, o)
            || center2Div(lexer, node, o)
            || mergeDivs(lexer, node, o)
            || blockStyle(lexer, node, o)
            || inlineStyle(lexer, node, o)
            || font2Span(lexer, node, o);
        /* a rule may have changed the successor even when it returned false */
        next = (Node) o.getObject();
        if (!applied) {
            break;
        }
        node = next;
    }
    return next;
}
/*
 * Cleans the subtree rooted at "node" bottom-up: every child is processed
 * first, then the rules are applied to the node itself via cleanNode.
 * Cleaning a child may replace or delete it, so iteration continues from
 * the node returned for that child; a null result means the child was
 * deleted with nothing after it, which ends the iteration.
 * Returns the result of cleanNode for "node".
 */
private Node createStyleProperties(Lexer lexer, Node node) {
    Node child = node.content;
    while (child != null) {
        Node survivor = createStyleProperties(lexer, child);
        child = (survivor != null) ? survivor.next : null;
    }
    return cleanNode(lexer, node);
}
/*
 * Converts style attributes into class attributes (see style2Rule) for
 * every node of the subtree, children before their parent.
 */
private void defineStyleRules(Lexer lexer, Node node) {
    Node kid = node.content;
    while (kid != null) {
        defineStyleRules(lexer, kid);
        kid = kid.next;
    }
    style2Rule(lexer, node);
}
/*
 * Entry point of the presentation clean-up: turns presentation markup in
 * the document into style properties and then, unless
 * lexer.configuration.MakeClean is set, replaces the style attributes by
 * classes and creates the corresponding style element.
 */
public void cleanTree(Lexer lexer, Node doc) {
    Node root = createStyleProperties(lexer, doc);
    if (lexer.configuration.MakeClean) {
        return;
    }
    defineStyleRules(lexer, root);
    createStyleElement(lexer, root);
}
/*
 * Simplifies <b><b> ... </b> ...</b> etc.: a b or i element whose parent
 * has the same tag is removed, keeping its content. Walks the sibling
 * chain starting at "node" and recurses into children.
 */
public void nestedEmphasis(Node node) {
    MutableObject successor = new MutableObject();
    Node cur = node;
    while (cur != null) {
        Node following = cur.next;
        boolean emphasis = cur.tag == tt.tagB || cur.tag == tt.tagI;
        if (emphasis && cur.parent != null && cur.parent.tag == cur.tag) {
            /* redundant inner element: continue with whatever replaced it */
            successor.setObject(following);
            discardContainer(cur, successor);
            cur = (Node) successor.getObject();
            continue;
        }
        if (cur.content != null) {
            nestedEmphasis(cur.content);
        }
        cur = following;
    }
}
/*
 * Replaces i by em and b by strong, for "node", its following siblings
 * and all of their descendants.
 */
public void emFromI(Node node) {
    for (Node cur = node; cur != null; cur = cur.next) {
        if (cur.tag == tt.tagI) {
            cur.element = tt.tagEm.name;
            cur.tag = tt.tagEm;
        } else if (cur.tag == tt.tagB) {
            cur.element = tt.tagStrong.name;
            cur.tag = tt.tagStrong;
        }
        if (cur.content != null) {
            emFromI(cur.content);
        }
    }
}
/*
 * Some people use dir or ul without an li to indent the content. The
 * pattern to look for is a list (an element handled by the list parser)
 * with a single implicit child. Working bottom-up, each such list is
 * replaced by an implicit blockquote holding the child's content.
 */
public void list2BQ(Node node) {
    for (Node cur = node; cur != null; cur = cur.next) {
        if (cur.content != null) {
            list2BQ(cur.content);
        }
        boolean listElement = cur.tag != null && cur.tag.parser == ParserImpl.getParseList();
        if (listElement && cur.hasOneChild() && cur.content.implicit) {
            stripOnlyChild(cur);
            cur.element = tt.tagBlockquote.name;
            cur.tag = tt.tagBlockquote;
            cur.implicit = true;
        }
    }
}
/*
 * Replaces each implicit blockquote by a div with a left margin, reducing
 * directly nested blockquotes to a single div whose margin matches the
 * nesting depth (2em per level). Walks the sibling chain starting at
 * "node" and recurses into children.
 */
public void bQ2Div(Node node) {
    while (node != null) {
        if (node.tag == tt.tagBlockquote && node.implicit) {
            int indent = 1;
            /* collapse a chain of single-child blockquotes into this node */
            while (node.hasOneChild() && node.content.tag == tt.tagBlockquote && node.implicit) {
                ++indent;
                stripOnlyChild(node);
            }
            if (node.content != null) {
                bQ2Div(node.content);
            }
            String indentBuf = "margin-left: " + (2 * indent) + "em";
            node.element = tt.tagDiv.name;
            node.tag = tt.tagDiv;
            node.addAttribute("style", indentBuf);
        } else if (node.content != null) {
            bQ2Div(node.content);
        }
        node = node.next;
    }
}
/*
 * node is <![if ...]>; prunes everything up to and including the matching
 * <![endif]>. Nested <![if ...]> sections are pruned recursively.
 * Returns the node following the discarded <![endif]>, or null when the
 * nodes run out first.
 */
public Node pruneSection(Lexer lexer, Node node) {
    Node cur = node;
    while (true) {
        /* discardElement removes the node and hands back the next one */
        cur = Node.discardElement(cur);
        if (cur == null) {
            return null;
        }
        if (cur.type != Node.SectionTag) {
            continue;
        }
        if (Lexer.getString(cur.textarray, cur.start, 2).equals("if")) {
            cur = pruneSection(lexer, cur);
            continue;
        }
        if (Lexer.getString(cur.textarray, cur.start, 5).equals("endif")) {
            return Node.discardElement(cur);
        }
    }
}
/*
 * Removes section tags from the sibling chain starting at "node" and from
 * all descendants. A section starting with "if" is pruned up to its
 * matching endif; any other section tag is discarded on its own.
 */
public void dropSections(Lexer lexer, Node node) {
    Node cur = node;
    while (cur != null) {
        if (cur.type != Node.SectionTag) {
            if (cur.content != null) {
                dropSections(lexer, cur.content);
            }
            cur = cur.next;
            continue;
        }
        boolean opensConditional = Lexer.getString(cur.textarray, cur.start, 2).equals("if");
        cur = opensConditional ? pruneSection(lexer, cur) : Node.discardElement(cur);
    }
}
/*
 * Removes class, style, lang and "x:"-prefixed attributes from the node,
 * as well as height and width on td, tr and th. The attribute
 * class="Code" is kept because it marks preformatted text. Attributes
 * without a name are kept.
 */
public void purgeAttributes(Node node) {
    AttVal kept = null; /* last attribute that stays in the list */
    AttVal attr = node.attributes;
    while (attr != null) {
        AttVal following = attr.next;
        String name = attr.attribute;
        boolean drop = false;
        if (name != null) {
            boolean codeClass = attr.value != null && name.equals("class") && attr.value.equals("Code");
            if (!codeClass) {
                boolean tableCellOrRow = node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh;
                boolean dimension = name.equals("height") || name.equals("width");
                drop = name.equals("class") || name.equals("style") || name.equals("lang")
                    || name.startsWith("x:") || (dimension && tableCellOrRow);
            }
        }
        if (drop) {
            if (kept == null) {
                node.attributes = following;
            } else {
                kept.next = following;
            }
        } else {
            kept = attr;
        }
        attr = following;
    }
}
/*
* Word2000 uses span excessively, so we strip span out.
* The span's content is cleaned with cleanWord2000, spliced into the
* parent in place of the span, and the span itself is discarded.
* Returns the node that followed the span.
*/
public Node stripSpan(Lexer lexer, Node span) {
Node node;
Node prev = null;
Node content;
/*
* deal with span elements that have content by splicing the content in
* place of the span after having processed it
*/
cleanWord2000(lexer, span.content);
content = span.content;
/* find the node after which the content is to be inserted */
if (span.prev != null) {
prev = span.prev;
} else if (content != null) {
/* span is the first child: move its first content node in front of it */
node = content;
content = content.next;
Node.removeNode(node);
Node.insertNodeBeforeElement(span, node);
prev = node;
}
/* move the remaining content nodes, one by one, after prev */
while (content != null) {
node = content;
content = content.next;
Node.removeNode(node);
Node.insertNodeAfterElement(prev, node);
prev = node;
}
if (span.next == null) {
span.parent.last = prev;
}
/* remember the successor before the span is discarded */
node = span.next;
span.content = null;
Node.discardElement(span);
return node;
}
/*
 * Maps non-breaking spaces (U+00A0) to regular spaces in every text node
 * of the sibling chain starting at "node" and in all descendants.
 *
 * The text is rewritten in place: characters are decoded from
 * node.textarray with PPrint.getUTF8 and written back with PPrint.putUTF8.
 * A replaced character can take less room than before, so node.end is
 * moved to the end of the rewritten text afterwards.
 */
private void normalizeSpaces(Lexer lexer, Node node) {
    while (node != null) {
        if (node.content != null) {
            normalizeSpaces(lexer, node.content);
        }
        if (node.type == Node.TextNode) {
            MutableInteger c = new MutableInteger();
            int p = node.start; /* write position, never ahead of the read position */
            for (int i = node.start; i < node.end; ++i) {
                /*
                 * Mask to an unsigned value: with a signed element the test
                 * below could never see a UTF-8 lead byte.
                 * NOTE(review): assumes textarray holds UTF-8 code units
                 * (0..255), as its use with getUTF8/putUTF8 suggests.
                 */
                c.value = node.textarray[i] & 0xFF;
                /* look for UTF-8 multibyte character */
                if (c.value > 0x7F) {
                    i += PPrint.getUTF8(node.textarray, i, c);
                }
                if (c.value == 160) {
                    c.value = ' ';
                }
                p = PPrint.putUTF8(node.textarray, p, c.value);
            }
            /* drop the stale tail left behind by shortened characters */
            node.end = p;
        }
        node = node.next;
    }
}
/*
 * This is a major clean up to strip out all the extra stuff you get when
 * you save as web page from Word 2000. It doesn't yet know what to do with
 * VML tags, but these will appear as errors unless you declare them as new
 * tags, such as o:p which needs to be declared as inline.
 *
 * Processes the sibling chain starting at "node" and recurses into
 * children. It discards style, meta and comment nodes, the "File-List"
 * link and empty paragraphs; strips span elements; turns runs of
 * <p class="MsoListBullet"> into <ul><li>...</li></ul> and runs of
 * <p class="Code"> into <pre>...</pre>; and purges Word-specific
 * attributes. Processing stops at an html element that lacks the xmlns:o
 * attribute, i.e. when the document is not a Word 2000 document.
 */
public void cleanWord2000(Lexer lexer, Node node) {
    /* container (<ul> or <pre>) collecting the current run of converted paragraphs */
    Node list = null;
    while (node != null) {
        /* discard Word's style verbiage */
        if (node.tag == tt.tagStyle || node.tag == tt.tagMeta || node.type == Node.CommentTag) {
            node = Node.discardElement(node);
            continue;
        }
        /* strip out all span tags Word scatters so liberally! */
        if (node.tag == tt.tagSpan) {
            node = stripSpan(lexer, node);
            continue;
        }
        /* get rid of Word's xmlns attributes */
        if (node.tag == tt.tagHtml) {
            /* check that it's a Word 2000 document */
            if (node.getAttrByName("xmlns:o") == null) {
                return;
            }
        }
        if (node.tag == tt.tagLink) {
            AttVal attr = node.getAttrByName("rel");
            if (attr != null && attr.value != null && attr.value.equals("File-List")) {
                node = Node.discardElement(node);
                continue;
            }
        }
        /* discard empty paragraphs */
        if (node.content == null && node.tag == tt.tagP) {
            node = Node.discardElement(node);
            continue;
        }
        if (node.tag == tt.tagP) {
            AttVal attr = node.getAttrByName("class");
            /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
            if (attr != null && attr.value != null && attr.value.equals("MsoListBullet")) {
                Node.coerceNode(lexer, node, tt.tagLi);
                if (list == null || list.tag != tt.tagUl) {
                    list = lexer.inferredTag("ul");
                    Node.insertNodeBeforeElement(node, list);
                }
                purgeAttributes(node);
                if (node.content != null) {
                    cleanWord2000(lexer, node.content);
                }
                /* remove node and append to contents of list */
                Node.removeNode(node);
                Node.insertNodeAtEnd(list, node);
                /*
                 * Resume with the sibling after the list from the top of the
                 * loop: it may be null, and it must go through the paragraph
                 * checks itself so that the run continues in the same list.
                 */
                node = list.next;
                continue;
            }
            /* map sequence of <p class="Code"> to <pre>...</pre> */
            else if (attr != null && attr.value != null && attr.value.equals("Code")) {
                Node br = lexer.newLineNode();
                normalizeSpaces(lexer, node);
                if (list == null || list.tag != tt.tagPre) {
                    list = lexer.inferredTag("pre");
                    Node.insertNodeBeforeElement(node, list);
                }
                /* remove node and append to contents of list */
                Node.removeNode(node);
                Node.insertNodeAtEnd(list, node);
                /* splice the paragraph's content into the pre and drop the paragraph */
                stripSpan(lexer, node);
                Node.insertNodeAtEnd(list, br);
                /* same reasoning as for the bullet case above */
                node = list.next;
                continue;
            } else {
                list = null;
            }
        } else {
            list = null;
        }
        /* strip out style and class attributes */
        if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
            purgeAttributes(node);
        }
        if (node.content != null) {
            cleanWord2000(lexer, node.content);
        }
        node = node.next;
    }
}
/*
 * Reports whether the document looks like Word 2000 output, i.e. whether
 * its html element carries an xmlns:o attribute. The tag table passed in
 * is used for the lookup of the html element.
 */
public boolean isWord2000(Node root, TagTable tt) {
    Node html = root.findHTML(tt);
    if (html == null) {
        return false;
    }
    return html.getAttrByName("xmlns:o") != null;
}
}