Clean.java example

Explorer
citrus-tool-master
/*
 * Copyright 2010 Alibaba Group Holding Limited.
 * All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/*
 * @(#)Clean.java   1.11 2000/08/16
 *
 */

package org.w3c.tidy;

/**
 * Clean up misuse of presentation markup (c) 1998-2000 (W3C) MIT, INRIA, Keio
 * University See Tidy.java for the copyright notice. Derived from <a
 * href="http://www.w3.org/People/Raggett/tidy"> HTML Tidy Release 4 Aug
 * 2000</a>
 *
 * @author Dave Raggett <dsr@w3.org>
 * @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
 * @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
 */

/*
 * Filters from other formats such as Microsoft Word often make excessive use of
 * presentation markup such as font tags, B, I, and the align attribute. By
 * applying a set of production rules, it is straightforward to transform this
 * to use CSS. Some rules replace some of the children of an element by style
 * properties on the element, e.g. <p><b>...</b></p> -> <p
 * style="font-weight: bold">...</p> Such rules are applied to the element's
 * content and then to the element itself until no more rules apply.
 * Having applied all the rules to an element, it will have a style attribute
 * with one or more properties. Other rules strip the element they apply to,
 * replacing it by style properties on the contents, e.g.
 * <dir><li><p>...</li></dir> -> <p style="margin-left: 1em">... These rules are
 * applied to an element before processing its content and replace the current
 * element by the first element in the exposed content. After applying both sets
 * of rules, you can replace the style attribute by a class value and style rule
 * in the document head. To support this, an association of styles and class
 * names is built. A naive approach is to rely on string matching to test when
 * two property lists are the same. A better approach would be to first sort the
 * properties before matching.
 */

public class Clean {

    /* Sequence number appended to "c" when generating a new class name; starts at 1. */
    private int classNum = 1;

    /* Tag table used to recognise element kinds (tt.tagP, tt.tagDiv, ...) and to re-resolve renamed nodes. */
    private TagTable tt;

    /*
     * Creates a cleaner that identifies elements through the given tag table.
     */
    public Clean(TagTable tt) {
        this.tt = tt;
    }

    /*
     * Insert a name/value pair into a property list that is kept sorted by
     * ascending property name. If a property with the same name is already
     * present the list is left untouched (the existing value wins). Returns
     * the head of the resulting list, which is the new entry when it sorts
     * before every existing one or when the list was empty.
     */
    private StyleProp insertProperty(StyleProp props, String name, String value) {
        StyleProp before = null;
        StyleProp cursor = props;

        /* walk forward until we meet the first entry that sorts after name */
        for (; cursor != null; before = cursor, cursor = cursor.next) {
            int order = cursor.name.compareTo(name);

            if (order == 0) {
                /* already defined: keep the old value */
                return props;
            }

            if (order > 0) {
                break;
            }
        }

        /* either link in front of cursor, or append at the tail */
        StyleProp created = (cursor != null) ? new StyleProp(name, value, cursor) : new StyleProp(name, value);

        if (before == null) {
            return created;
        }

        before.next = created;
        return props;
    }

    /*
     * Parse a style string of the form "name: value; name: value ..." and
     * insert each pair into the sorted property list headed by prop (which
     * may be null), returning the new head of the list.
     *
     * Only space characters are skipped, and only in front of a name and in
     * front of a value; trailing spaces are kept. Parsing stops at the first
     * declaration that has no ':' after it.
     */
    private StyleProp createProps(StyleProp prop, String style) {
        final int length = style.length();
        int pos = 0;

        while (pos < length) {
            /* skip blanks in front of the property name */
            while (pos < length && style.charAt(pos) == ' ') {
                pos++;
            }

            int colon = style.indexOf(':', pos);

            if (colon < 0) {
                /* no further "name:" part, nothing more to parse */
                break;
            }

            /* skip blanks in front of the value */
            int valueStart = colon + 1;

            while (valueStart < length && style.charAt(valueStart) == ' ') {
                valueStart++;
            }

            /* the value runs up to the next ';' or to the end of the string */
            int semicolon = style.indexOf(';', valueStart);
            int valueEnd = (semicolon < 0) ? length : semicolon;

            prop = insertProperty(prop, style.substring(pos, colon), style.substring(valueStart, valueEnd));

            if (semicolon < 0) {
                break;
            }

            pos = semicolon + 1;
        }

        return prop;
    }

    /*
     * Serialise a property list as "name: value; name: value", i.e. entries
     * are separated by "; " and there is no trailing separator. An empty
     * (null) list yields the empty string.
     */
    private String createPropString(StyleProp props) {
        StringBuilder style = new StringBuilder();

        for (StyleProp prop = props; prop != null; prop = prop.next) {
            /* separator goes in front of every entry except the first */
            if (prop != props) {
                style.append("; ");
            }

            style.append(prop.name).append(": ").append(prop.value);
        }

        return style.toString();
    }

    /*
     * Return a style string holding the properties of style followed by those
     * of property, sorted by name. Properties are inserted in that order, so a
     * name that occurs in both keeps the value found in style.
     */
    private String addProperty(String style, String property) {
        StyleProp merged = createProps(createProps(null, style), property);

        return createPropString(merged);
    }

    /*
     * Generate the next class name: "c" followed by the current value of
     * classNum, which is then incremented. The tag argument is not used.
     */
    private String gensymClass(String tag) {
        String generated = "c" + classNum;

        classNum += 1;
        return generated;
    }

    /*
     * Look up the class name registered in lexer.styles for this tag and
     * exact property string. When there is no such entry, a new one with a
     * freshly generated class name is pushed onto the front of lexer.styles.
     * Returns the class name in either case.
     */
    private String findStyle(Lexer lexer, String tag, String properties) {
        Style known = lexer.styles;

        while (known != null) {
            if (known.tag.equals(tag) && known.properties.equals(properties)) {
                return known.tagClass;
            }

            known = known.next;
        }

        /* not seen before: register it at the head of the dictionary */
        Style created = new Style(tag, gensymClass(tag), properties, lexer.styles);

        lexer.styles = created;
        return created.tagClass;
    }

    /*
     * Replace the node's style attribute by a class attribute. The class name
     * comes from the style dictionary (findStyle), which creates a new entry
     * when the tag/style combination has not been seen before. Nodes without
     * a style attribute are left alone.
     */
    private void style2Rule(Lexer lexer, Node node) {
        AttVal inlineStyle = node.getAttrByName("style");

        if (inlineStyle == null) {
            return;
        }

        String generated = findStyle(lexer, node.element, inlineStyle.value);
        AttVal existingClass = node.getAttrByName("class");

        if (existingClass == null) {
            /* no class yet: turn the style attribute itself into the class attribute */
            inlineStyle.attribute = "class";
            inlineStyle.value = generated;
            return;
        }

        /* class already present: append the new name after a space and drop style */
        existingClass.value = existingClass.value + " " + generated;
        node.removeAttribute(inlineStyle);
    }

    /*
     * Append the rule "<selector> { color: <color> }" plus a newline to the
     * lexer buffer. Nothing is written when color is null.
     */
    private void addColorRule(Lexer lexer, String selector, String color) {
        if (color == null) {
            return;
        }

        lexer.addStringLiteral(selector);
        lexer.addStringLiteral(" { color: ");
        lexer.addStringLiteral(color);
        lexer.addStringLiteral(" }\n");
    }

    /*
     * Move presentation attributes from the body element into style rules
     * written to the lexer buffer, removing each attribute from the body:
     *
     *   background="foo" -> body { background-image: url(foo) }
     *   bgcolor="foo"    -> body { background-color: foo }
     *   text="foo"       -> body { color: foo }
     *   link="foo"       -> :link { color: foo }
     *   vlink="foo"      -> :visited { color: foo }
     *   alink="foo"      -> :active { color: foo }
     */
    private void cleanBodyAttrs(Lexer lexer, Node body) {
        final String[] bodyAttrs = { "background", "bgcolor", "text" };
        final String[] openers = { "  background-image: url(", "  background-color: ", "  color: " };
        final String[] closers = { ");\n", ";\n", ";\n" };
        final String[] values = new String[bodyAttrs.length];
        boolean anyValue = false;

        /* detach the three attributes that feed the "body" rule, remembering their values */
        for (int i = 0; i < bodyAttrs.length; i++) {
            AttVal found = body.getAttrByName(bodyAttrs[i]);

            if (found == null) {
                continue;
            }

            values[i] = found.value;
            found.value = null;
            body.removeAttribute(found);

            if (values[i] != null) {
                anyValue = true;
            }
        }

        /* emit a single body rule holding whichever declarations have a value */
        if (anyValue) {
            lexer.addStringLiteral(" body {\n");

            for (int i = 0; i < values.length; i++) {
                if (values[i] != null) {
                    lexer.addStringLiteral(openers[i]);
                    lexer.addStringLiteral(values[i]);
                    lexer.addStringLiteral(closers[i]);
                }
            }

            lexer.addStringLiteral(" }\n");
        }

        /* link colours each become their own pseudo-class rule */
        final String[] linkAttrs = { "link", "vlink", "alink" };
        final String[] selectors = { " :link", " :visited", " :active" };

        for (int i = 0; i < linkAttrs.length; i++) {
            AttVal found = body.getAttrByName(linkAttrs[i]);

            if (found != null) {
                addColorRule(lexer, selectors[i], found.value);
                body.removeAttribute(found);
            }
        }
    }

    /*
     * Report whether the document body is free of the presentation attributes
     * background, bgcolor, text, link, vlink and alink. When one of them is
     * found, Report.USING_BODY is OR-ed into lexer.badLayout and false is
     * returned. A document without a body counts as nice.
     */
    private boolean niceBody(Lexer lexer, Node doc) {
        Node body = doc.findBody(lexer.configuration.tt);

        if (body == null) {
            return true;
        }

        final String[] presentational = { "background", "bgcolor", "text", "link", "vlink", "alink" };

        for (int i = 0; i < presentational.length; i++) {
            if (body.getAttrByName(presentational[i]) != null) {
                lexer.badLayout |= Report.USING_BODY;
                return false;
            }
        }

        return true;
    }

    /*
     * Create a style element (type="text/css") using the rules from the style
     * dictionary in lexer.styles, preceded by any rules derived from the
     * body's presentation attributes, and append it to the document head.
     *
     * Returns without doing anything when the dictionary is empty and
     * niceBody() reports no presentation attributes on the body. Note that
     * when the document has no head, the element is built but not inserted.
     */
    private void createStyleElement(Lexer lexer, Node doc) {
        Node node, head, body;
        Style style;
        AttVal av;

        /* nothing to emit: no class rules and no body attributes to convert */
        if (lexer.styles == null && niceBody(lexer, doc)) {
            return;
        }

        node = lexer.newNode(Node.StartTag, null, 0, 0, "style");
        node.implicit = true;

        /* insert type attribute */
        av = new AttVal(null, null, '"', "type", "text/css");
        av.dict = AttributeTable.getDefaultAttributeTable().findAttribute(av);
        node.attributes = av;

        body = doc.findBody(lexer.configuration.tt);

        /* the style sheet text is accumulated in the lexer buffer from here on */
        lexer.txtstart = lexer.lexsize;

        if (body != null) {
            cleanBodyAttrs(lexer, body);
        }

        /* one rule per dictionary entry: " tag.class {properties}\n" */
        for (style = lexer.styles; style != null; style = style.next) {
            lexer.addCharToLexer(' ');
            lexer.addStringLiteral(style.tag);
            lexer.addCharToLexer('.');
            lexer.addStringLiteral(style.tagClass);
            lexer.addCharToLexer(' ');
            lexer.addCharToLexer('{');
            lexer.addStringLiteral(style.properties);
            lexer.addCharToLexer('}');
            lexer.addCharToLexer('\n');
        }

        lexer.txtend = lexer.lexsize;

        /* wrap the buffer range written above in a text node under the style element */
        Node.insertNodeAtEnd(node, lexer.newNode(Node.TextNode, lexer.lexbuf, lexer.txtstart, lexer.txtend));

        /*
         * now insert style element into document head doc is root node. search
         * its children for html node the head node should be first child of
         * html node
         */

        head = doc.findHEAD(lexer.configuration.tt);

        if (head != null) {
            Node.insertNodeAtEnd(head, node);
        }
    }

    /*
     * Make the links around node consistent with node's own prev, next and
     * parent fields: the neighbours (or, at either end of the sibling list,
     * the parent's content/last pointers) are pointed back at node, and every
     * child of node gets node as its parent.
     */
    private void fixNodeLinks(Node node) {
        if (node.prev == null) {
            /* first in the sibling list */
            node.parent.content = node;
        } else {
            node.prev.next = node;
        }

        if (node.next == null) {
            /* last in the sibling list */
            node.parent.last = node;
        } else {
            node.next.prev = node;
        }

        Node child = node.content;

        while (child != null) {
            child.parent = node;
            child = child.next;
        }
    }

    /*
     * Remove the single child of node, promoting that child's own children to
     * be the children of node. Must only be called when node has exactly one
     * child.
     */
    private void stripOnlyChild(Node node) {
        Node only = node.content;
        Node grandchild = only.content;

        /* node adopts the child's content list */
        node.content = grandchild;
        node.last = only.last;
        only.content = null;

        while (grandchild != null) {
            grandchild.parent = node;
            grandchild = grandchild.next;
        }
    }

    /*
     * Remove element from the tree while keeping its children, which are
     * spliced into the parent's child list at the position element occupied.
     * Used to strip font start and end tags.
     *
     * pnode receives the node to continue with: the first former child, or
     * element's next sibling (possibly null) when element was empty.
     */
    private void discardContainer(Node element, MutableObject pnode) {
        Node parent = element.parent;

        if (element.content == null) {
            /* empty container: just unlink it from its siblings */
            if (element.next == null) {
                parent.last = element.prev;
            } else {
                element.next.prev = element.prev;
            }

            if (element.prev == null) {
                parent.content = element.next;
            } else {
                element.prev.next = element.next;
            }

            pnode.setObject(element.next);
        } else {
            /* join the tail of the content to whatever followed element */
            element.last.next = element.next;

            if (element.next == null) {
                parent.last = element.last;
            } else {
                element.next.prev = element.last;
            }

            /* join the head of the content to whatever preceded element */
            if (element.prev == null) {
                parent.content = element.content;
            } else {
                element.content.prev = element.prev;
                element.prev.next = element.content;
            }

            Node moved = element.content;

            while (moved != null) {
                moved.parent = parent;
                moved = moved.next;
            }

            pnode.setObject(element.content);
        }

        element.next = null;
        element.content = null;
    }

    /*
     * Add a style property (e.g. "font-weight: bold") to node. If the node
     * already has a style attribute the property is merged into its value via
     * addProperty; otherwise a new style attribute is created at the front of
     * the attribute list.
     *
     * A style attribute that exists but has no value simply takes the new
     * property as its value, instead of failing on the null string. The name
     * comparison is written so that an attribute without a name is skipped
     * rather than dereferenced.
     */
    private void addStyleProperty(Node node, String property) {
        AttVal styleAttr = node.attributes;

        while (styleAttr != null && !"style".equals(styleAttr.attribute)) {
            styleAttr = styleAttr.next;
        }

        if (styleAttr == null) {
            /* no style attribute yet: create one */
            AttVal created = new AttVal(node.attributes, null, '"', "style", property);

            created.dict = AttributeTable.getDefaultAttributeTable().findAttribute(created);
            node.attributes = created;
            return;
        }

        if (styleAttr.value == null) {
            /* valueless style attribute: nothing to merge with */
            styleAttr.value = property;
        } else {
            styleAttr.value = addProperty(styleAttr.value, property);
        }
    }

    /*
     * Create a new string that consists of the combined style properties in
     * s1 and s2. Both strings are parsed into one linked list that is kept
     * sorted by property name, s1 first; when the same property name occurs
     * again, the value inserted first is kept and the later one is ignored.
     */
    private String mergeProperties(String s1, String s2) {
        StyleProp combined = createProps(createProps(null, s1), s2);

        return createPropString(combined);
    }

    /*
     * Fold the style attribute of child into node. When both carry a style
     * value the two are merged into node's attribute; when only the child has
     * one, node gets a new style attribute with the child's value. The child
     * itself is not modified.
     */
    private void mergeStyles(Node node, Node child) {
        /* locate the child's style attribute, if any */
        AttVal childStyle = child.attributes;

        while (childStyle != null && !childStyle.attribute.equals("style")) {
            childStyle = childStyle.next;
        }

        String childProps = (childStyle == null) ? null : childStyle.value;

        /* locate the node's own style attribute, if any */
        AttVal ownStyle = node.attributes;

        while (ownStyle != null && !ownStyle.attribute.equals("style")) {
            ownStyle = ownStyle.next;
        }

        String ownProps = (ownStyle == null) ? null : ownStyle.value;

        if (ownProps != null) {
            if (childProps != null) {
                /* merge styles from both */
                ownStyle.value = mergeProperties(ownProps, childProps);
            }

            return;
        }

        if (childProps != null) {
            /* copy style of child */
            AttVal copied = new AttVal(node.attributes, null, '"', "style", childProps);

            copied.dict = AttributeTable.getDefaultAttributeTable().findAttribute(copied);
            node.attributes = copied;
        }
    }

    /*
     * Translate the value of a font size attribute into a CSS font-size value.
     *
     * - A leading digit 0..6 is looked up in a fixed table; size 3 maps to
     *   null, meaning no font-size property is needed. (An earlier version of
     *   the table started "50%", "60%", "80%".)
     * - "-n" with n in 0..6 gives 0.8 to the power n as a percentage.
     * - Any other leading character followed by a digit 0..6 (typically "+n")
     *   gives 1.2 to the power n as a percentage.
     * - Otherwise "smaller" after a leading '-', else "larger".
     *
     * Percentages are truncated to whole numbers.
     */
    private String fontSize2Name(String size) {
        String[] sizes = { "60%", "70%", "80%", null, "120%", "150%", "200%" };
        char lead = (size.length() > 0) ? size.charAt(0) : '\0';

        /* absolute size */
        if (lead >= '0' && lead <= '6') {
            return sizes[lead - '0'];
        }

        boolean shrinking = (lead == '-');

        /* relative size: scale by one factor per step */
        if (size.length() > 1) {
            char digit = size.charAt(1);

            if (digit >= '0' && digit <= '6') {
                double factor = shrinking ? 0.8 : 1.2;
                double scale = 1.0;

                for (int step = digit - '0'; step > 0; step--) {
                    scale *= factor;
                }

                scale *= 100.0;
                return "" + (int) scale + "%";
            }
        }

        return shrinking ? "smaller" /* "70%" */ : "larger" /* "140%" */;
    }

    /* Add a font-family style property with the given face to node. */
    private void addFontFace(Node node, String face) {
        String declaration = "font-family: " + face;

        addStyleProperty(node, declaration);
    }

    /*
     * Apply a font size to node. A paragraph with size 6, 5 or 4 is renamed
     * to h1, h2 or h3 respectively (and its tag looked up again) instead of
     * being styled. In all other cases the size is converted with
     * fontSize2Name and, unless that yields null, added as a font-size style
     * property.
     */
    private void addFontSize(Node node, String size) {
        if (node.tag == tt.tagP) {
            String heading = null;

            if (size.equals("6")) {
                heading = "h1";
            } else if (size.equals("5")) {
                heading = "h2";
            } else if (size.equals("4")) {
                heading = "h3";
            }

            if (heading != null) {
                node.element = heading;
                tt.findTag(node);
                return;
            }
        }

        String value = fontSize2Name(size);

        if (value == null) {
            return;
        }

        addStyleProperty(node, "font-size: " + value);
    }

    /* Add a color style property with the given color value to node. */
    private void addFontColor(Node node, String color) {
        String declaration = "color: " + color;

        addStyleProperty(node, declaration);
    }

    /*
     * Add a text-align style property to node, forcing the alignment value to
     * lower case. The conversion uses a fixed English locale so that the
     * result does not depend on the JVM's default locale (under a Turkish
     * default, for instance, "RIGHT" would otherwise become "rıght").
     */
    private void addAlign(Node node, String align) {
        addStyleProperty(node, "text-align: " + align.toLowerCase(java.util.Locale.ENGLISH));
    }

    /*
     * Add style properties to node corresponding to the face, size and color
     * attributes found in the attribute list starting at av. Other attributes
     * are ignored, and so are attributes that have no value, since there is
     * nothing to translate for them.
     */
    private void addFontStyles(Node node, AttVal av) {
        for (AttVal attr = av; attr != null; attr = attr.next) {
            if (attr.value == null) {
                /* e.g. <font size> without a value */
                continue;
            }

            if ("face".equals(attr.attribute)) {
                addFontFace(node, attr.value);
            } else if ("size".equals(attr.attribute)) {
                addFontSize(node, attr.value);
            } else if ("color".equals(attr.attribute)) {
                addFontColor(node, attr.value);
            }
        }
    }

    /*
     * Symptom: <p align=center> Action: <p style="text-align: center">
     *
     * Only the first align attribute is handled. It is unlinked from the
     * node's attribute list and, if it has a value, replaced by a text-align
     * style property. The lexer argument is not used.
     */
    private void textAlign(Lexer lexer, Node node) {
        AttVal previous = null;
        AttVal current = node.attributes;

        while (current != null && !current.attribute.equals("align")) {
            previous = current;
            current = current.next;
        }

        if (current == null) {
            return;
        }

        /* unlink the align attribute before the style attribute is touched */
        if (previous == null) {
            node.attributes = current.next;
        } else {
            previous.next = current.next;
        }

        if (current.value != null) {
            addAlign(node, current.value);
        }
    }

    /*
     * The clean up rules use the pnode argument to return the next node when
     * the original node has been deleted
     */

    /*
     * Symptom: <dir> <li> (or <ul>/<ol> <li>) where the <li> is the only
     * child and was inferred rather than written in the source. Action:
     * coerce the list to a <div> with a 2em left margin and drop the <li>,
     * keeping its content. Returns true when the rule was applied. The lexer
     * and pnode arguments are not used.
     */

    private boolean dir2Div(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag != tt.tagDir && node.tag != tt.tagUl && node.tag != tt.tagOl) {
            return false;
        }

        Node item = node.content;

        /* need exactly one child, and it must be an implicit <li> */
        if (item == null || item.next != null || item.tag != tt.tagLi || !item.implicit) {
            return false;
        }

        /* coerce dir to div */
        node.tag = tt.tagDiv;
        node.element = "div";
        addStyleProperty(node, "margin-left: 2em");
        stripOnlyChild(node);
        return true;
    }

    /*
     * Symptom: <center> Action: replace <center> by <div
     * style="text-align: center">
     *
     * When the DropFontTags option is set, the <center> element is discarded
     * instead: its content (if any) is kept in place and an inferred <br> is
     * inserted after it, or in place of the element when it was empty. In
     * that case pnode is set by discardContainer to the node to continue
     * with. Returns true whenever node is a <center> element.
     */

    private boolean center2Div(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag == tt.tagCenter) {
            if (lexer.configuration.DropFontTags) {
                if (node.content != null) {
                    /* remember the last child before the container is unlinked */
                    Node last = node.last;
                    Node parent = node.parent;

                    discardContainer(node, pnode);

                    /* from here on node refers to the new <br>, linked in right after last */
                    node = lexer.inferredTag("br");

                    if (last.next != null) {
                        last.next.prev = node;
                    }

                    node.next = last.next;
                    last.next = node;
                    node.prev = last;

                    if (parent.last == last) {
                        parent.last = node;
                    }

                    node.parent = parent;
                } else {
                    /* empty <center>: the <br> takes over its position between prev and next */
                    Node prev = node.prev;
                    Node next = node.next;
                    Node parent = node.parent;
                    discardContainer(node, pnode);

                    node = lexer.inferredTag("br");
                    node.next = next;
                    node.prev = prev;
                    node.parent = parent;

                    if (next != null) {
                        next.prev = node;
                    } else {
                        parent.last = node;
                    }

                    if (prev != null) {
                        prev.next = node;
                    } else {
                        parent.content = node;
                    }
                }

                return true;
            }
            /* default behaviour: keep the element but turn it into a centred div */
            node.tag = tt.tagDiv;
            node.element = "div";
            addStyleProperty(node, "text-align: center");
            return true;
        }

        return false;
    }

    /*
     * Symptom <div><div>...</div></div> Action: merge the two divs. This is
     * useful after nested <dir>s used by Word for indenting have been
     * converted to <div>s. The rule applies only when the inner div is the
     * sole child of the outer one; the inner div's style is merged into the
     * outer div before the inner div is stripped. The lexer and pnode
     * arguments are not used.
     */
    private boolean mergeDivs(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag != tt.tagDiv) {
            return false;
        }

        Node inner = node.content;
        boolean soleDivChild = inner != null && inner.tag == tt.tagDiv && inner.next == null;

        if (!soleDivChild) {
            return false;
        }

        mergeStyles(node, inner);
        stripOnlyChild(node);
        return true;
    }

    /*
     * Symptom: <ul><li><ul>...</ul></li></ul> Action: discard outer list
     *
     * Applies when node is a <ul>/<ol> with exactly one child whose first
     * child is a list of the same kind as node. The inner list takes the
     * place of the outer one; if the preceding sibling is itself a list, the
     * inner list is then moved to the end of that sibling's last item.
     * pnode is set to the outer list's original next sibling (which may be
     * null). Returns true when the rule was applied.
     */

    private boolean nestedList(Lexer lexer, Node node, MutableObject pnode) {
        Node child, list;

        if (node.tag == tt.tagUl || node.tag == tt.tagOl) {
            child = node.content;

            if (child == null) {
                return false;
            }

            /* check child has no peers */

            if (child.next != null) {
                return false;
            }

            list = child.content;

            if (list == null) {
                return false;
            }

            if (list.tag != node.tag) {
                return false;
            }

            /* NOTE(review): only the first child of the item is examined; any siblings of the inner list appear to be dropped along with the item - confirm */
            pnode.setObject(node.next);

            /* move inner list node into position of outer node */
            list.prev = node.prev;
            list.next = node.next;
            list.parent = node.parent;
            fixNodeLinks(list);

            /* get rid of outer ul and its li */
            child.content = null;
            node.content = null;
            node.next = null;

            /*
             * If prev node was a list the chances are this node should be
             * appended to that list. Word has no way of recognizing nested
             * lists and just uses indents
             */

            if (list.prev != null) {
                /* from here on node is the inner list and list is its preceding sibling */
                node = list;
                list = node.prev;

                if (list.tag == tt.tagUl || list.tag == tt.tagOl) {
                    /* unlink node from the sibling chain */
                    /* NOTE(review): when node was the last sibling, the old parent's last pointer does not appear to be reset to list here - confirm */
                    list.next = node.next;

                    if (list.next != null) {
                        list.next.prev = list;
                    }

                    /* NOTE(review): assumes the preceding list has at least one item; list.last == null would fail below - confirm */
                    child = list.last; /* <li> */

                    /* append node as the last child of that item */
                    node.parent = child;
                    node.next = null;
                    node.prev = child.last;
                    fixNodeLinks(node);
                }
            }

            /* apply the clean-up rules to the relocated list; the returned node is ignored */
            cleanNode(lexer, node);
            return true;
        }

        return false;
    }

    /*
     * Symptom: the only child of a block-level element is a presentation
     * element such as B, I or FONT. Action: add the equivalent style to the
     * block and strip the presentation element, leaving its children.
     * Example: <p><b>Draft Recommended Practice</b></p> becomes
     * <p style="font-weight: bold">Draft Recommended Practice</p>.
     *
     * The rule is considered for elements whose content model is block, list,
     * definition list or table, except table, tr and li. It also replaces the
     * align attribute by a text-align style property, except on caption.
     * Note that the align replacement happens even when the method goes on
     * to return false. The pnode argument is not used.
     */
    private boolean blockStyle(Lexer lexer, Node node, MutableObject pnode) {
        if ((node.tag.model & (Dict.CM_BLOCK | Dict.CM_LIST | Dict.CM_DEFLIST | Dict.CM_TABLE)) == 0) {
            return false;
        }

        if (node.tag == tt.tagTable || node.tag == tt.tagTr || node.tag == tt.tagLi) {
            return false;
        }

        /* check for align attribute */
        if (node.tag != tt.tagCaption) {
            textAlign(lexer, node);
        }

        Node only = node.content;

        /* the rule needs exactly one child */
        if (only == null || only.next != null) {
            return false;
        }

        if (only.tag == tt.tagB) {
            mergeStyles(node, only);
            addStyleProperty(node, "font-weight: bold");
        } else if (only.tag == tt.tagI) {
            mergeStyles(node, only);
            addStyleProperty(node, "font-style: italic");
        } else if (only.tag == tt.tagFont) {
            mergeStyles(node, only);
            addFontStyles(node, only.attributes);
        } else {
            return false;
        }

        stripOnlyChild(node);
        return true;
    }

    /*
     * Same idea as blockStyle, for the only child of a table cell or of an
     * inline element such as em (content model inline or row; font elements
     * themselves are excluded). A sole B or I child is folded into a style
     * property only when the LogicalEmphasis option is set; a sole FONT child
     * is always folded. The pnode argument is not used.
     */
    private boolean inlineStyle(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag == tt.tagFont || (node.tag.model & (Dict.CM_INLINE | Dict.CM_ROW)) == 0) {
            return false;
        }

        Node only = node.content;

        /* the rule needs exactly one child */
        if (only == null || only.next != null) {
            return false;
        }

        String emphasis = null;

        if (only.tag == tt.tagB && lexer.configuration.LogicalEmphasis) {
            emphasis = "font-weight: bold";
        } else if (only.tag == tt.tagI && lexer.configuration.LogicalEmphasis) {
            emphasis = "font-style: italic";
        } else if (only.tag != tt.tagFont) {
            return false;
        }

        mergeStyles(node, only);

        if (emphasis != null) {
            addStyleProperty(node, emphasis);
        } else {
            addFontStyles(node, only.attributes);
        }

        stripOnlyChild(node);
        return true;
    }

    /*
     * Replace font elements by span elements, deleting the font element's
     * attributes and replacing them by a single style attribute.
     *
     * With the DropFontTags option the font element is discarded instead
     * (its content is kept, pnode receives the node to continue with) and
     * false is returned. A font element that is the only child of its parent
     * is left alone. Returns true only when the element was turned into a
     * span.
     */
    private boolean font2Span(Lexer lexer, Node node, MutableObject pnode) {
        if (node.tag != tt.tagFont) {
            return false;
        }

        if (lexer.configuration.DropFontTags) {
            discardContainer(node, pnode);
            return false;
        }

        /* if FONT is only child of parent element then leave alone */
        if (node.parent.content == node && node.next == null) {
            return false;
        }

        addFontStyles(node, node.attributes);

        /* keep the style attribute (the last one, should there be several) and drop the rest */
        AttVal kept = null;
        AttVal attr = node.attributes;

        while (attr != null) {
            AttVal following = attr.next;

            if (attr.attribute.equals("style")) {
                attr.next = null;
                kept = attr;
            }

            attr = following;
        }

        node.attributes = kept;
        node.tag = tt.tagSpan;
        node.element = "span";
        return true;
    }

    /*
     * Applies all matching rules to a node.
     *
     * The rules are tried in a fixed order: dir2Div, nestedList, center2Div,
     * mergeDivs, blockStyle, inlineStyle, font2Span. Each rule may replace
     * the node through the shared MutableObject. As soon as one rule reports
     * success the whole sequence is restarted on the replacement; the loop
     * ends when no rule applies or the current node is not an element.
     * Returns the node left in the MutableObject by the last rule pass (the
     * original node if no rule touched it), which may be null.
     */
    private Node cleanNode(Lexer lexer, Node node) {
        Node next = node;
        MutableObject o = new MutableObject();

        /*
         * The null test matters: a rule can report success while leaving no
         * replacement behind, and the loop must then stop instead of
         * dereferencing the missing node.
         */
        while (node != null && node.isElement()) {
            o.setObject(next);

            /* short-circuit keeps the original rule order and stops at the first hit */
            boolean applied = dir2Div(lexer, node, o)
                              || nestedList(lexer, node, o)
                              || center2Div(lexer, node, o)
                              || mergeDivs(lexer, node, o)
                              || blockStyle(lexer, node, o)
                              || inlineStyle(lexer, node, o)
                              || font2Span(lexer, node, o);

            next = (Node) o.getObject();

            if (!applied) {
                break;
            }

            node = next;
        }

        return next;
    }

    /*
     * Applies the clean-up rules bottom-up: first to every child subtree of
     * node, then to node itself. Returns whatever cleanNode returns for
     * node.
     */
    private Node createStyleProperties(Lexer lexer, Node node) {
        Node child = node.content;

        while (child != null) {
            /* the child may have been replaced; continue from the node handed back */
            child = createStyleProperties(lexer, child);

            /*
             * The recursive call can hand back null (cleanNode returns the
             * rule's replacement, which may be absent); stop instead of
             * dereferencing it.
             */
            if (child != null) {
                child = child.next;
            }
        }

        return cleanNode(lexer, node);
    }

    /*
     * Walks the tree depth-first and calls style2Rule on every node, the
     * children of a node being visited before the node itself.
     */
    private void defineStyleRules(Lexer lexer, Node node) {
        Node kid = node.content;

        while (kid != null) {
            defineStyleRules(lexer, kid);
            kid = kid.next;
        }

        style2Rule(lexer, node);
    }

    /*
     * Entry point of the presentation-markup clean up. Runs
     * createStyleProperties over the document and, unless the MakeClean
     * option is set, follows up with defineStyleRules and
     * createStyleElement on the node it returned.
     */
    public void cleanTree(Lexer lexer, Node doc) {
        Node root = createStyleProperties(lexer, doc);

        if (lexer.configuration.MakeClean) {
            return;
        }

        defineStyleRules(lexer, root);
        createStyleElement(lexer, root);
    }

    /*
     * Simplifies <b><b> ... </b> ...</b> etc.: a b or i element whose parent
     * has the same tag is redundant, so its container is discarded. The walk
     * covers node, its following siblings and, recursively, their content.
     */
    public void nestedEmphasis(Node node) {
        MutableObject holder = new MutableObject();
        Node cur = node;

        while (cur != null) {
            Node following = cur.next;
            boolean emphasis = cur.tag == tt.tagB || cur.tag == tt.tagI;

            if (emphasis && cur.parent != null && cur.parent.tag == cur.tag) {
                /* strip redundant inner element and resume where discardContainer points */
                holder.setObject(following);
                discardContainer(cur, holder);
                cur = (Node) holder.getObject();
            } else {
                if (cur.content != null) {
                    nestedEmphasis(cur.content);
                }

                cur = following;
            }
        }
    }

    /*
     * Replaces i by em and b by strong, for node, its following siblings
     * and everything below them. Both the tag reference and the element
     * name of a matching node are updated.
     */
    public void emFromI(Node node) {
        for (Node cur = node; cur != null; cur = cur.next) {
            if (cur.tag == tt.tagB) {
                cur.tag = tt.tagStrong;
                cur.element = tt.tagStrong.name;
            } else if (cur.tag == tt.tagI) {
                cur.tag = tt.tagEm;
                cur.element = tt.tagEm.name;
            }

            if (cur.content != null) {
                emFromI(cur.content);
            }
        }
    }

    /*
     * Some people use dir or ul without an li to indent the content. The
     * pattern looked for is an element parsed as a list whose single child
     * is implicit. Such a list loses that child wrapper and becomes an
     * implicit blockquote. Content is processed before the node itself, so
     * nested lists are converted from the inside out.
     */
    public void list2BQ(Node node) {
        for (Node cur = node; cur != null; cur = cur.next) {
            if (cur.content != null) {
                list2BQ(cur.content);
            }

            /* only elements handled by the list parser qualify */
            if (cur.tag == null || cur.tag.parser != ParserImpl.getParseList()) {
                continue;
            }

            /* ... and only with exactly one, implicit, child */
            if (!cur.hasOneChild() || !cur.content.implicit) {
                continue;
            }

            stripOnlyChild(cur);
            cur.element = tt.tagBlockquote.name;
            cur.tag = tt.tagBlockquote;
            cur.implicit = true;
        }
    }

    /*
     * Replace implicit blockquote by div with an indent taking care to reduce
     * nested blockquotes to a single div with the indent set to match the
     * nesting depth. Each nesting level contributes 2em of left margin. The
     * walk covers node, its following siblings and their content.
     */
    public void bQ2Div(Node node) {
        while (node != null) {
            if (node.tag == tt.tagBlockquote && node.implicit) {
                int indent = 1;

                /* collapse a chain of single-child blockquotes into this node */
                while (node.hasOneChild() && node.content.tag == tt.tagBlockquote && node.implicit) {
                    ++indent;
                    stripOnlyChild(node);
                }

                if (node.content != null) {
                    bQ2Div(node.content);
                }

                /* plain concatenation; no need for the deprecated Integer constructor */
                String style = "margin-left: " + (2 * indent) + "em";

                node.element = tt.tagDiv.name;
                node.tag = tt.tagDiv;
                node.addAttribute("style", style);
            } else if (node.content != null) {
                bQ2Div(node.content);
            }

            node = node.next;
        }
    }

    /*
     * node is <![if ...]>; prune up to and including the matching
     * <![endif]>. Nested <![if ...]> sections are pruned recursively.
     * Returns the node following the matching endif, or null when the
     * sibling list ends before one is found.
     */
    public Node pruneSection(Lexer lexer, Node node) {
        /* discard the opening section tag itself; discardElement returns the next node */
        node = Node.discardElement(node);

        while (node != null) {
            if (node.type == Node.SectionTag) {
                if (Lexer.getString(node.textarray, node.start, 2).equals("if")) {
                    /*
                     * The nested call returns the first node after the inner
                     * endif. That node has not been looked at yet, so loop
                     * round and examine it rather than discarding it blindly:
                     * it may be the endif that closes this section.
                     */
                    node = pruneSection(lexer, node);
                    continue;
                }

                if (Lexer.getString(node.textarray, node.start, 5).equals("endif")) {
                    return Node.discardElement(node);
                }
            }

            /* anything else inside the section is dropped */
            node = Node.discardElement(node);
        }

        return null;
    }

    /*
     * Removes section tags from node, its following siblings and their
     * content. A section starting with "if" is pruned through pruneSection;
     * any other section tag is simply discarded.
     */
    public void dropSections(Lexer lexer, Node node) {
        Node cur = node;

        while (cur != null) {
            if (cur.type != Node.SectionTag) {
                if (cur.content != null) {
                    dropSections(lexer, cur.content);
                }

                cur = cur.next;
            } else if (Lexer.getString(cur.textarray, cur.start, 2).equals("if")) {
                /* prune up to matching endif; resume at the node handed back */
                cur = pruneSection(lexer, cur);
            } else {
                /* discard others as well */
                cur = Node.discardElement(cur);
            }
        }
    }

    /*
     * Unlinks attributes from node's attribute list: class, style, lang,
     * anything whose name starts with "x:", and height/width when the node
     * is a td, tr or th. A class attribute with the value "Code" is kept
     * (it denotes pre text), as are attributes without a name.
     */
    public void purgeAttributes(Node node) {
        /* last attribute that stays in the list; null while none has been kept */
        AttVal kept = null;
        AttVal cur = node.attributes;

        while (cur != null) {
            AttVal following = cur.next;
            String name = cur.attribute;
            boolean drop = false;

            if (name != null) {
                /* special check for class="Code" denoting pre text */
                boolean codeClass = name.equals("class") && "Code".equals(cur.value);

                if (!codeClass) {
                    boolean tableCellOrRow = node.tag == tt.tagTd || node.tag == tt.tagTr || node.tag == tt.tagTh;
                    boolean sizing = name.equals("height") || name.equals("width");

                    drop = name.equals("class") || name.equals("style") || name.equals("lang")
                           || name.startsWith("x:") || (sizing && tableCellOrRow);
                }
            }

            if (!drop) {
                kept = cur;
            } else if (kept != null) {
                kept.next = following;
            } else {
                node.attributes = following;
            }

            cur = following;
        }
    }

    /*
     * Word2000 uses span excessively, so we strip span out. The span's
     * content is first cleaned with cleanWord2000 and then spliced into the
     * parent in place of the span, which is discarded. Returns the node
     * that followed the span.
     */
    public Node stripSpan(Lexer lexer, Node span) {
        cleanWord2000(lexer, span.content);

        /* re-read the content: cleaning may have changed the first child */
        Node pending = span.content;

        /* node after which the next child is inserted */
        Node anchor = span.prev;

        if (anchor == null && pending != null) {
            /* nothing precedes the span: put the first child in front of it */
            Node first = pending;
            pending = pending.next;
            Node.removeNode(first);
            Node.insertNodeBeforeElement(span, first);
            anchor = first;
        }

        while (pending != null) {
            Node moved = pending;
            pending = pending.next;
            Node.removeNode(moved);
            Node.insertNodeAfterElement(anchor, moved);
            anchor = moved;
        }

        if (span.next == null) {
            span.parent.last = anchor;
        }

        Node following = span.next;
        span.content = null;
        Node.discardElement(span);
        return following;
    }

    /*
     * Map non-breaking spaces (code point 160) to regular spaces in every
     * text node found in node, its following siblings and their content.
     * The text is rewritten in place inside the node's textarray and the
     * node's end offset is pulled back when the text gets shorter.
     */
    private void normalizeSpaces(Lexer lexer, Node node) {
        while (node != null) {
            if (node.content != null) {
                normalizeSpaces(lexer, node.content);
            }

            if (node.type == Node.TextNode) {
                MutableInteger c = new MutableInteger();

                /* write position; never ahead of the read position i */
                int p = node.start;

                for (int i = node.start; i < node.end; ++i) {
                    /*
                     * Read the element as an unsigned value. Assumes
                     * textarray holds UTF-8 bytes (it is passed to
                     * PPrint.getUTF8/putUTF8) - TODO confirm. Without the
                     * mask a byte >= 0x80 sign-extends to a negative int,
                     * so neither the multibyte test nor the 160 test below
                     * could ever match.
                     */
                    c.value = node.textarray[i] & 0xFF;

                    /* look for UTF-8 multibyte character */
                    if (c.value > 0x7F) {
                        i += PPrint.getUTF8(node.textarray, i, c);
                    }

                    if (c.value == 160) {
                        c.value = ' ';
                    }

                    p = PPrint.putUTF8(node.textarray, p, c.value);
                }

                /*
                 * A replaced character may re-encode shorter than it was
                 * read, so cut the node at the write position instead of
                 * leaving stale trailing data inside it.
                 */
                node.end = p;
            }

            node = node.next;
        }
    }

    /*
     * This is a major clean up to strip out all the extra stuff you get when
     * you save as web page from Word 2000. It doesn't yet know what to do with
     * VML tags, but these will appear as errors unless you declare them as new
     * tags, such as o:p which needs to be declared as inline.
     *
     * Processes node, its following siblings and, recursively, their
     * content. Processing stops altogether when an html element without an
     * xmlns:o attribute is met, since the document is then not taken to be
     * a Word 2000 document.
     */
    public void cleanWord2000(Lexer lexer, Node node) {
        /* container (ul or pre) being built from a run of bulleted or Code p's */
        Node list = null;

        while (node != null) {
            /* discard Word's style verbiage */
            if (node.tag == tt.tagStyle || node.tag == tt.tagMeta || node.type == Node.CommentTag) {
                node = Node.discardElement(node);
                continue;
            }

            /* strip out all span tags Word scatters so liberally! */
            if (node.tag == tt.tagSpan) {
                node = stripSpan(lexer, node);
                continue;
            }

            /* get rid of Word's xmlns attributes */
            if (node.tag == tt.tagHtml) {
                /* check that it's a Word 2000 document */
                if (node.getAttrByName("xmlns:o") == null) {
                    return;
                }
            }

            if (node.tag == tt.tagLink) {
                AttVal attr = node.getAttrByName("rel");

                if (attr != null && attr.value != null && attr.value.equals("File-List")) {
                    node = Node.discardElement(node);
                    continue;
                }
            }

            /* discard empty paragraphs */
            if (node.content == null && node.tag == tt.tagP) {
                node = Node.discardElement(node);
                continue;
            }

            if (node.tag == tt.tagP) {
                AttVal attr = node.getAttrByName("class");

                /* map sequence of <p class="MsoListBullet"> to <ul>...</ul> */
                if (attr != null && attr.value != null && attr.value.equals("MsoListBullet")) {
                    Node.coerceNode(lexer, node, tt.tagLi);

                    if (list == null || list.tag != tt.tagUl) {
                        list = lexer.inferredTag("ul");
                        Node.insertNodeBeforeElement(node, list);
                    }

                    purgeAttributes(node);

                    if (node.content != null) {
                        cleanWord2000(lexer, node.content);
                    }

                    /* remove node and append to contents of list */
                    Node.removeNode(node);
                    Node.insertNodeAtEnd(list, node);

                    /*
                     * Resume with the sibling after the list and run it
                     * through the whole loop: it may be null (the paragraph
                     * was the last sibling) or the next paragraph of the
                     * same run, so it must not be purged and skipped here.
                     */
                    node = list.next;
                    continue;
                }

                /* map sequence of <p class="Code"> to <pre>...</pre> */
                if (attr != null && attr.value != null && attr.value.equals("Code")) {
                    Node br = lexer.newLineNode();

                    /* only this paragraph's text, not that of the siblings after it */
                    normalizeSpaces(lexer, node.content);

                    if (list == null || list.tag != tt.tagPre) {
                        list = lexer.inferredTag("pre");
                        Node.insertNodeBeforeElement(node, list);
                    }

                    /* remove node and append to contents of list */
                    Node.removeNode(node);
                    Node.insertNodeAtEnd(list, node);

                    /* splice the paragraph's content into the pre and end the line */
                    stripSpan(lexer, node);
                    Node.insertNodeAtEnd(list, br);

                    /* same reasoning as for the bullet case above */
                    node = list.next;
                    continue;
                }
            }

            /* any other node ends the current run */
            list = null;

            /* strip out style and class attributes */
            if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                purgeAttributes(node);
            }

            if (node.content != null) {
                cleanWord2000(lexer, node.content);
            }

            node = node.next;
        }
    }

    /*
     * Tells whether the document looks like a Word 2000 export: true when
     * root.findHTML(tt) yields an element and that element carries an
     * xmlns:o attribute.
     */
    public boolean isWord2000(Node root, TagTable tt) {
        Node html = root.findHTML(tt);

        if (html == null) {
            return false;
        }

        return html.getAttrByName("xmlns:o") != null;
    }
}