/*
* Copyright 2010 Alibaba Group Holding Limited.
* All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/*
* @(#)ParserImpl.java 1.11 2000/08/16
*
*/
package org.w3c.tidy;
/**
* HTML Parser implementation (c) 1998-2000 (W3C) MIT, INRIA, Keio University
* See Tidy.java for the copyright notice. Derived from <a
* href="http://www.w3.org/People/Raggett/tidy"> HTML Tidy Release 4 Aug
* 2000</a>
*
* @author Dave Raggett <dsr@w3.org>
* @author Andy Quick <ac.quick@sympatico.ca> (translation to Java)
* @version 1.11, 2000/08/16 Tidy Release 4 Aug 2000
*/
public class ParserImpl {
//private static int SeenBodyEndTag; /* AQ: moved into lexer structure */
/**
 * Hands an element that has just been added to the tree over to the content
 * parser registered for its tag.
 *
 * Local fix by GLP 2000-12-21: insertspace has to be reset for a tag that is
 * both non-inline and empty (base, link, meta, isindex, hr, area), so the
 * non-inline test runs before the empty-element early exit. Remove this
 * ordering once the fix is made in Tidy itself.
 *
 * @param lexer the lexer whose whitespace flags are updated
 * @param node the element to parse; its tag must not be null
 * @param mode the lexer mode forwarded to the tag's parser
 */
private static void parseTag(Lexer lexer, Node node, short mode) {
    /* anything that is not inline cancels a pending inserted space */
    if ((node.tag.model & Dict.CM_INLINE) == 0) {
        lexer.insertspace = false;
    }

    /* empty elements have no content to parse */
    if ((node.tag.model & Dict.CM_EMPTY) != 0) {
        lexer.waswhite = false;
        return;
    }

    /* only dispatch when a parser is registered and the tag is not <x/> */
    if (node.tag.parser != null && node.type != Node.StartEndTag) {
        node.tag.parser.parse(lexer, node, mode);
    }
}
/**
 * Relocates a head-only element that was found in the wrong place into the
 * document's head element.
 *
 * Tokens that are not start tags (or start-end tags) are reported as
 * discarded and otherwise ignored.
 *
 * @param lexer the lexer supplying the tag table and receiving warnings
 * @param element the element in which the misplaced node was encountered
 * @param node the misplaced node
 */
private static void moveToHead(Lexer lexer, Node element, Node node) {
    TagTable tt = lexer.configuration.tt;

    boolean opensElement = node.type == Node.StartTag || node.type == Node.StartEndTag;
    if (!opensElement) {
        Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
        return;
    }

    Report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);

    /* climb from the reporting element up to the html element */
    Node root = element;
    while (root.tag != tt.tagHtml) {
        root = root.parent;
    }

    /* the first head child of html, if any, receives the node */
    Node candidate = root.content;
    while (candidate != null && candidate.tag != tt.tagHead) {
        candidate = candidate.next;
    }
    if (candidate != null) {
        Node.insertNodeAtEnd(candidate, node);
    }

    if (node.tag.parser != null) {
        parseTag(lexer, node, Lexer.IgnoreWhitespace);
    }
}
/**
 * Content parser for the html element: locates or infers the head, parses
 * it, then locates or infers the body (or handles frameset / noframes
 * content) and parses that.
 */
public static class ParseHTML implements Parser {
    /**
     * Parses the children of the html element.
     *
     * @param lexer the token source; its XmlTags configuration flag and
     *            seenBodyEndTag counter are reset here
     * @param html the html element that receives head, body and frameset
     * @param mode the lexer mode forwarded to the head, body, frameset and
     *            noframes parsers
     */
    public void parse(Lexer lexer, Node html, short mode) {
        Node node, head;
        Node frameset = null;
        Node noframes = null;
        lexer.configuration.XmlTags = false;
        lexer.seenBodyEndTag = 0;
        TagTable tt = lexer.configuration.tt;

        /* phase 1: find the head element, or infer one */
        for (;;) {
            node = lexer.getToken(Lexer.IgnoreWhitespace);
            if (node == null) {
                node = lexer.inferredTag("head");
                break;
            }
            if (node.tag == tt.tagHead) {
                break;
            }
            if (node.tag == html.tag && node.type == Node.EndTag) {
                Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(html, node)) {
                continue;
            }
            /* anything else belongs after an implied head */
            lexer.ungetToken();
            node = lexer.inferredTag("head");
            break;
        }
        head = node;
        Node.insertNodeAtEnd(html, head);
        getParseHead().parse(lexer, head, mode);

        /* phase 2: find the body element, or handle a frameset document */
        for (;;) {
            node = lexer.getToken(Lexer.IgnoreWhitespace);
            if (node == null) {
                if (frameset == null) {
                    /*
                     * End of input without a body or frameset: create an
                     * empty body. The inferred element must be attached to
                     * the tree; previously it was assigned to a local and
                     * dropped by the return below.
                     */
                    node = lexer.inferredTag("body");
                    Node.insertNodeAtEnd(html, node);
                    parseTag(lexer, node, mode);
                }
                return;
            }
            /* robustly handle html tags */
            if (node.tag == html.tag) {
                if (node.type != Node.StartTag && frameset == null) {
                    Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                }
                continue;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(html, node)) {
                continue;
            }
            /* if frameset document coerce <body> to <noframes> */
            if (node.tag == tt.tagBody) {
                if (node.type != Node.StartTag) {
                    Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                if (frameset != null) {
                    lexer.ungetToken();
                    if (noframes == null) {
                        noframes = lexer.inferredTag("noframes");
                        Node.insertNodeAtEnd(frameset, noframes);
                        Report.warning(lexer, html, noframes, Report.INSERTING_TAG);
                    }
                    parseTag(lexer, noframes, mode);
                    continue;
                }
                break; /* to parse body */
            }
            /* flag an error if we see more than one frameset */
            if (node.tag == tt.tagFrameset) {
                if (node.type != Node.StartTag) {
                    Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                if (frameset != null) {
                    Report.error(lexer, html, node, Report.DUPLICATE_FRAMESET);
                } else {
                    frameset = node;
                }
                Node.insertNodeAtEnd(html, node);
                parseTag(lexer, node, mode);
                /*
                 * see if it includes a noframes element so that we can
                 * merge subsequent noframes elements
                 */
                for (node = frameset.content; node != null; node = node.next) {
                    if (node.tag == tt.tagNoframes) {
                        noframes = node;
                    }
                }
                continue;
            }
            /* if not a frameset document coerce <noframes> to <body> */
            if (node.tag == tt.tagNoframes) {
                if (node.type != Node.StartTag) {
                    Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                if (frameset == null) {
                    Report.warning(lexer, html, node, Report.DISCARDING_UNEXPECTED);
                    node = lexer.inferredTag("body");
                    break;
                }
                if (noframes == null) {
                    noframes = node;
                    Node.insertNodeAtEnd(frameset, noframes);
                }
                parseTag(lexer, noframes, mode);
                continue;
            }
            /* head-only elements found after the head are moved back into it */
            if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                if (node.tag != null && (node.tag.model & Dict.CM_HEAD) != 0) {
                    moveToHead(lexer, html, node);
                    continue;
                }
            }
            lexer.ungetToken();
            /* insert other content into noframes element */
            if (frameset != null) {
                if (noframes == null) {
                    noframes = lexer.inferredTag("noframes");
                    Node.insertNodeAtEnd(frameset, noframes);
                } else {
                    Report.warning(lexer, html, node, Report.NOFRAMES_CONTENT);
                }
                parseTag(lexer, noframes, mode);
                continue;
            }
            /* any other content implies the start of the body */
            node = lexer.inferredTag("body");
            break;
        }
        /* node must be body */
        Node.insertNodeAtEnd(html, node);
        parseTag(lexer, node, mode);
    }
}
;
/**
 * Content parser for the head element. Collects head-level elements until
 * the head's end tag, a text node, or an element that is not allowed in the
 * head is seen, and adds an inferred title when none was found.
 */
public static class ParseHead implements Parser {
    /**
     * Parses the children of the head element.
     *
     * @param lexer the token source
     * @param head the head element being filled
     * @param mode not consulted here; children are always parsed with
     *            Lexer.IgnoreWhitespace
     */
    public void parse(Lexer lexer, Node head, short mode) {
        int titleCount = 0;
        int baseCount = 0;
        final TagTable tt = lexer.configuration.tt;

        Node token;
        while ((token = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
            /* explicit </head> closes the element */
            if (token.type == Node.EndTag && token.tag == head.tag) {
                head.closed = true;
                break;
            }

            /* text cannot live in the head: give it back and stop */
            if (token.type == Node.TextNode) {
                lexer.ungetToken();
                break;
            }

            /* deal with comments etc. */
            if (Node.insertMisc(head, token)) {
                continue;
            }

            if (token.type == Node.DocTypeTag) {
                Node.insertDocType(lexer, head, token);
                continue;
            }

            /* discard unknown tags */
            if (token.tag == null) {
                Report.warning(lexer, head, token, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            /* an element that is not head content ends the head */
            if ((token.tag.model & Dict.CM_HEAD) == 0) {
                lexer.ungetToken();
                break;
            }

            boolean opensElement = token.type == Node.StartTag || token.type == Node.StartEndTag;
            if (!opensElement) {
                /* discard unexpected end tags */
                Report.warning(lexer, head, token, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            if (token.tag == tt.tagTitle) {
                if (++titleCount > 1) {
                    Report.warning(lexer, head, token, Report.TOO_MANY_ELEMENTS);
                }
            } else if (token.tag == tt.tagBase) {
                if (++baseCount > 1) {
                    Report.warning(lexer, head, token, Report.TOO_MANY_ELEMENTS);
                }
            } else if (token.tag == tt.tagNoscript) {
                Report.warning(lexer, head, token, Report.TAG_NOT_ALLOWED_IN);
            }

            Node.insertNodeAtEnd(head, token);
            parseTag(lexer, token, Lexer.IgnoreWhitespace);
        }

        /* every head gets a title, inferred if necessary */
        if (titleCount == 0) {
            Report.warning(lexer, head, null, Report.MISSING_TITLE_ELEMENT);
            Node.insertNodeAtEnd(head, lexer.inferredTag("title"));
        }
    }
}
;
/**
 * Content parser for the title element: accepts text and miscellaneous
 * nodes, and treats any other known tag as an implicit end of the title.
 */
public static class ParseTitle implements Parser {
    /**
     * Parses the children of the title element.
     *
     * @param lexer the token source
     * @param title the title element being filled
     * @param mode not consulted here; tokens are read with Lexer.MixedContent
     */
    public void parse(Lexer lexer, Node title, short mode) {
        Node token;
        while ((token = lexer.getToken(Lexer.MixedContent)) != null) {
            /* explicit </title> */
            if (token.type == Node.EndTag && token.tag == title.tag) {
                title.closed = true;
                Node.trimSpaces(lexer, title);
                return;
            }

            if (token.type == Node.TextNode) {
                /* leading space is trimmed only for the first child */
                if (title.content == null) {
                    Node.trimInitialSpace(lexer, title, token);
                }
                /* keep the text unless it has no characters left */
                if (token.start < token.end) {
                    Node.insertNodeAtEnd(title, token);
                }
                continue;
            }

            /* deal with comments etc. */
            if (Node.insertMisc(title, token)) {
                continue;
            }

            /* discard unknown tags */
            if (token.tag == null) {
                Report.warning(lexer, title, token, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            /* pushback unexpected tokens; they end the title implicitly */
            Report.warning(lexer, title, token, Report.MISSING_ENDTAG_BEFORE);
            lexer.ungetToken();
            Node.trimSpaces(lexer, title);
            return;
        }

        /* ran out of tokens before </title>; token is null at this point */
        Report.warning(lexer, title, token, Report.MISSING_ENDTAG_FOR);
    }
}
;
/**
 * Content parser for script-like elements whose content is fetched from the
 * lexer as a single CDATA node.
 */
public static class ParseScript implements Parser {
    /**
     * Reads the element's content through Lexer.getCDATA and appends it.
     *
     * Known limitation carried over from the original: this isn't quite
     * right for CDATA content, as tags within the content are recognised and
     * parsed accordingly. That unfortunately breaks scripts which include
     * "<" followed by a letter, "!", "?" or "/" plus a letter.
     *
     * @param lexer the token source
     * @param script the element receiving the CDATA node
     * @param mode not consulted here
     */
    public void parse(Lexer lexer, Node script, short mode) {
        Node cdata = lexer.getCDATA(script);
        if (cdata == null) {
            return;
        }
        Node.insertNodeAtEnd(script, cdata);
    }
}
;
/**
 * Content parser for the body element. Reads tokens until the lexer is
 * exhausted (or until control has to return to an enclosing noframes or to
 * the caller), appending text, block and inline content to the body.
 */
public static class ParseBody implements Parser {
/**
 * Parses the children of the body element.
 *
 * @param lexer the token source; seenBodyEndTag, excludeBlocks and versions
 *            are updated while parsing
 * @param body the body element being filled
 * @param mode ignored on entry: it is overwritten with
 *            Lexer.IgnoreWhitespace and then tracks whether the most recent
 *            content was inline (Lexer.MixedContent) or not
 */
public void parse(Lexer lexer, Node body, short mode) {
Node node;
boolean checkstack, iswhitenode;
mode = Lexer.IgnoreWhitespace;
/* checkstack: whether lexer.inlineDup should be consulted for the next text or inline start tag */
checkstack = true;
TagTable tt = lexer.configuration.tt;
while (true) {
node = lexer.getToken(mode);
if (node == null) {
break;
}
/* </body>: mark closed but keep reading, unless this body sits inside a noframes */
if (node.tag == body.tag && node.type == Node.EndTag) {
body.closed = true;
Node.trimSpaces(lexer, body);
lexer.seenBodyEndTag = 1;
mode = Lexer.IgnoreWhitespace;
if (body.parent.tag == tt.tagNoframes) {
break;
}
continue;
}
if (node.tag == tt.tagNoframes) {
/* a nested <noframes> start tag is parsed as a block */
if (node.type == Node.StartTag) {
Node.insertNodeAtEnd(body, node);
getParseBlock().parse(lexer, node, mode);
continue;
}
/* </noframes> closing the parent: hand the token back to the caller */
if (node.type == Node.EndTag && body.parent.tag == tt.tagNoframes) {
Node.trimSpaces(lexer, body);
lexer.ungetToken();
break;
}
}
/* frame or frameset inside a noframes body ends this body */
if ((node.tag == tt.tagFrame || node.tag == tt.tagFrameset) && body.parent.tag == tt.tagNoframes) {
Node.trimSpaces(lexer, body);
lexer.ungetToken();
break;
}
/* html tags are dropped; only start tags produce a warning */
if (node.tag == tt.tagHtml) {
if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
Report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
}
continue;
}
/* a text node of at most one character whose first byte is a space counts as white */
iswhitenode = false;
if (node.type == Node.TextNode && node.end <= node.start + 1
&& node.textarray[node.start] == (byte) ' ') {
iswhitenode = true;
}
/* deal with comments etc. */
if (Node.insertMisc(body, node)) {
continue;
}
/* warn once about content after </body>; the counter moves from 1 to 2 so the warning is not repeated */
if (lexer.seenBodyEndTag == 1 && !iswhitenode) {
++lexer.seenBodyEndTag;
Report.warning(lexer, body, node, Report.CONTENT_AFTER_BODY);
}
/* mixed content model permits text */
if (node.type == Node.TextNode) {
if (iswhitenode && mode == Lexer.IgnoreWhitespace) {
continue;
}
/* with EncloseBodyText set, non-white text is re-read inside an inferred <p> */
if (lexer.configuration.EncloseBodyText && !iswhitenode) {
Node para;
lexer.ungetToken();
para = lexer.inferredTag("p");
Node.insertNodeAtEnd(body, para);
parseTag(lexer, para, mode);
mode = Lexer.MixedContent;
continue;
} else {
/* strict doesn't allow text here */
lexer.versions &= ~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20);
}
if (checkstack) {
checkstack = false;
/* NOTE(review): a positive inlineDup result presumably means inline tags were queued for re-insertion, so this token is dropped here and read again - confirm against Lexer.inlineDup */
if (lexer.inlineDup(node) > 0) {
continue;
}
}
Node.insertNodeAtEnd(body, node);
mode = Lexer.MixedContent;
continue;
}
if (node.type == Node.DocTypeTag) {
Node.insertDocType(lexer, body, node);
continue;
}
/* discard unknown and PARAM tags */
if (node.tag == null || node.tag == tt.tagParam) {
Report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
continue;
}
/*
 * Netscape allows LI and DD directly in BODY. We infer UL or DL
 * respectively and use this boolean to exclude block-level
 * elements so as to match Netscape's observed behaviour.
 */
lexer.excludeBlocks = false;
/* tags that are neither block nor inline need special placement */
if (!((node.tag.model & Dict.CM_BLOCK) != 0) && !((node.tag.model & Dict.CM_INLINE) != 0)) {
/* avoid this error message being issued twice */
if (!((node.tag.model & Dict.CM_HEAD) != 0)) {
Report.warning(lexer, body, node, Report.TAG_NOT_ALLOWED_IN);
}
if ((node.tag.model & Dict.CM_HTML) != 0) {
/* copy body attributes if current body was inferred */
if (node.tag == tt.tagBody && body.implicit && body.attributes == null) {
body.attributes = node.attributes;
node.attributes = null;
}
continue;
}
if ((node.tag.model & Dict.CM_HEAD) != 0) {
moveToHead(lexer, body, node);
continue;
}
/* list, definition-list and table parts get an inferred container; the token is re-read inside it */
if ((node.tag.model & Dict.CM_LIST) != 0) {
lexer.ungetToken();
node = lexer.inferredTag("ul");
Node.addClass(node, "noindent");
lexer.excludeBlocks = true;
} else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
lexer.ungetToken();
node = lexer.inferredTag("dl");
lexer.excludeBlocks = true;
} else if ((node.tag.model & (Dict.CM_TABLE | Dict.CM_ROWGRP | Dict.CM_ROW)) != 0) {
lexer.ungetToken();
node = lexer.inferredTag("table");
lexer.excludeBlocks = true;
} else {
/*
 * AQ: The following line is from the official C version
 * of tidy. It doesn't make sense to me because the '!'
 * operator has higher precedence than the '&' operator.
 * It seems to me that the expression always evaluates
 * to 0. if (!node->tag->model & (CM_ROW | CM_FIELD))
 * AQ: 13Jan2000 fixed in C tidy
 */
if (!((node.tag.model & (Dict.CM_ROW | Dict.CM_FIELD)) != 0)) {
lexer.ungetToken();
return;
}
/* ignore </td> </th> <option> etc. */
continue;
}
}
if (node.type == Node.EndTag) {
/* </br> is treated as <br> */
if (node.tag == tt.tagBr) {
node.type = Node.StartTag;
} else if (node.tag == tt.tagP) {
/* stray </p>: the token is coerced to br and inserted, then a second, inferred br is handled by the start-tag branch below */
Node.coerceNode(lexer, node, tt.tagBr);
Node.insertNodeAtEnd(body, node);
node = lexer.inferredTag("br");
} else if ((node.tag.model & Dict.CM_INLINE) != 0) {
lexer.popInline(node);
}
}
if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
if ((node.tag.model & Dict.CM_INLINE) != 0 && !((node.tag.model & Dict.CM_MIXED) != 0)) {
/* HTML4 strict doesn't allow inline content here */
/* but HTML2 does allow img elements as children of body */
if (node.tag == tt.tagImg) {
lexer.versions &= ~Dict.VERS_HTML40_STRICT;
} else {
lexer.versions &= ~(Dict.VERS_HTML40_STRICT | Dict.VERS_HTML20);
}
if (checkstack && !node.implicit) {
checkstack = false;
if (lexer.inlineDup(node) > 0) {
continue;
}
}
mode = Lexer.MixedContent;
} else {
/* non-inline (or mixed) content re-arms the inline stack check */
checkstack = true;
mode = Lexer.IgnoreWhitespace;
}
if (node.implicit) {
Report.warning(lexer, body, node, Report.INSERTING_TAG);
}
Node.insertNodeAtEnd(body, node);
parseTag(lexer, node, mode);
continue;
}
/* discard unexpected tags */
Report.warning(lexer, body, node, Report.DISCARDING_UNEXPECTED);
}
}
}
;
/**
 * Content parser for the frameset element. Accepts elements whose content
 * model includes CM_FRAMES, relocates head-only elements, and turns a stray
 * body into an inferred noframes.
 */
public static class ParseFrameSet implements Parser {
    /**
     * Parses the children of a frameset element and records the use of
     * frames in lexer.badAccess.
     *
     * @param lexer the token source
     * @param frameset the frameset element being filled
     * @param mode not consulted here
     */
    public void parse(Lexer lexer, Node frameset, short mode) {
        final TagTable tt = lexer.configuration.tt;
        lexer.badAccess |= Report.USING_FRAMES;

        Node token;
        while ((token = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
            /* explicit </frameset> */
            if (token.type == Node.EndTag && token.tag == frameset.tag) {
                frameset.closed = true;
                Node.trimSpaces(lexer, frameset);
                return;
            }

            /* deal with comments etc. */
            if (Node.insertMisc(frameset, token)) {
                continue;
            }

            /* unknown tags are dropped */
            if (token.tag == null) {
                Report.warning(lexer, frameset, token, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            /* head-only elements are sent back to the head */
            if ((token.type == Node.StartTag || token.type == Node.StartEndTag)
                    && (token.tag.model & Dict.CM_HEAD) != 0) {
                moveToHead(lexer, frameset, token);
                continue;
            }

            /* a body here is re-read inside an inferred noframes */
            if (token.tag == tt.tagBody) {
                lexer.ungetToken();
                token = lexer.inferredTag("noframes");
                Report.warning(lexer, frameset, token, Report.INSERTING_TAG);
            }

            boolean isStartTag = token.type == Node.StartTag;
            if ((isStartTag || token.type == Node.StartEndTag)
                    && (token.tag.model & Dict.CM_FRAMES) != 0) {
                Node.insertNodeAtEnd(frameset, token);
                /* only a real start tag has content to parse */
                if (isStartTag) {
                    lexer.excludeBlocks = false;
                    parseTag(lexer, token, Lexer.MixedContent);
                }
                continue;
            }

            /* discard unexpected tags */
            Report.warning(lexer, frameset, token, Report.DISCARDING_UNEXPECTED);
        }

        /* input ended before </frameset>; token is null at this point */
        Report.warning(lexer, frameset, token, Report.MISSING_ENDTAG_FOR);
    }
}
;
/**
 * Content parser for inline elements, also used for some block-level
 * elements such as H1 to H6 (see the comment inside parse). Besides
 * collecting content it maintains the lexer's inline stack and repairs a
 * number of common nesting mistakes.
 */
public static class ParseInline implements Parser {
/**
 * Parses the content of element until its end tag, an implied end, or the
 * end of input.
 *
 * @param lexer the token source; its inline stack and badLayout flags are
 *            updated here
 * @param element the element being filled; the local variable is rebound
 *            to a clone when a heading or dt is split in two
 * @param mode Lexer.Preformatted is kept as given; any other value is
 *            replaced by Lexer.MixedContent
 */
public void parse(Lexer lexer, Node element, short mode) {
Node node, parent;
TagTable tt = lexer.configuration.tt;
if ((element.tag.model & Dict.CM_EMPTY) != 0) {
return;
}
/* an anchor without any attributes is discarded outright */
if (element.tag == tt.tagA) {
if (element.attributes == null) {
Report.warning(lexer, element.parent, element, Report.DISCARDING_UNEXPECTED);
Node.discardElement(element);
return;
}
}
/*
 * ParseInline is used for some block level elements like H1 to H6.
 * For such elements we need to insert inline emphasis tags
 * currently on the inline stack. For Inline elements, we normally
 * push them onto the inline stack provided they aren't implicit or
 * OBJECT/APPLET. This test is carried out in PushInline and
 * PopInline, see istack.c. We don't push A or SPAN to replicate
 * current browser behavior
 */
if ((element.tag.model & Dict.CM_BLOCK) != 0 || element.tag == tt.tagDt) {
lexer.inlineDup(null);
} else if ((element.tag.model & Dict.CM_INLINE) != 0 && element.tag != tt.tagA && element.tag != tt.tagSpan) {
lexer.pushInline(element);
}
/* record use of presentational elements */
if (element.tag == tt.tagNobr) {
lexer.badLayout |= Report.USING_NOBR;
} else if (element.tag == tt.tagFont) {
lexer.badLayout |= Report.USING_FONT;
}
/* Inline elements may or may not be within a preformatted element */
if (mode != Lexer.Preformatted) {
mode = Lexer.MixedContent;
}
while (true) {
node = lexer.getToken(mode);
if (node == null) {
break;
}
/* end tag for current element */
if (node.tag == element.tag && node.type == Node.EndTag) {
if ((element.tag.model & Dict.CM_INLINE) != 0 && element.tag != tt.tagA) {
lexer.popInline(node);
}
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
/*
 * if a font element wraps an anchor and nothing else then
 * move the font element inside the anchor since otherwise
 * it won't alter the anchor text color
 */
if (element.tag == tt.tagFont && element.content != null && element.content == element.last) {
Node child = element.content;
if (child.tag == tt.tagA) {
/* step 1: the anchor takes the font element's place among its siblings */
child.parent = element.parent;
child.next = element.next;
child.prev = element.prev;
if (child.prev != null) {
child.prev.next = child;
} else {
child.parent.content = child;
}
if (child.next != null) {
child.next.prev = child;
} else {
child.parent.last = child;
}
/* step 2: the font element becomes the anchor's only child and adopts the anchor's former children */
element.next = null;
element.prev = null;
element.parent = child;
element.content = child.content;
element.last = child.last;
child.content = element;
child.last = element;
for (child = element.content; child != null; child = child.next) {
child.parent = element;
}
}
}
element.closed = true;
Node.trimSpaces(lexer, element);
Node.trimEmptyElement(lexer, element);
return;
}
/* <u>...<u> map 2nd <u> to </u> if 1st is explicit */
/* otherwise emphasis nesting is probably unintentional */
/* big and small have cumulative effect so leave them alone */
if (node.type == Node.StartTag && node.tag == element.tag && lexer.isPushed(node) && !node.implicit
&& !element.implicit && node.tag != null && (node.tag.model & Dict.CM_INLINE) != 0
&& node.tag != tt.tagA && node.tag != tt.tagFont && node.tag != tt.tagBig
&& node.tag != tt.tagSmall) {
if (element.content != null && node.attributes == null) {
Report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG);
node.type = Node.EndTag;
lexer.ungetToken();
continue;
}
Report.warning(lexer, element, node, Report.NESTED_EMPHASIS);
}
if (node.type == Node.TextNode) {
/* only called for 1st child */
if (element.content == null && !((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
/* text with no characters left is dropped */
if (node.start >= node.end) {
continue;
}
Node.insertNodeAtEnd(element, node);
continue;
}
/* mixed content model so allow text */
if (Node.insertMisc(element, node)) {
continue;
}
/* deal with HTML tags */
if (node.tag == tt.tagHtml) {
if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
continue;
}
/* otherwise infer end of inline element */
lexer.ungetToken();
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
Node.trimEmptyElement(lexer, element);
return;
}
/* within <dt> or <pre> map <p> to <br> */
if (node.tag == tt.tagP
&& node.type == Node.StartTag
&& ((mode & Lexer.Preformatted) != 0 || element.tag == tt.tagDt || element
.isDescendantOf(tt.tagDt))) {
node.tag = tt.tagBr;
node.element = "br";
Node.trimSpaces(lexer, element);
Node.insertNodeAtEnd(element, node);
continue;
}
/* ignore unknown and PARAM tags */
if (node.tag == null || node.tag == tt.tagParam) {
Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
continue;
}
/* coerce </br> to <br>; after this the identical test inside the end-tag branch below can no longer match */
if (node.tag == tt.tagBr && node.type == Node.EndTag) {
node.type = Node.StartTag;
}
if (node.type == Node.EndTag) {
/* coerce </br> to <br> */
if (node.tag == tt.tagBr) {
node.type = Node.StartTag;
} else if (node.tag == tt.tagP) {
/* coerce unmatched </p> to <br><br> */
if (!element.isDescendantOf(tt.tagP)) {
Node.coerceNode(lexer, node, tt.tagBr);
Node.trimSpaces(lexer, element);
Node.insertNodeAtEnd(element, node);
/* NOTE(review): the inferred br is assigned but not inserted before the continue, so only one <br> is added here despite the comment above */
node = lexer.inferredTag("br");
continue;
}
} else if ((node.tag.model & Dict.CM_INLINE) != 0 && node.tag != tt.tagA
&& !((node.tag.model & Dict.CM_OBJECT) != 0) && (element.tag.model & Dict.CM_INLINE) != 0) {
/* allow any inline end tag to end current element */
lexer.popInline(element);
if (element.tag != tt.tagA) {
/* the outer condition already requires node.tag != tt.tagA, so the first branch below cannot be taken */
if (node.tag == tt.tagA && node.tag != element.tag) {
Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
lexer.ungetToken();
} else {
Report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG);
}
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
Node.trimEmptyElement(lexer, element);
return;
}
/*
 * if parent is <a> then discard unexpected inline end
 * tag
 */
Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
continue;
} /*
 * special case </tr> etc. for stuff moved in front of
 * table
 */ else if (lexer.exiled && node.tag.model != 0 && (node.tag.model & Dict.CM_TABLE) != 0) {
lexer.ungetToken();
Node.trimSpaces(lexer, element);
Node.trimEmptyElement(lexer, element);
return;
}
}
/* allow any header tag to end current header */
if ((node.tag.model & Dict.CM_HEADING) != 0 && (element.tag.model & Dict.CM_HEADING) != 0) {
if (node.tag == element.tag) {
Report.warning(lexer, element, node, Report.NON_MATCHING_ENDTAG);
} else {
Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
lexer.ungetToken();
}
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
Node.trimEmptyElement(lexer, element);
return;
}
/*
 * an <A> tag ends any open <A> element, but <A href=...> is
 * mapped to </A><A href=...>
 */
if (node.tag == tt.tagA && !node.implicit && lexer.isPushed(node)) {
/* coerce <a> to </a> unless it has some attributes */
if (node.attributes == null) {
node.type = Node.EndTag;
Report.warning(lexer, element, node, Report.COERCE_TO_ENDTAG);
lexer.popInline(node);
lexer.ungetToken();
continue;
}
lexer.ungetToken();
Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
lexer.popInline(element);
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
Node.trimEmptyElement(lexer, element);
return;
}
/* center, div and hr are not allowed inside headings: restructure around them */
if ((element.tag.model & Dict.CM_HEADING) != 0) {
if (node.tag == tt.tagCenter || node.tag == tt.tagDiv) {
if (node.type != Node.StartTag && node.type != Node.StartEndTag) {
Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
continue;
}
Report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
/* insert center as parent if heading is empty */
if (element.content == null) {
Node.insertNodeAsParent(element, node);
continue;
}
/* split heading and make center parent of 2nd part */
Node.insertNodeAfterElement(element, node);
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
/* parsing continues inside a fresh clone of the heading */
element = lexer.cloneNode(element);
element.start = lexer.lexsize;
element.end = lexer.lexsize;
Node.insertNodeAtEnd(node, element);
continue;
}
if (node.tag == tt.tagHr) {
if (node.type != Node.StartTag && node.type != Node.StartEndTag) {
Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
continue;
}
Report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
/* insert hr before heading if heading is empty */
if (element.content == null) {
Node.insertNodeBeforeElement(element, node);
continue;
}
/* split heading and insert hr before 2nd part */
Node.insertNodeAfterElement(element, node);
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
element = lexer.cloneNode(element);
element.start = lexer.lexsize;
element.end = lexer.lexsize;
Node.insertNodeAfterElement(node, element);
continue;
}
}
/* hr inside dt is wrapped in an inferred dd placed next to the dt */
if (element.tag == tt.tagDt) {
if (node.tag == tt.tagHr) {
Node dd;
if (node.type != Node.StartTag && node.type != Node.StartEndTag) {
Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
continue;
}
Report.warning(lexer, element, node, Report.TAG_NOT_ALLOWED_IN);
dd = lexer.inferredTag("dd");
/* insert hr within dd before dt if dt is empty */
if (element.content == null) {
Node.insertNodeBeforeElement(element, dd);
Node.insertNodeAtEnd(dd, node);
continue;
}
/* split dt and insert hr within dd before 2nd part */
Node.insertNodeAfterElement(element, dd);
Node.insertNodeAtEnd(dd, node);
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
element = lexer.cloneNode(element);
element.start = lexer.lexsize;
element.end = lexer.lexsize;
Node.insertNodeAfterElement(dd, element);
continue;
}
}
/*
 * if this is the end tag for an ancestor element then infer end
 * tag for this element
 */
if (node.type == Node.EndTag) {
for (parent = element.parent; parent != null; parent = parent.parent) {
if (node.tag == parent.tag) {
if (!((element.tag.model & Dict.CM_OPT) != 0) && !element.implicit) {
Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
}
if (element.tag == tt.tagA) {
lexer.popInline(element);
}
lexer.ungetToken();
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
Node.trimEmptyElement(lexer, element);
return;
}
}
}
/* block level tags end this element */
if (!((node.tag.model & Dict.CM_INLINE) != 0)) {
if (node.type != Node.StartTag) {
Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
continue;
}
if (!((element.tag.model & Dict.CM_OPT) != 0)) {
Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
}
if ((node.tag.model & Dict.CM_HEAD) != 0 && !((node.tag.model & Dict.CM_BLOCK) != 0)) {
moveToHead(lexer, element, node);
continue;
}
/*
 * prevent anchors from propagating into block tags except
 * for headings h1 to h6
 */
if (element.tag == tt.tagA) {
if (node.tag != null && !((node.tag.model & Dict.CM_HEADING) != 0)) {
lexer.popInline(element);
} else if (!(element.content != null)) {
/* an empty anchor in front of a heading is dropped */
Node.discardElement(element);
lexer.ungetToken();
return;
}
}
lexer.ungetToken();
if (!((mode & Lexer.Preformatted) != 0)) {
Node.trimSpaces(lexer, element);
}
Node.trimEmptyElement(lexer, element);
return;
}
/* parse inline element */
if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
if (node.implicit) {
Report.warning(lexer, element, node, Report.INSERTING_TAG);
}
/* trim white space before <br> */
if (node.tag == tt.tagBr) {
Node.trimSpaces(lexer, element);
}
Node.insertNodeAtEnd(element, node);
parseTag(lexer, node, mode);
continue;
}
/* discard unexpected tags */
Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
}
/* input ended without an end tag; node is null here */
if (!((element.tag.model & Dict.CM_OPT) != 0)) {
Report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR);
}
Node.trimEmptyElement(lexer, element);
}
}
;
/**
 * Content parser for list elements whose items are li elements. Anything
 * that is not an li is wrapped in an inferred li, and lists whose tag is
 * flagged CM_OBSOLETE are coerced to ul when they end.
 */
public static class ParseList implements Parser {
    /**
     * Parses the items of a list.
     *
     * @param lexer the token source
     * @param list the list element being filled
     * @param mode not consulted here; items are read and parsed with
     *            Lexer.IgnoreWhitespace
     */
    public void parse(Lexer lexer, Node list, short mode) {
        final TagTable tt = lexer.configuration.tt;
        if ((list.tag.model & Dict.CM_EMPTY) != 0) {
            return;
        }

        lexer.insert = -1; /* defer implicit inline start tags */

        Node item;
        while ((item = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
            /* the list's own end tag */
            if (item.type == Node.EndTag && item.tag == list.tag) {
                coerceIfObsolete(lexer, list, tt);
                list.closed = true;
                Node.trimEmptyElement(lexer, list);
                return;
            }

            /* deal with comments etc. */
            if (Node.insertMisc(list, item)) {
                continue;
            }

            /* unknown tags are dropped; text nodes are allowed to pass */
            if (item.tag == null && item.type != Node.TextNode) {
                Report.warning(lexer, list, item, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            if (item.type == Node.EndTag) {
                if (item.tag == tt.tagForm) {
                    lexer.badForm = 1;
                    Report.warning(lexer, list, item, Report.DISCARDING_UNEXPECTED);
                } else if (item.tag != null && (item.tag.model & Dict.CM_INLINE) != 0) {
                    Report.warning(lexer, list, item, Report.DISCARDING_UNEXPECTED);
                    lexer.popInline(item);
                } else if (closesAncestor(list, item)) {
                    /*
                     * this is the end tag for an ancestor element, so
                     * infer the end tag for this list
                     */
                    Report.warning(lexer, list, item, Report.MISSING_ENDTAG_BEFORE);
                    lexer.ungetToken();
                    coerceIfObsolete(lexer, list, tt);
                    Node.trimEmptyElement(lexer, list);
                    return;
                } else {
                    Report.warning(lexer, list, item, Report.DISCARDING_UNEXPECTED);
                }
                continue;
            }

            if (item.tag != tt.tagLi) {
                lexer.ungetToken();

                /* a block element ends the list while blocks are excluded */
                if (item.tag != null && (item.tag.model & Dict.CM_BLOCK) != 0 && lexer.excludeBlocks) {
                    Report.warning(lexer, list, item, Report.MISSING_ENDTAG_BEFORE);
                    Node.trimEmptyElement(lexer, list);
                    return;
                }

                /* otherwise the pushed-back token is re-read inside an inferred li */
                item = lexer.inferredTag("li");
                item.addAttribute("style", "list-style: none");
                Report.warning(lexer, list, item, Report.MISSING_STARTTAG);
            }

            /* item should be <LI> */
            Node.insertNodeAtEnd(list, item);
            parseTag(lexer, item, Lexer.IgnoreWhitespace);
        }

        /* input ended before the list's end tag; item is null here */
        coerceIfObsolete(lexer, list, tt);
        Report.warning(lexer, list, item, Report.MISSING_ENDTAG_FOR);
        Node.trimEmptyElement(lexer, list);
    }

    /* Coerces the list to ul when its current tag is flagged CM_OBSOLETE. */
    private static void coerceIfObsolete(Lexer lexer, Node list, TagTable tt) {
        if ((list.tag.model & Dict.CM_OBSOLETE) != 0) {
            Node.coerceNode(lexer, list, tt.tagUl);
        }
    }

    /* Tells whether endTag carries the same tag as some ancestor of list. */
    private static boolean closesAncestor(Node list, Node endTag) {
        for (Node ancestor = list.parent; ancestor != null; ancestor = ancestor.parent) {
            if (endTag.tag == ancestor.tag) {
                return true;
            }
        }
        return false;
    }
}
;
/**
 * Content parser for definition lists. Text is wrapped in an inferred dt,
 * other block or inline content in an inferred dd, and a center element
 * splits the list in two.
 */
public static class ParseDefList implements Parser {
    /**
     * Parses the entries of a definition list.
     *
     * @param lexer the token source
     * @param list the dl element being filled; after a center element is
     *            met, parsing continues in a newly inferred dl
     * @param mode forwarded only when parsing a center element; dt and dd
     *            are parsed with Lexer.IgnoreWhitespace
     */
    public void parse(Lexer lexer, Node list, short mode) {
        final TagTable tt = lexer.configuration.tt;
        if ((list.tag.model & Dict.CM_EMPTY) != 0) {
            return;
        }

        lexer.insert = -1; /* defer implicit inline start tags */

        /* the list currently being filled; replaced after a <center> split */
        Node dl = list;
        Node entry;
        while ((entry = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
            /* the list's own end tag */
            if (entry.type == Node.EndTag && entry.tag == dl.tag) {
                dl.closed = true;
                Node.trimEmptyElement(lexer, dl);
                return;
            }

            /* deal with comments etc. */
            if (Node.insertMisc(dl, entry)) {
                continue;
            }

            /* text is re-read inside an inferred dt */
            if (entry.type == Node.TextNode) {
                lexer.ungetToken();
                entry = lexer.inferredTag("dt");
                Report.warning(lexer, dl, entry, Report.MISSING_STARTTAG);
            }

            if (entry.tag == null) {
                Report.warning(lexer, dl, entry, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            if (entry.type == Node.EndTag) {
                if (entry.tag == tt.tagForm) {
                    lexer.badForm = 1;
                    Report.warning(lexer, dl, entry, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                /*
                 * if this is the end tag for an ancestor element then
                 * infer end tag for this element
                 */
                if (closesAncestor(dl, entry)) {
                    Report.warning(lexer, dl, entry, Report.MISSING_ENDTAG_BEFORE);
                    lexer.ungetToken();
                    Node.trimEmptyElement(lexer, dl);
                    return;
                }
                /* other end tags fall through to the checks below */
            }

            /* center in a dt or a dl breaks the dl list in two */
            if (entry.tag == tt.tagCenter) {
                if (dl.content == null) {
                    /* trim empty dl list */
                    Node.insertNodeBeforeElement(dl, entry);
                    Node.discardElement(dl);
                } else {
                    Node.insertNodeAfterElement(dl, entry);
                }
                /* and parse contents of center */
                parseTag(lexer, entry, mode);
                /* now create a new dl element */
                dl = lexer.inferredTag("dl");
                Node.insertNodeAfterElement(entry, dl);
                continue;
            }

            boolean isTermOrDefinition = entry.tag == tt.tagDt || entry.tag == tt.tagDd;
            if (!isTermOrDefinition) {
                lexer.ungetToken();

                /* neither block nor inline: cannot live in a dl at all */
                if ((entry.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) == 0) {
                    Report.warning(lexer, dl, entry, Report.TAG_NOT_ALLOWED_IN);
                    Node.trimEmptyElement(lexer, dl);
                    return;
                }

                /* if DD appeared directly in BODY then exclude blocks */
                if ((entry.tag.model & Dict.CM_INLINE) == 0 && lexer.excludeBlocks) {
                    Node.trimEmptyElement(lexer, dl);
                    return;
                }

                entry = lexer.inferredTag("dd");
                Report.warning(lexer, dl, entry, Report.MISSING_STARTTAG);
            }

            if (entry.type == Node.EndTag) {
                Report.warning(lexer, dl, entry, Report.DISCARDING_UNEXPECTED);
                continue;
            }

            /* entry should be <DT> or <DD> */
            Node.insertNodeAtEnd(dl, entry);
            parseTag(lexer, entry, Lexer.IgnoreWhitespace);
        }

        /* input ended before the list's end tag; entry is null here */
        Report.warning(lexer, dl, entry, Report.MISSING_ENDTAG_FOR);
        Node.trimEmptyElement(lexer, dl);
    }

    /* Tells whether endTag carries the same tag as some ancestor of list. */
    private static boolean closesAncestor(Node list, Node endTag) {
        for (Node ancestor = list.parent; ancestor != null; ancestor = ancestor.parent) {
            if (endTag.tag == ancestor.tag) {
                return true;
            }
        }
        return false;
    }
}
;
public static class ParsePre implements Parser {
    /**
     * Fills a preformatted element with the tokens that follow its start tag.
     * Tokens are always read with <code>Lexer.Preformatted</code>; parsing
     * stops at the element's own end tag, at the end tag of an ancestor, or
     * when the lexer runs out of tokens.
     *
     * @param lexer token source and shared parser state
     * @param pre   the element being filled; rebound to a freshly inferred
     *              <code>pre</code> when a non-inline start tag splits it
     * @param mode  not consulted by this parser
     */
    public void parse(Lexer lexer, Node pre, short mode) {
        TagTable tt = lexer.configuration.tt;
        Node node;
        Node ancestor;
        if ((pre.tag.model & Dict.CM_EMPTY) != 0) {
            return;
        }
        /* elements flagged CM_OBSOLETE are coerced to <pre> */
        if ((pre.tag.model & Dict.CM_OBSOLETE) != 0) {
            Node.coerceNode(lexer, pre, tt.tagPre);
        }
        /* tell lexer to insert inlines if needed */
        lexer.inlineDup(null);
        while ((node = lexer.getToken(Lexer.Preformatted)) != null) {
            /* our own end tag: finish up */
            if (node.type == Node.EndTag && node.tag == pre.tag) {
                Node.trimSpaces(lexer, pre);
                pre.closed = true;
                Node.trimEmptyElement(lexer, pre);
                return;
            }
            /* <html> is dropped; only the start-tag forms are reported */
            if (node.tag == tt.tagHtml) {
                if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                    Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
                }
                continue;
            }
            if (node.type == Node.TextNode) {
                /* for the first child, skip one initial newline */
                if (pre.content == null) {
                    if (node.textarray[node.start] == (byte) '\n') {
                        ++node.start;
                    }
                    /* nothing left after skipping: drop the text node */
                    if (node.start >= node.end) {
                        continue;
                    }
                }
                Node.insertNodeAtEnd(pre, node);
                continue;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(pre, node)) {
                continue;
            }
            /* discard unknown and PARAM tags */
            if (node.tag == null || node.tag == tt.tagParam) {
                Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            if (node.tag == tt.tagP) {
                if (node.type != Node.StartTag) {
                    Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
                } else {
                    Report.warning(lexer, pre, node, Report.USING_BR_INPLACE_OF);
                    /* trim white space before <p> in <pre> */
                    Node.trimSpaces(lexer, pre);
                    /* the <p> start tag becomes a <br> */
                    Node.coerceNode(lexer, node, tt.tagBr);
                    Node.insertNodeAtEnd(pre, node);
                }
                continue;
            }
            /* head-only content (CM_HEAD without CM_BLOCK) goes to the head */
            if ((node.tag.model & Dict.CM_HEAD) != 0 && (node.tag.model & Dict.CM_BLOCK) == 0) {
                moveToHead(lexer, pre, node);
                continue;
            }
            /*
             * if this is the end tag for an ancestor element then infer end
             * tag for this element
             */
            if (node.type == Node.EndTag) {
                if (node.tag == tt.tagForm) {
                    lexer.badForm = 1;
                    Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                for (ancestor = pre.parent; ancestor != null; ancestor = ancestor.parent) {
                    if (node.tag == ancestor.tag) {
                        Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_BEFORE);
                        lexer.ungetToken();
                        Node.trimSpaces(lexer, pre);
                        Node.trimEmptyElement(lexer, pre);
                        return;
                    }
                }
            }
            /* what about head content, HEAD, BODY tags etc? */
            if ((node.tag.model & Dict.CM_INLINE) == 0) {
                if (node.type != Node.StartTag) {
                    Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_BEFORE);
                lexer.excludeBlocks = true;
                /* check if we need to infer a container */
                if ((node.tag.model & Dict.CM_LIST) != 0) {
                    lexer.ungetToken();
                    node = lexer.inferredTag("ul");
                    Node.addClass(node, "noindent");
                } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
                    lexer.ungetToken();
                    node = lexer.inferredTag("dl");
                } else if ((node.tag.model & Dict.CM_TABLE) != 0) {
                    lexer.ungetToken();
                    node = lexer.inferredTag("table");
                }
                /*
                 * the non-inline element goes after the current pre, and a new
                 * inferred pre after that collects whatever follows
                 */
                Node.insertNodeAfterElement(pre, node);
                pre = lexer.inferredTag("pre");
                Node.insertNodeAfterElement(node, pre);
                parseTag(lexer, node, Lexer.IgnoreWhitespace);
                lexer.excludeBlocks = false;
                continue;
            }
            if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                /* trim white space before <br> */
                if (node.tag == tt.tagBr) {
                    Node.trimSpaces(lexer, pre);
                }
                Node.insertNodeAtEnd(pre, node);
                parseTag(lexer, node, Lexer.Preformatted);
                continue;
            }
            /* discard unexpected tags */
            Report.warning(lexer, pre, node, Report.DISCARDING_UNEXPECTED);
        }
        /* ran out of tokens; node is null at this point */
        Report.warning(lexer, pre, node, Report.MISSING_ENDTAG_FOR);
        Node.trimEmptyElement(lexer, pre);
    }
}
;
public static class ParseBlock implements Parser {
    /**
     * Parses the content of a block-level element.
     *
     * <code>element</code> is the node created by the lexer upon seeing the
     * start tag, or by the parser when the start tag is inferred. Tokens are
     * read until the element's end tag, an end tag belonging to an ancestor,
     * a start tag that implies the end of this element, or end of input.
     *
     * @param lexer   token source and shared parser state
     * @param element the block element being filled
     * @param mode    ignored on entry; reset to
     *                <code>Lexer.IgnoreWhitespace</code> and then switched to
     *                <code>Lexer.MixedContent</code> after text or inline
     *                content has been added
     */
    public void parse(Lexer lexer, Node element, short mode) {
        Node node, parent;
        boolean checkstack = true;
        int istackbase = 0;
        TagTable tt = lexer.configuration.tt;
        if ((element.tag.model & Dict.CM_EMPTY) != 0) {
            return;
        }
        if (element.tag == tt.tagForm && element.isDescendantOf(tt.tagForm)) {
            Report.warning(lexer, element, null, Report.ILLEGAL_NESTING);
        }
        /*
         * InlineDup() asks the lexer to insert inline emphasis tags
         * currently pushed on the istack, but take care to avoid
         * propagating inline emphasis inside OBJECT or APPLET. For these
         * elements a fresh inline stack context is created and disposed of
         * upon reaching the end of the element. They thus behave like table
         * cells in this respect.
         */
        if ((element.tag.model & Dict.CM_OBJECT) != 0) {
            istackbase = lexer.istackbase;
            lexer.istackbase = lexer.istack.size();
        }
        if ((element.tag.model & Dict.CM_MIXED) == 0) {
            lexer.inlineDup(null);
        }
        mode = Lexer.IgnoreWhitespace;
        while (true) {
            node = lexer.getToken(mode /* Lexer.MixedContent */);
            if (node == null) {
                break;
            }
            /* end tag for this element */
            if (node.type == Node.EndTag && node.tag != null
                    && (node.tag == element.tag || element.was == node.tag)) {
                leaveObjectContext(lexer, element, istackbase);
                element.closed = true;
                trimElement(lexer, element);
                return;
            }
            /* <html>, <head> and <body> are dropped; only start tags are reported */
            if (node.tag == tt.tagHtml || node.tag == tt.tagHead || node.tag == tt.tagBody) {
                if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                    Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
                }
                continue;
            }
            if (node.type == Node.EndTag) {
                if (node.tag == null) {
                    Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                } else if (node.tag == tt.tagBr) {
                    /* </br> is treated as <br> */
                    node.type = Node.StartTag;
                } else if (node.tag == tt.tagP) {
                    /* </p> becomes two <br> elements: this one and an inferred one */
                    Node.coerceNode(lexer, node, tt.tagBr);
                    Node.insertNodeAtEnd(element, node);
                    node = lexer.inferredTag("br");
                } else {
                    /*
                     * if this is the end tag for an ancestor element then
                     * infer end tag for this element
                     */
                    for (parent = element.parent; parent != null; parent = parent.parent) {
                        if (node.tag == parent.tag) {
                            if ((element.tag.model & Dict.CM_OPT) == 0) {
                                Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
                            }
                            lexer.ungetToken();
                            leaveObjectContext(lexer, element, istackbase);
                            trimElement(lexer, element);
                            return;
                        }
                    }
                    /*
                     * special case </tr> etc. for stuff moved in front of
                     * table
                     */
                    if (lexer.exiled && (node.tag.model & Dict.CM_TABLE) != 0) {
                        lexer.ungetToken();
                        trimElement(lexer, element);
                        return;
                    }
                }
            }
            /* mixed content model permits text */
            if (node.type == Node.TextNode) {
                /* a text node of at most one character whose first byte is a space */
                boolean iswhitenode = node.end <= node.start + 1
                        && lexer.lexbuf[node.start] == (byte) ' ';
                if (lexer.configuration.EncloseBlockText && !iswhitenode) {
                    lexer.ungetToken();
                    node = lexer.inferredTag("p");
                    Node.insertNodeAtEnd(element, node);
                    parseTag(lexer, node, Lexer.MixedContent);
                    continue;
                }
                if (checkstack) {
                    checkstack = false;
                    if ((element.tag.model & Dict.CM_MIXED) == 0 && lexer.inlineDup(node) > 0) {
                        continue;
                    }
                }
                Node.insertNodeAtEnd(element, node);
                mode = Lexer.MixedContent;
                /*
                 * HTML4 strict doesn't allow mixed content for elements
                 * with %block; as their content model
                 */
                lexer.versions &= ~Dict.VERS_HTML40_STRICT;
                continue;
            }
            if (Node.insertMisc(element, node)) {
                continue;
            }
            /* allow PARAM elements? */
            if (node.tag == tt.tagParam) {
                if ((element.tag.model & Dict.CM_PARAM) != 0
                        && (node.type == Node.StartTag || node.type == Node.StartEndTag)) {
                    Node.insertNodeAtEnd(element, node);
                    continue;
                }
                /* otherwise discard it */
                Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* allow AREA elements? */
            if (node.tag == tt.tagArea) {
                if (element.tag == tt.tagMap
                        && (node.type == Node.StartTag || node.type == Node.StartEndTag)) {
                    Node.insertNodeAtEnd(element, node);
                    continue;
                }
                /* otherwise discard it */
                Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* ignore unknown start/end tags */
            if (node.tag == null) {
                Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /*
             * Allow Dict.CM_INLINE elements here. Allow Dict.CM_BLOCK
             * elements here unless lexer.excludeBlocks is yes. LI and DD
             * are special cased. Otherwise infer end tag for this element.
             */
            if ((node.tag.model & Dict.CM_INLINE) == 0) {
                if (node.type != Node.StartTag && node.type != Node.StartEndTag) {
                    Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                if (element.tag == tt.tagTd || element.tag == tt.tagTh) {
                    /*
                     * if parent is a table cell, avoid inferring the end of
                     * the cell
                     */
                    if ((node.tag.model & Dict.CM_HEAD) != 0) {
                        moveToHead(lexer, element, node);
                        continue;
                    }
                    if ((node.tag.model & Dict.CM_LIST) != 0) {
                        lexer.ungetToken();
                        node = lexer.inferredTag("ul");
                        Node.addClass(node, "noindent");
                        lexer.excludeBlocks = true;
                    } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
                        lexer.ungetToken();
                        node = lexer.inferredTag("dl");
                        lexer.excludeBlocks = true;
                    }
                    /* infer end of current table cell */
                    if ((node.tag.model & Dict.CM_BLOCK) == 0) {
                        lexer.ungetToken();
                        trimElement(lexer, element);
                        return;
                    }
                } else if ((node.tag.model & Dict.CM_BLOCK) != 0) {
                    if (lexer.excludeBlocks) {
                        if ((element.tag.model & Dict.CM_OPT) == 0) {
                            Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
                        }
                        lexer.ungetToken();
                        /* this exit restores the saved base without popping the stack */
                        if ((element.tag.model & Dict.CM_OBJECT) != 0) {
                            lexer.istackbase = istackbase;
                        }
                        trimElement(lexer, element);
                        return;
                    }
                } else /* things like list items */ {
                    if ((element.tag.model & Dict.CM_OPT) == 0 && !element.implicit) {
                        Report.warning(lexer, element, node, Report.MISSING_ENDTAG_BEFORE);
                    }
                    if ((node.tag.model & Dict.CM_HEAD) != 0) {
                        moveToHead(lexer, element, node);
                        continue;
                    }
                    lexer.ungetToken();
                    if ((node.tag.model & Dict.CM_LIST) != 0) {
                        if (element.parent != null && element.parent.tag != null
                                && element.parent.tag.parser == getParseList()) {
                            trimElement(lexer, element);
                            return;
                        }
                        node = lexer.inferredTag("ul");
                        Node.addClass(node, "noindent");
                    } else if ((node.tag.model & Dict.CM_DEFLIST) != 0) {
                        /* guard against a detached element, as the list branch above does */
                        if (element.parent != null && element.parent.tag == tt.tagDl) {
                            trimElement(lexer, element);
                            return;
                        }
                        node = lexer.inferredTag("dl");
                    } else if ((node.tag.model & Dict.CM_TABLE) != 0 || (node.tag.model & Dict.CM_ROW) != 0) {
                        node = lexer.inferredTag("table");
                    } else {
                        /* anything else ends this element */
                        leaveObjectContext(lexer, element, istackbase);
                        trimElement(lexer, element);
                        return;
                    }
                }
            }
            /* parse known element */
            if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                if ((node.tag.model & Dict.CM_INLINE) != 0) {
                    if (checkstack && !node.implicit) {
                        checkstack = false;
                        if (lexer.inlineDup(node) > 0) {
                            continue;
                        }
                    }
                    mode = Lexer.MixedContent;
                } else {
                    checkstack = true;
                    mode = Lexer.IgnoreWhitespace;
                }
                /* trim white space before <br> */
                if (node.tag == tt.tagBr) {
                    Node.trimSpaces(lexer, element);
                }
                Node.insertNodeAtEnd(element, node);
                if (node.implicit) {
                    Report.warning(lexer, element, node, Report.INSERTING_TAG);
                }
                parseTag(lexer, node, Lexer.IgnoreWhitespace /* Lexer.MixedContent */);
                continue;
            }
            /* discard unexpected tags */
            if (node.type == Node.EndTag) {
                lexer.popInline(node); /* if inline end tag */
            }
            Report.warning(lexer, element, node, Report.DISCARDING_UNEXPECTED);
        }
        /* ran out of tokens; node is null at this point */
        if ((element.tag.model & Dict.CM_OPT) == 0) {
            Report.warning(lexer, element, node, Report.MISSING_ENDTAG_FOR);
        }
        leaveObjectContext(lexer, element, istackbase);
        trimElement(lexer, element);
    }

    /**
     * For CM_OBJECT elements, pops every inline pushed since the element was
     * entered and puts back the inline stack base saved on entry. Does
     * nothing for other elements.
     */
    private static void leaveObjectContext(Lexer lexer, Node element, int savedBase) {
        if ((element.tag.model & Dict.CM_OBJECT) != 0) {
            while (lexer.istack.size() > lexer.istackbase) {
                lexer.popInline(null);
            }
            lexer.istackbase = savedBase;
        }
    }

    /** Runs the two clean-up calls made on every exit: trimSpaces, then trimEmptyElement. */
    private static void trimElement(Lexer lexer, Node element) {
        Node.trimSpaces(lexer, element);
        Node.trimEmptyElement(lexer, element);
    }
}
;
public static class ParseTableTag implements Parser {
    /**
     * Parses the content of a table element.
     *
     * A fresh inline stack base is set for the duration of the table and the
     * previous base is restored on every exit. Cells and nested tables found
     * directly inside the table get an inferred <code>tr</code>; text, block
     * and inline content is inserted before the table; head content is moved
     * to the head.
     *
     * @param lexer token source and shared parser state
     * @param table the table element being filled
     * @param mode  not consulted; tokens are always read with
     *              <code>Lexer.IgnoreWhitespace</code>
     */
    public void parse(Lexer lexer, Node table, short mode) {
        Node node, parent;
        int istackbase;
        TagTable tt = lexer.configuration.tt;
        lexer.deferDup();
        istackbase = lexer.istackbase;
        lexer.istackbase = lexer.istack.size();
        while (true) {
            node = lexer.getToken(Lexer.IgnoreWhitespace);
            if (node == null) {
                break;
            }
            if (node.tag == table.tag && node.type == Node.EndTag) {
                lexer.istackbase = istackbase;
                table.closed = true;
                Node.trimEmptyElement(lexer, table);
                return;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(table, node)) {
                continue;
            }
            /* discard unknown tags */
            if (node.tag == null && node.type != Node.TextNode) {
                Report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* if TD or TH or text or inline or block then infer <TR> */
            if (node.type != Node.EndTag) {
                if (node.tag == tt.tagTd || node.tag == tt.tagTh || node.tag == tt.tagTable) {
                    lexer.ungetToken();
                    node = lexer.inferredTag("tr");
                    Report.warning(lexer, table, node, Report.MISSING_STARTTAG);
                } else if (node.type == Node.TextNode
                        || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
                    Node.insertNodeBeforeElement(table, node);
                    Report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN);
                    lexer.exiled = true;
                    /*
                     * Parse the content of the exiled element, exactly as
                     * ParseRowGroup and ParseRow do. This call used to be
                     * wrapped in "if (false)", a literal translation of the
                     * C condition "!node->type == TextNode", which never
                     * holds. Left unparsed, the element's own end tag later
                     * reached this loop and terminated the table early.
                     */
                    if (node.type != Node.TextNode) {
                        parseTag(lexer, node, Lexer.IgnoreWhitespace);
                    }
                    lexer.exiled = false;
                    continue;
                } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
                    moveToHead(lexer, table, node);
                    continue;
                }
            }
            /*
             * if this is the end tag for an ancestor element then infer end
             * tag for this element
             */
            if (node.type == Node.EndTag) {
                if (node.tag == tt.tagForm) {
                    lexer.badForm = 1;
                    Report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                /* stray end tags of table parts are dropped */
                if (node.tag != null && (node.tag.model & (Dict.CM_TABLE | Dict.CM_ROW)) != 0) {
                    Report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                for (parent = table.parent; parent != null; parent = parent.parent) {
                    if (node.tag == parent.tag) {
                        Report.warning(lexer, table, node, Report.MISSING_ENDTAG_BEFORE);
                        lexer.ungetToken();
                        lexer.istackbase = istackbase;
                        Node.trimEmptyElement(lexer, table);
                        return;
                    }
                }
            }
            /* anything that is not table content ends the table */
            if ((node.tag.model & Dict.CM_TABLE) == 0) {
                lexer.ungetToken();
                Report.warning(lexer, table, node, Report.TAG_NOT_ALLOWED_IN);
                lexer.istackbase = istackbase;
                Node.trimEmptyElement(lexer, table);
                return;
            }
            if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                Node.insertNodeAtEnd(table, node);
                parseTag(lexer, node, Lexer.IgnoreWhitespace);
                continue;
            }
            /* discard unexpected text nodes and end tags */
            Report.warning(lexer, table, node, Report.DISCARDING_UNEXPECTED);
        }
        /* ran out of tokens; node is null at this point */
        Report.warning(lexer, table, node, Report.MISSING_ENDTAG_FOR);
        Node.trimEmptyElement(lexer, table);
        lexer.istackbase = istackbase;
    }
}
;
public static class ParseColGroup implements Parser {
    /**
     * Collects the <code>col</code> children of a column group. Parsing ends
     * at the group's end tag, at an ancestor's end tag, at a text node, or at
     * any known tag other than <code>col</code>; in the last three cases the
     * token is pushed back for the caller.
     *
     * @param lexer    token source and shared parser state
     * @param colgroup the column group being filled
     * @param mode     not consulted; tokens are always read with
     *                 <code>Lexer.IgnoreWhitespace</code>
     */
    public void parse(Lexer lexer, Node colgroup, short mode) {
        TagTable tt = lexer.configuration.tt;
        if ((colgroup.tag.model & Dict.CM_EMPTY) != 0) {
            return;
        }
        Node node;
        while ((node = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
            if (node.type == Node.EndTag) {
                /* our own end tag */
                if (node.tag == colgroup.tag) {
                    colgroup.closed = true;
                    return;
                }
                if (node.tag == tt.tagForm) {
                    lexer.badForm = 1;
                    Report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                /* an ancestor's end tag implies the end of this element */
                for (Node ancestor = colgroup.parent; ancestor != null; ancestor = ancestor.parent) {
                    if (node.tag == ancestor.tag) {
                        lexer.ungetToken();
                        return;
                    }
                }
            }
            /* text ends the column group */
            if (node.type == Node.TextNode) {
                lexer.ungetToken();
                return;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(colgroup, node)) {
                continue;
            }
            /* discard unknown tags */
            if (node.tag == null) {
                Report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* any known tag other than col ends the column group */
            if (node.tag != tt.tagCol) {
                lexer.ungetToken();
                return;
            }
            if (node.type == Node.EndTag) {
                Report.warning(lexer, colgroup, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* node should be <COL> */
            Node.insertNodeAtEnd(colgroup, node);
            parseTag(lexer, node, Lexer.IgnoreWhitespace);
        }
    }
}
;
public static class ParseRowGroup implements Parser {
    /**
     * Parses the rows of a row group. Cells found directly inside the group
     * get an inferred <code>tr</code>; text, block and inline content is
     * moved in front of the table; head content is moved to the head.
     *
     * @param lexer    token source and shared parser state
     * @param rowgroup the row group being filled
     * @param mode     not consulted; tokens are always read with
     *                 <code>Lexer.IgnoreWhitespace</code>
     */
    public void parse(Lexer lexer, Node rowgroup, short mode) {
        TagTable tt = lexer.configuration.tt;
        Node node;
        Node ancestor;
        if ((rowgroup.tag.model & Dict.CM_EMPTY) != 0) {
            return;
        }
        while ((node = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
            /* same tag as this group: an end tag closes it, a start tag begins a sibling */
            if (node.tag == rowgroup.tag) {
                if (node.type == Node.EndTag) {
                    rowgroup.closed = true;
                    Node.trimEmptyElement(lexer, rowgroup);
                } else {
                    lexer.ungetToken();
                }
                return;
            }
            /* if </table> infer end tag */
            if (node.type == Node.EndTag && node.tag == tt.tagTable) {
                lexer.ungetToken();
                Node.trimEmptyElement(lexer, rowgroup);
                return;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(rowgroup, node)) {
                continue;
            }
            /* discard unknown tags */
            if (node.type != Node.TextNode && node.tag == null) {
                Report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /*
             * if TD or TH then infer <TR>; if text or inline or block move
             * before table; if head content move to head
             */
            if (node.type != Node.EndTag) {
                if (node.tag == tt.tagTd || node.tag == tt.tagTh) {
                    lexer.ungetToken();
                    node = lexer.inferredTag("tr");
                    Report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG);
                } else if (node.type == Node.TextNode
                        || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
                    Node.moveBeforeTable(rowgroup, node, tt);
                    Report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN);
                    /* the exiled flag is set only while the moved element is parsed */
                    lexer.exiled = true;
                    if (node.type != Node.TextNode) {
                        parseTag(lexer, node, Lexer.IgnoreWhitespace);
                    }
                    lexer.exiled = false;
                    continue;
                } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
                    Report.warning(lexer, rowgroup, node, Report.TAG_NOT_ALLOWED_IN);
                    moveToHead(lexer, rowgroup, node);
                    continue;
                }
            }
            /*
             * if this is the end tag for ancestor element then infer end
             * tag for this element
             */
            if (node.type == Node.EndTag) {
                if (node.tag == tt.tagForm) {
                    lexer.badForm = 1;
                    Report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                if (node.tag == tt.tagTr || node.tag == tt.tagTd || node.tag == tt.tagTh) {
                    Report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                for (ancestor = rowgroup.parent; ancestor != null; ancestor = ancestor.parent) {
                    if (node.tag == ancestor.tag) {
                        lexer.ungetToken();
                        Node.trimEmptyElement(lexer, rowgroup);
                        return;
                    }
                }
            }
            /*
             * if THEAD, TFOOT or TBODY then implied end tag
             */
            if ((node.tag.model & Dict.CM_ROWGRP) != 0) {
                if (node.type != Node.EndTag) {
                    lexer.ungetToken();
                }
                Node.trimEmptyElement(lexer, rowgroup);
                return;
            }
            if (node.type == Node.EndTag) {
                Report.warning(lexer, rowgroup, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* anything other than <tr> gets wrapped in an inferred row */
            if (node.tag != tt.tagTr) {
                node = lexer.inferredTag("tr");
                Report.warning(lexer, rowgroup, node, Report.MISSING_STARTTAG);
                lexer.ungetToken();
            }
            /* node should be <TR> */
            Node.insertNodeAtEnd(rowgroup, node);
            parseTag(lexer, node, Lexer.IgnoreWhitespace);
        }
        Node.trimEmptyElement(lexer, rowgroup);
    }
}
;
public static class ParseRow implements Parser {
    /**
     * Parses the cells of a table row. A <code>form</code> start tag gets an
     * inferred <code>td</code>; text, block and inline content is moved in
     * front of the table; head content is moved to the head.
     *
     * @param lexer token source and shared parser state
     * @param row   the row being filled
     * @param mode  not consulted; tokens are always read with
     *              <code>Lexer.IgnoreWhitespace</code>
     */
    public void parse(Lexer lexer, Node row, short mode) {
        TagTable tt = lexer.configuration.tt;
        Node node;
        Node ancestor;
        boolean savedExcludeBlocks;
        if ((row.tag.model & Dict.CM_EMPTY) != 0) {
            return;
        }
        while ((node = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
            /* same tag as this row: an end tag closes it, a start tag begins the next row */
            if (node.tag == row.tag) {
                if (node.type == Node.EndTag) {
                    row.closed = true;
                } else {
                    lexer.ungetToken();
                }
                Node.fixEmptyRow(lexer, row);
                return;
            }
            /*
             * if this is the end tag for an ancestor element then infer end
             * tag for this element
             */
            if (node.type == Node.EndTag) {
                if (node.tag == tt.tagForm) {
                    lexer.badForm = 1;
                    Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                if (node.tag == tt.tagTd || node.tag == tt.tagTh) {
                    Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
                    continue;
                }
                for (ancestor = row.parent; ancestor != null; ancestor = ancestor.parent) {
                    if (node.tag == ancestor.tag) {
                        lexer.ungetToken();
                        Node.trimEmptyElement(lexer, row);
                        return;
                    }
                }
            }
            /* deal with comments etc. */
            if (Node.insertMisc(row, node)) {
                continue;
            }
            /* discard unknown tags */
            if (node.type != Node.TextNode && node.tag == null) {
                Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* discard unexpected <table> element */
            if (node.tag == tt.tagTable) {
                Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* THEAD, TFOOT or TBODY */
            if (node.tag != null && (node.tag.model & Dict.CM_ROWGRP) != 0) {
                lexer.ungetToken();
                Node.trimEmptyElement(lexer, row);
                return;
            }
            if (node.type == Node.EndTag) {
                Report.warning(lexer, row, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /*
             * End tags never get past the check above, so from here on the
             * token is not an end tag. If text or inline or block move before
             * table; if head content move to head.
             */
            if (node.tag == tt.tagForm) {
                lexer.ungetToken();
                node = lexer.inferredTag("td");
                Report.warning(lexer, row, node, Report.MISSING_STARTTAG);
            } else if (node.type == Node.TextNode
                    || (node.tag.model & (Dict.CM_BLOCK | Dict.CM_INLINE)) != 0) {
                Node.moveBeforeTable(row, node, tt);
                Report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
                /* the exiled flag is set only while the moved element is parsed */
                lexer.exiled = true;
                if (node.type != Node.TextNode) {
                    parseTag(lexer, node, Lexer.IgnoreWhitespace);
                }
                lexer.exiled = false;
                continue;
            } else if ((node.tag.model & Dict.CM_HEAD) != 0) {
                Report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
                moveToHead(lexer, row, node);
                continue;
            }
            if (node.tag != tt.tagTd && node.tag != tt.tagTh) {
                Report.warning(lexer, row, node, Report.TAG_NOT_ALLOWED_IN);
                continue;
            }
            /* node should be <TD> or <TH> */
            Node.insertNodeAtEnd(row, node);
            /* the cell is parsed with excludeBlocks cleared; the old value is put back after */
            savedExcludeBlocks = lexer.excludeBlocks;
            lexer.excludeBlocks = false;
            parseTag(lexer, node, Lexer.IgnoreWhitespace);
            lexer.excludeBlocks = savedExcludeBlocks;
            /* pop inline stack */
            while (lexer.istack.size() > lexer.istackbase) {
                lexer.popInline(null);
            }
        }
        Node.trimEmptyElement(lexer, row);
    }
}
;
public static class ParseNoFrames implements Parser {
    /**
     * Parses the content of a <code>noframes</code> element. A
     * <code>body</code> start tag is parsed as a child; any other text or
     * known tag is pushed back and wrapped in an inferred <code>body</code>.
     * The <code>Report.USING_NOFRAMES</code> bit is set in
     * <code>lexer.badAccess</code> on entry.
     *
     * @param lexer    token source and shared parser state
     * @param noframes the element being filled
     * @param mode     ignored on entry; reset to
     *                 <code>Lexer.IgnoreWhitespace</code>
     */
    public void parse(Lexer lexer, Node noframes, short mode) {
        TagTable tt = lexer.configuration.tt;
        Node node;
        lexer.badAccess |= Report.USING_NOFRAMES;
        mode = Lexer.IgnoreWhitespace;
        while ((node = lexer.getToken(mode)) != null) {
            /* our own end tag */
            if (node.type == Node.EndTag && node.tag == noframes.tag) {
                noframes.closed = true;
                Node.trimSpaces(lexer, noframes);
                return;
            }
            /* frame or frameset implies the end of noframes */
            if (node.tag == tt.tagFrame || node.tag == tt.tagFrameset) {
                Report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_BEFORE);
                Node.trimSpaces(lexer, noframes);
                lexer.ungetToken();
                return;
            }
            /* <html> is dropped; only the start-tag forms are reported */
            if (node.tag == tt.tagHtml) {
                if (node.type == Node.StartTag || node.type == Node.StartEndTag) {
                    Report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
                }
                continue;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(noframes, node)) {
                continue;
            }
            /* explicit body element */
            if (node.type == Node.StartTag && node.tag == tt.tagBody) {
                Node.insertNodeAtEnd(noframes, node);
                parseTag(lexer, node, Lexer.IgnoreWhitespace /* MixedContent */);
                continue;
            }
            /* implicit body element inferred */
            if (node.type == Node.TextNode || node.tag != null) {
                lexer.ungetToken();
                node = lexer.inferredTag("body");
                if (lexer.configuration.XmlOut) {
                    Report.warning(lexer, noframes, node, Report.INSERTING_TAG);
                }
                Node.insertNodeAtEnd(noframes, node);
                parseTag(lexer, node, Lexer.IgnoreWhitespace /* MixedContent */);
                continue;
            }
            /* discard unexpected end tags */
            Report.warning(lexer, noframes, node, Report.DISCARDING_UNEXPECTED);
        }
        /* ran out of tokens; node is null at this point */
        Report.warning(lexer, noframes, node, Report.MISSING_ENDTAG_FOR);
    }
}
;
public static class ParseSelect implements Parser {
    /**
     * Parses the children of a select-style field. Only
     * <code>option</code>, <code>optgroup</code> and <code>script</code>
     * start tags are kept; everything else apart from comments and the like
     * is discarded with a warning.
     *
     * @param lexer token source and shared parser state
     * @param field the element being filled
     * @param mode  not consulted; tokens are always read with
     *              <code>Lexer.IgnoreWhitespace</code>
     */
    public void parse(Lexer lexer, Node field, short mode) {
        TagTable tt = lexer.configuration.tt;
        Node node;
        /* defer implicit inline start tags */
        lexer.insert = -1;
        while ((node = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
            /* our own end tag */
            if (node.type == Node.EndTag && node.tag == field.tag) {
                field.closed = true;
                Node.trimSpaces(lexer, field);
                return;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(field, node)) {
                continue;
            }
            boolean allowedChild = node.tag == tt.tagOption
                    || node.tag == tt.tagOptgroup
                    || node.tag == tt.tagScript;
            if (node.type == Node.StartTag && allowedChild) {
                Node.insertNodeAtEnd(field, node);
                parseTag(lexer, node, Lexer.IgnoreWhitespace);
                continue;
            }
            /* discard unexpected tags */
            Report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
        }
        /* ran out of tokens; node is null at this point */
        Report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR);
    }
}
;
public static class ParseText implements Parser {
    /**
     * Parses an element whose content is text only. Text nodes are appended,
     * <code>font</code> tags are discarded, and any other tag ends the
     * element and is pushed back for the caller.
     *
     * @param lexer token source and shared parser state
     * @param field the element being filled
     * @param mode  lexer mode used to read tokens; replaced by
     *              <code>Lexer.Preformatted</code> when <code>field</code>
     *              is a <code>textarea</code>
     */
    public void parse(Lexer lexer, Node field, short mode) {
        TagTable tt = lexer.configuration.tt;
        Node node;
        /* defer implicit inline start tags */
        lexer.insert = -1;
        if (field.tag == tt.tagTextarea) {
            mode = Lexer.Preformatted;
        }
        while ((node = lexer.getToken(mode)) != null) {
            /* our own end tag */
            if (node.type == Node.EndTag && node.tag == field.tag) {
                field.closed = true;
                Node.trimSpaces(lexer, field);
                return;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(field, node)) {
                continue;
            }
            if (node.type == Node.TextNode) {
                /* only called for 1st child */
                if (field.content == null && (mode & Lexer.Preformatted) == 0) {
                    Node.trimSpaces(lexer, field);
                }
                /* append the text unless it is empty */
                if (node.start < node.end) {
                    Node.insertNodeAtEnd(field, node);
                }
                continue;
            }
            if (node.tag == tt.tagFont) {
                Report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
                continue;
            }
            /* terminate element on other tags */
            if ((field.tag.model & Dict.CM_OPT) == 0) {
                Report.warning(lexer, field, node, Report.MISSING_ENDTAG_BEFORE);
            }
            lexer.ungetToken();
            Node.trimSpaces(lexer, field);
            return;
        }
        /* ran out of tokens; node is null at this point */
        if ((field.tag.model & Dict.CM_OPT) == 0) {
            Report.warning(lexer, field, node, Report.MISSING_ENDTAG_FOR);
        }
    }
}
;
public static class ParseOptGroup implements Parser {
    /**
     * Parses the children of an option group. <code>option</code> and
     * <code>optgroup</code> start tags are kept, a nested
     * <code>optgroup</code> being reported with
     * <code>Report.CANT_BE_NESTED</code>; everything else apart from comments
     * and the like is discarded with a warning.
     *
     * @param lexer token source and shared parser state
     * @param field the element being filled
     * @param mode  not consulted; tokens are always read with
     *              <code>Lexer.IgnoreWhitespace</code>
     */
    public void parse(Lexer lexer, Node field, short mode) {
        TagTable tt = lexer.configuration.tt;
        Node node;
        /* defer implicit inline start tags */
        lexer.insert = -1;
        while ((node = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
            /* our own end tag */
            if (node.type == Node.EndTag && node.tag == field.tag) {
                field.closed = true;
                Node.trimSpaces(lexer, field);
                return;
            }
            /* deal with comments etc. */
            if (Node.insertMisc(field, node)) {
                continue;
            }
            boolean nestedGroup = node.tag == tt.tagOptgroup;
            if (node.type == Node.StartTag && (node.tag == tt.tagOption || nestedGroup)) {
                if (nestedGroup) {
                    Report.warning(lexer, field, node, Report.CANT_BE_NESTED);
                }
                Node.insertNodeAtEnd(field, node);
                parseTag(lexer, node, Lexer.MixedContent);
                continue;
            }
            /* discard unexpected tags */
            Report.warning(lexer, field, node, Report.DISCARDING_UNEXPECTED);
        }
    }
}
;
/** Returns the parser instance held in the static field <code>_parseHTML</code>. */
public static Parser getParseHTML() {
return _parseHTML;
}
/** Returns the parser instance held in the static field <code>_parseHead</code>. */
public static Parser getParseHead() {
return _parseHead;
}
/** Returns the parser instance held in the static field <code>_parseTitle</code>. */
public static Parser getParseTitle() {
return _parseTitle;
}
/** Returns the parser instance held in the static field <code>_parseScript</code>. */
public static Parser getParseScript() {
return _parseScript;
}
/** Returns the parser instance held in the static field <code>_parseBody</code>. */
public static Parser getParseBody() {
return _parseBody;
}
/** Returns the parser instance held in the static field <code>_parseFrameSet</code>. */
public static Parser getParseFrameSet() {
return _parseFrameSet;
}
/** Returns the parser instance held in the static field <code>_parseInline</code>. */
public static Parser getParseInline() {
return _parseInline;
}
/** Returns the parser instance held in the static field <code>_parseList</code>. */
public static Parser getParseList() {
return _parseList;
}
/** Returns the parser instance held in the static field <code>_parseDefList</code>. */
public static Parser getParseDefList() {
return _parseDefList;
}
/** Returns the parser instance held in the static field <code>_parsePre</code>. */
public static Parser getParsePre() {
return _parsePre;
}
/** Returns the parser instance held in the static field <code>_parseBlock</code>. */
public static Parser getParseBlock() {
return _parseBlock;
}
/** Returns the parser instance held in the static field <code>_parseTableTag</code>. */
public static Parser getParseTableTag() {
return _parseTableTag;
}
/** Returns the parser instance held in the static field <code>_parseColGroup</code>. */
public static Parser getParseColGroup() {
return _parseColGroup;
}
/** Returns the parser instance held in the static field <code>_parseRowGroup</code>. */
public static Parser getParseRowGroup() {
return _parseRowGroup;
}
/** Returns the parser instance held in the static field <code>_parseRow</code>. */
public static Parser getParseRow() {
return _parseRow;
}
/** Returns the parser instance held in the static field <code>_parseNoFrames</code>. */
public static Parser getParseNoFrames() {
return _parseNoFrames;
}
/** Returns the parser instance held in the static field <code>_parseSelect</code>. */
public static Parser getParseSelect() {
return _parseSelect;
}
/** Returns the parser instance held in the static field <code>_parseText</code>. */
public static Parser getParseText() {
return _parseText;
}
/** Returns the parser instance held in the static field <code>_parseOptGroup</code>. */
public static Parser getParseOptGroup() {
return _parseOptGroup;
}
/*
 * One shared instance of each parser, handed out by the getParse*()
 * accessors. The fields are final so the instances that callers compare
 * against by identity cannot be swapped out after class initialization.
 */
private static final Parser _parseHTML = new ParseHTML();
private static final Parser _parseHead = new ParseHead();
private static final Parser _parseTitle = new ParseTitle();
private static final Parser _parseScript = new ParseScript();
private static final Parser _parseBody = new ParseBody();
private static final Parser _parseFrameSet = new ParseFrameSet();
private static final Parser _parseInline = new ParseInline();
private static final Parser _parseList = new ParseList();
private static final Parser _parseDefList = new ParseDefList();
private static final Parser _parsePre = new ParsePre();
private static final Parser _parseBlock = new ParseBlock();
private static final Parser _parseTableTag = new ParseTableTag();
private static final Parser _parseColGroup = new ParseColGroup();
private static final Parser _parseRowGroup = new ParseRowGroup();
private static final Parser _parseRow = new ParseRow();
private static final Parser _parseNoFrames = new ParseNoFrames();
private static final Parser _parseSelect = new ParseSelect();
private static final Parser _parseText = new ParseText();
private static final Parser _parseOptGroup = new ParseOptGroup();
/**
 * Parses an HTML document; HTML is the top level element.
 *
 * Creates a root node, attaches leading comments and the like plus the first
 * doctype to it, discards later doctypes and stray end tags with a warning,
 * and hands the first remaining token to the HTML parser, inferring an
 * <code>html</code> element when that token is not an <code>html</code>
 * start tag.
 *
 * @param lexer token source and shared parser state
 * @return the root node of the parsed document
 */
public static Node parseDocument(Lexer lexer) {
    TagTable tt = lexer.configuration.tt;
    Node root = lexer.newNode();
    root.type = Node.RootNode;
    Node seenDoctype = null;
    Node token;
    while ((token = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
        /* deal with comments etc. */
        if (Node.insertMisc(root, token)) {
            continue;
        }
        if (token.type == Node.DocTypeTag) {
            /* only the first doctype is kept */
            if (seenDoctype != null) {
                Report.warning(lexer, root, token, Report.DISCARDING_UNEXPECTED);
            } else {
                Node.insertNodeAtEnd(root, token);
                seenDoctype = token;
            }
            continue;
        }
        if (token.type == Node.EndTag) {
            Report.warning(lexer, root, token, Report.DISCARDING_UNEXPECTED); //TODO?
            continue;
        }
        Node html;
        if (token.type == Node.StartTag && token.tag == tt.tagHtml) {
            html = token;
        } else {
            /* push the token back so the inferred <html> parses it */
            lexer.ungetToken();
            html = lexer.inferredTag("html");
        }
        Node.insertNodeAtEnd(root, html);
        getParseHTML().parse(lexer, html, (short) 0); // TODO?
        break;
    }
    return root;
}
/**
* Indicates whether or not whitespace should be preserved for this element.
* If an <code>xml:space</code> attribute is found, then if the attribute
* value is <code>preserve</code>, returns <code>true</code>. For any other
* value, returns <code>false</code>. If an <code>xml:space</code> attribute
* was <em>not</em> found, then the following element names result in a
* return value of <code>true:
* pre, script, style,</code> and <code>xsl:text</code>. Finally, if a
* <code>TagTable</code> was passed in and the element appears as the "pre"
* element in the <code>TagTable</code>, then <code>true</code> will be
* returned. Otherwise, <code>false</code> is returned.
*
* @param element The <code>Node</code> to test to see if whitespace should
* be preserved.
* @param tt The <code>TagTable</code> to test for the
* <code>getNodePre()</code> function. This may be
* <code>null</code>, in which case this test is bypassed.
* @return <code>true</code> or <code>false</code>, as explained above.
*/
public static boolean XMLPreserveWhiteSpace(Node element, TagTable tt) {
    /*
     * Search the attributes for xml:space. The first one found decides the
     * outcome: "preserve" yields true, anything else yields false. The
     * literals are used as the receivers of equals() so that an attribute
     * without a name, or an xml:space attribute without a value, is
     * treated as "no match" instead of raising a NullPointerException.
     */
    for (AttVal attribute = element.attributes; attribute != null; attribute = attribute.next) {
        if ("xml:space".equals(attribute.attribute)) {
            return "preserve".equals(attribute.value);
        }
    }
    /*
     * kludge for html docs without explicit xml:space attribute; skipped
     * for a node without an element name.
     * NOTE(review): assumes Lexer.wstrcasecmp is not null-safe - its body
     * is not visible from this section.
     */
    if (element.element != null
            && (Lexer.wstrcasecmp(element.element, "pre") == 0
                    || Lexer.wstrcasecmp(element.element, "script") == 0
                    || Lexer.wstrcasecmp(element.element, "style") == 0)) {
        return true;
    }
    /* element is handled by the "pre" parser according to the tag table */
    if (tt != null && tt.findParser(element) == getParsePre()) {
        return true;
    }
    /* kludge for XSL docs */
    return element.element != null && Lexer.wstrcasecmp(element.element, "xsl:text") == 0;
}
/*
* XML documents
*/
/**
 * Parses the content of an XML element, appending every token read to
 * <code>element</code> until the matching end tag or the end of input.
 * <p>
 * Nothing is read for an element named <code>xsl:text</code>. When
 * <code>XMLPreserveWhiteSpace</code> holds for the element, the remaining
 * tokens are requested in <code>Lexer.Preformatted</code> mode and that
 * mode is passed on to nested elements. Outside preformatted mode, a single
 * leading space of a first text child and a single trailing space of a last
 * text child are trimmed, and a text node emptied this way is discarded.
 *
 * @param lexer   the lexer supplying tokens and configuration
 * @param element the start-tag node whose content is parsed
 * @param mode    the lexer mode used to fetch tokens
 */
public static void parseXMLElement(Lexer lexer, Node element, short mode) {
    /* Jeff Young's kludge for XSL docs */
    if (Lexer.wstrcasecmp(element.element, "xsl:text") == 0) {
        return;
    }
    /* if node is pre or has xml:space="preserve" then do so */
    if (XMLPreserveWhiteSpace(element, lexer.configuration.tt)) {
        mode = Lexer.Preformatted;
    }
    Node child;
    while ((child = lexer.getToken(mode)) != null) {
        if (child.type == Node.EndTag) {
            if (child.element.equals(element.element)) {
                /* our own end tag: the element is complete */
                element.closed = true;
                break;
            }
            /* discard unexpected end tags */
            Report.error(lexer, element, child, Report.UNEXPECTED_ENDTAG);
            continue;
        }
        /* parse content on seeing start tag, then attach the finished child */
        if (child.type == Node.StartTag) {
            parseXMLElement(lexer, child, mode);
        }
        Node.insertNodeAtEnd(element, child);
    }
    /* whitespace is left untouched in preformatted content */
    if (mode == Lexer.Preformatted) {
        return;
    }
    /*
     * if first child is text then trim initial space and delete text node
     * if it is empty.
     */
    final Node first = element.content;
    if (first != null && first.type == Node.TextNode
            && first.textarray[first.start] == (byte) ' ') {
        first.start++;
        if (first.start >= first.end) {
            Node.discardElement(first);
        }
    }
    /*
     * if last child is text then trim final space and delete the text node
     * if it is empty. element.last is read only now, after the step above,
     * which presumably can change it by discarding a node.
     */
    final Node last = element.last;
    if (last != null && last.type == Node.TextNode
            && last.textarray[last.end - 1] == (byte) ' ') {
        last.end--;
        if (last.start >= last.end) {
            Node.discardElement(last);
        }
    }
}
/**
 * Parses the token stream of <code>lexer</code> as an XML document and
 * returns the root node of the resulting tree.
 * <p>
 * As a side effect <code>lexer.configuration.XmlTags</code> is set to
 * <code>true</code>. Top-level tokens are handled as follows: end tags are
 * reported with <code>UNEXPECTED_ENDTAG</code> and dropped; tokens accepted
 * by <code>Node.insertMisc</code> are skipped over; the first DOCTYPE is
 * appended to the root and later ones are reported with
 * <code>DISCARDING_UNEXPECTED</code>; start tags are appended and their
 * content parsed with <code>parseXMLElement</code>; empty-element tags are
 * appended as they are. Any other token is ignored.
 * <p>
 * After the input is exhausted, a DOCTYPE rejected by
 * <code>lexer.checkDocTypeKeyWords</code> is reported with
 * <code>DTYPE_NOT_UPPER_CASE</code>, and <code>lexer.fixXMLPI</code> is
 * invoked when <code>configuration.XmlPi</code> is set.
 *
 * @param lexer the lexer supplying tokens and configuration
 * @return the root node of the parsed document
 */
public static Node parseXMLDocument(Lexer lexer) {
    Node node;
    Node doctype = null;
    Node document = lexer.newNode();
    document.type = Node.RootNode;
    lexer.configuration.XmlTags = true;
    while ((node = lexer.getToken(Lexer.IgnoreWhitespace)) != null) {
        /* discard unexpected end tags */
        if (node.type == Node.EndTag) {
            Report.warning(lexer, null, node, Report.UNEXPECTED_ENDTAG);
            continue;
        }
        /* deal with comments etc. */
        if (Node.insertMisc(document, node)) {
            continue;
        }
        if (node.type == Node.DocTypeTag) {
            if (doctype == null) {
                Node.insertNodeAtEnd(document, node);
                doctype = node;
            } else {
                Report.warning(lexer, document, node, Report.DISCARDING_UNEXPECTED); // TODO
            }
            continue;
        }
        /*
         * an empty-element tag such as <root/> has no content to parse but
         * is still part of the document; previously it was silently lost
         */
        if (node.type == Node.StartEndTag) {
            Node.insertNodeAtEnd(document, node);
            continue;
        }
        /* if start tag then parse element's content */
        if (node.type == Node.StartTag) {
            Node.insertNodeAtEnd(document, node);
            parseXMLElement(lexer, node, Lexer.IgnoreWhitespace);
        }
    }
    if (doctype != null && !lexer.checkDocTypeKeyWords(doctype)) {
        Report.warning(lexer, doctype, null, Report.DTYPE_NOT_UPPER_CASE);
    }
    /* ensure presence of initial <?XML version="1.0"?> */
    if (lexer.configuration.XmlPi) {
        lexer.fixXMLPI(document);
    }
    return document;
}
/**
 * Reports whether the attributes of <code>node</code> mark it as JavaScript.
 * <p>
 * A node without any attributes counts as JavaScript. Otherwise every
 * attribute is inspected, and the result is <code>true</code> when at least
 * one attribute matching <code>language</code> or <code>type</code> (per
 * <code>Lexer.wstrcasecmp</code>) has a value for which
 * <code>Lexer.wsubstr(value, "javascript")</code> holds.
 *
 * @param node the node whose attributes are examined
 * @return <code>true</code> if the node has no attributes or one of them
 *         identifies JavaScript, <code>false</code> otherwise
 */
public static boolean isJavaScript(Node node) {
    AttVal current = node.attributes;
    if (current == null) {
        /* no attributes at all: treated as JavaScript */
        return true;
    }
    boolean found = false;
    while (current != null) {
        final boolean namesScriptKind = Lexer.wstrcasecmp(current.attribute, "language") == 0
                || Lexer.wstrcasecmp(current.attribute, "type") == 0;
        if (namesScriptKind && Lexer.wsubstr(current.value, "javascript")) {
            found = true;
        }
        current = current.next;
    }
    return found;
}
}