diff options
Diffstat (limited to 'parser/html/javasrc/Tokenizer.java')
-rw-r--r-- | parser/html/javasrc/Tokenizer.java | 7064 |
1 files changed, 0 insertions, 7064 deletions
diff --git a/parser/html/javasrc/Tokenizer.java b/parser/html/javasrc/Tokenizer.java deleted file mode 100644 index 70e1df75c1..0000000000 --- a/parser/html/javasrc/Tokenizer.java +++ /dev/null @@ -1,7064 +0,0 @@ -/* - * Copyright (c) 2005-2007 Henri Sivonen - * Copyright (c) 2007-2015 Mozilla Foundation - * Portions of comments Copyright 2004-2010 Apple Computer, Inc., Mozilla - * Foundation, and Opera Software ASA. - * - * Permission is hereby granted, free of charge, to any person obtaining a - * copy of this software and associated documentation files (the "Software"), - * to deal in the Software without restriction, including without limitation - * the rights to use, copy, modify, merge, publish, distribute, sublicense, - * and/or sell copies of the Software, and to permit persons to whom the - * Software is furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in - * all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL - * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING - * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER - * DEALINGS IN THE SOFTWARE. - */ - -/* - * The comments following this one that use the same comment syntax as this - * comment are quotes from the WHATWG HTML 5 spec as of 2 June 2007 - * amended as of June 18 2008 and May 31 2010. - * That document came with this statement: - * "© Copyright 2004-2010 Apple Computer, Inc., Mozilla Foundation, and - * Opera Software ASA. You are granted a license to use, reproduce and - * create derivative works of this document." 
- */ - -package nu.validator.htmlparser.impl; - -import org.xml.sax.ErrorHandler; -import org.xml.sax.Locator; -import org.xml.sax.SAXException; -import org.xml.sax.SAXParseException; - -import nu.validator.htmlparser.annotation.Auto; -import nu.validator.htmlparser.annotation.CharacterName; -import nu.validator.htmlparser.annotation.Const; -import nu.validator.htmlparser.annotation.Inline; -import nu.validator.htmlparser.annotation.Local; -import nu.validator.htmlparser.annotation.NoLength; -import nu.validator.htmlparser.common.EncodingDeclarationHandler; -import nu.validator.htmlparser.common.Interner; -import nu.validator.htmlparser.common.TokenHandler; -import nu.validator.htmlparser.common.XmlViolationPolicy; - -/** - * An implementation of - * https://html.spec.whatwg.org/multipage/syntax.html#tokenization - * - * This class implements the <code>Locator</code> interface. This is not an - * incidental implementation detail: Users of this class are encouraged to make - * use of the <code>Locator</code> nature. - * - * By default, the tokenizer may report data that XML 1.0 bans. The tokenizer - * can be configured to treat these conditions as fatal or to coerce the infoset - * to something that XML 1.0 allows. 
 *
 * @version $Id$
 * @author hsivonen
 */
public class Tokenizer implements Locator {

    /**
     * Mask that clears the lowest bit of a state value. Since
     * <code>DATA</code> is 0 and <code>RCDATA</code> is 1, a state ANDed with
     * this mask is zero exactly for those two states;
     * <code>emitOrAppendCharRefBuf</code> relies on that to decide between
     * emitting and buffering.
     */
    private static final int DATA_AND_RCDATA_MASK = ~1;

    // Tokenizer state identifiers. A value from this set is what
    // setStateAndEndTagExpectation() stores into stateSave and what
    // tokenizeBuffer() reads back as its starting state.

    public static final int DATA = 0;

    public static final int RCDATA = 1;

    public static final int SCRIPT_DATA = 2;

    public static final int RAWTEXT = 3;

    public static final int SCRIPT_DATA_ESCAPED = 4;

    public static final int ATTRIBUTE_VALUE_DOUBLE_QUOTED = 5;

    public static final int ATTRIBUTE_VALUE_SINGLE_QUOTED = 6;

    public static final int ATTRIBUTE_VALUE_UNQUOTED = 7;

    public static final int PLAINTEXT = 8;

    public static final int TAG_OPEN = 9;

    public static final int CLOSE_TAG_OPEN = 10;

    public static final int TAG_NAME = 11;

    public static final int BEFORE_ATTRIBUTE_NAME = 12;

    public static final int ATTRIBUTE_NAME = 13;

    public static final int AFTER_ATTRIBUTE_NAME = 14;

    public static final int BEFORE_ATTRIBUTE_VALUE = 15;

    public static final int AFTER_ATTRIBUTE_VALUE_QUOTED = 16;

    public static final int BOGUS_COMMENT = 17;

    public static final int MARKUP_DECLARATION_OPEN = 18;

    public static final int DOCTYPE = 19;

    public static final int BEFORE_DOCTYPE_NAME = 20;

    public static final int DOCTYPE_NAME = 21;

    public static final int AFTER_DOCTYPE_NAME = 22;

    public static final int BEFORE_DOCTYPE_PUBLIC_IDENTIFIER = 23;

    public static final int DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED = 24;

    public static final int DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED = 25;

    public static final int AFTER_DOCTYPE_PUBLIC_IDENTIFIER = 26;

    public static final int BEFORE_DOCTYPE_SYSTEM_IDENTIFIER = 27;

    public static final int DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED = 28;

    public static final int DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED = 29;

    public static final int AFTER_DOCTYPE_SYSTEM_IDENTIFIER = 30;

    public static final int BOGUS_DOCTYPE = 31;

    public static final int COMMENT_START = 32;

    public static final int COMMENT_START_DASH = 33;

    public static final int COMMENT = 34;

    public static final int COMMENT_END_DASH = 35;

    public static final int COMMENT_END = 36;

    public static final int COMMENT_END_BANG = 37;

    public static final int NON_DATA_END_TAG_NAME = 38;

    public static final int MARKUP_DECLARATION_HYPHEN = 39;

    public static final int MARKUP_DECLARATION_OCTYPE = 40;

    public static final int DOCTYPE_UBLIC = 41;

    public static final int DOCTYPE_YSTEM = 42;

    public static final int AFTER_DOCTYPE_PUBLIC_KEYWORD = 43;

    public static final int BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS = 44;

    public static final int AFTER_DOCTYPE_SYSTEM_KEYWORD = 45;

    public static final int CONSUME_CHARACTER_REFERENCE = 46;

    public static final int CONSUME_NCR = 47;

    public static final int CHARACTER_REFERENCE_TAIL = 48;

    public static final int HEX_NCR_LOOP = 49;

    // NOTE(review): "NRC" looks like a transposition of "NCR" (compare
    // HEX_NCR_LOOP). The constant is public, so the name is left as is.
    public static final int DECIMAL_NRC_LOOP = 50;

    public static final int HANDLE_NCR_VALUE = 51;

    public static final int HANDLE_NCR_VALUE_RECONSUME = 52;

    public static final int CHARACTER_REFERENCE_HILO_LOOKUP = 53;

    public static final int SELF_CLOSING_START_TAG = 54;

    public static final int CDATA_START = 55;

    public static final int CDATA_SECTION = 56;

    public static final int CDATA_RSQB = 57;

    public static final int CDATA_RSQB_RSQB = 58;

    public static final int SCRIPT_DATA_LESS_THAN_SIGN = 59;

    public static final int SCRIPT_DATA_ESCAPE_START = 60;

    public static final int SCRIPT_DATA_ESCAPE_START_DASH = 61;

    public static final int SCRIPT_DATA_ESCAPED_DASH = 62;

    public static final int SCRIPT_DATA_ESCAPED_DASH_DASH = 63;

    public static final int BOGUS_COMMENT_HYPHEN = 64;

    public static final int RAWTEXT_RCDATA_LESS_THAN_SIGN = 65;

    public static final int SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN = 66;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_START = 67;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED = 68;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN = 69;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH = 70;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH = 71;

    public static final int SCRIPT_DATA_DOUBLE_ESCAPE_END = 72;

    public static final int PROCESSING_INSTRUCTION = 73;

    public static final int PROCESSING_INSTRUCTION_QUESTION_MARK = 74;

    /**
     * Magic value for UTF-16 operations. Evaluates to 0xD800 - 0x40 = 0xD7C0.
     * NOTE(review): presumably added to (code point >> 10) to form a lead
     * surrogate; the use site is outside this part of the file.
     */
    private static final int LEAD_OFFSET = (0xD800 - (0x10000 >> 10));

    /**
     * UTF-16 code unit array containing less than and greater than for emitting
     * those characters on certain parse errors.
     */
    private static final @NoLength char[] LT_GT = { '<', '>' };

    /**
     * UTF-16 code unit array containing less than and solidus for emitting
     * those characters on certain parse errors.
     */
    private static final @NoLength char[] LT_SOLIDUS = { '<', '/' };

    /**
     * UTF-16 code unit array containing ]] for emitting those characters on
     * state transitions.
     */
    private static final @NoLength char[] RSQB_RSQB = { ']', ']' };

    /**
     * Array version of U+FFFD.
     */
    private static final @NoLength char[] REPLACEMENT_CHARACTER = { '\uFFFD' };

    // [NOCPP[

    /**
     * Array version of space.
     */
    private static final @NoLength char[] SPACE = { ' ' };

    // ]NOCPP]

    /**
     * Array version of line feed.
     */
    private static final @NoLength char[] LF = { '\n' };

    /**
     * "CDATA[" as <code>char[]</code>
     */
    private static final @NoLength char[] CDATA_LSQB = { 'C', 'D', 'A', 'T',
            'A', '[' };

    /**
     * "octype" as <code>char[]</code>
     */
    private static final @NoLength char[] OCTYPE = { 'o', 'c', 't', 'y', 'p',
            'e' };

    /**
     * "ublic" as <code>char[]</code>
     */
    private static final @NoLength char[] UBLIC = { 'u', 'b', 'l', 'i', 'c' };

    /**
     * "ystem" as <code>char[]</code>
     */
    private static final @NoLength char[] YSTEM = { 'y', 's', 't', 'e', 'm' };

    // Lower-case element names as char arrays. endTagExpectationToArray()
    // points endTagExpectationAsArray at one of these, chosen by the group of
    // the expected end tag.

    private static final char[] TITLE_ARR = { 't', 'i', 't', 'l', 'e' };

    private static final char[] SCRIPT_ARR = { 's', 'c', 'r', 'i', 'p', 't' };

    private static final char[] STYLE_ARR = { 's', 't', 'y', 'l', 'e' };

    private static final char[] PLAINTEXT_ARR = { 'p', 'l', 'a', 'i', 'n', 't',
            'e', 'x', 't' };

    private static final char[] XMP_ARR = { 'x', 'm', 'p' };

    private static final char[] TEXTAREA_ARR = { 't', 'e', 'x', 't', 'a', 'r',
            'e', 'a' };

    private static final char[] IFRAME_ARR = { 'i', 'f', 'r', 'a', 'm', 'e' };

    private static final char[] NOEMBED_ARR = { 'n', 'o', 'e', 'm', 'b', 'e',
            'd' };

    private static final char[] NOSCRIPT_ARR = { 'n', 'o', 's', 'c', 'r', 'i',
            'p', 't' };

    private static final char[] NOFRAMES_ARR = { 'n', 'o', 'f', 'r', 'a', 'm',
            'e', 's' };

    /**
     * The token handler. Receives the characters, comment, startTag and endTag
     * callbacks issued by this class.
     */
    protected final TokenHandler tokenHandler;

    /**
     * Set to <code>null</code> by both constructors; no assignment to a
     * non-null value is visible in this part of the file.
     */
    protected EncodingDeclarationHandler encodingDeclarationHandler;

    // [NOCPP[

    /**
     * The error handler. May be <code>null</code>, in which case
     * <code>err</code> and <code>warn</code> report nothing.
     */
    protected ErrorHandler errorHandler;

    // ]NOCPP]

    /**
     * Whether the previous char read was CR.
     */
    protected boolean lastCR;

    /**
     * The tokenizer state carried between calls: read at the top of
     * <code>tokenizeBuffer</code>, written by
     * <code>setStateAndEndTagExpectation</code> and reset to
     * <code>DATA</code> by <code>emitCurrentTagToken</code>.
     */
    protected int stateSave;

    /**
     * Saved return state; read at the top of <code>tokenizeBuffer</code>.
     */
    private int returnStateSave;

    protected int index;

    private boolean forceQuirks;

    // NOTE(review): the fields from here through seenDigits look like scratch
    // state for character-reference / NCR processing, judging by their names;
    // their use sites are outside this part of the file - confirm there.

    private char additional;

    private int entCol;

    private int firstCharKey;

    private int lo;

    private int hi;

    private int candidate;

    private int charRefBufMark;

    protected int value;

    private boolean seenDigits;

    /**
     * Start index of the pending run of character data in the input buffer.
     * <code>flushChars</code> emits from here and then sets it to
     * <code>Integer.MAX_VALUE</code>; <code>emitComment</code> and
     * <code>emitCurrentTagToken</code> set it to the position after the token.
     */
    protected int cstart;

    /**
     * The SAX public id for the resource being tokenized. (Only passed back
     * as part of locator data.)
     */
    private String publicId;

    /**
     * The SAX system id for the resource being tokenized. (Only passed back
     * as part of locator data.)
     */
    private String systemId;

    /**
     * Buffer for bufferable things other than those that fit the description
     * of <code>charRefBuf</code>.
     */
    private @Auto char[] strBuf;

    /**
     * Number of significant <code>char</code>s in <code>strBuf</code>.
     */
    private int strBufLen;

    /**
     * Buffer for characters that might form a character reference but may
     * end up not forming one.
     */
    private final @Auto char[] charRefBuf;

    /**
     * Number of significant <code>char</code>s in <code>charRefBuf</code>.
     */
    private int charRefBufLen;

    /**
     * Buffer for expanding NCRs falling into the Basic Multilingual Plane.
     */
    private final @Auto char[] bmpChar;

    /**
     * Buffer for expanding astral NCRs.
     */
    private final @Auto char[] astralChar;

    /**
     * The element whose end tag closes the current CDATA or RCDATA element.
     */
    protected ElementName endTagExpectation = null;

    // Points at one of the shared static *_ARR constants (see
    // endTagExpectationToArray), which is why it must not be @Auto.
    private char[] endTagExpectationAsArray; // not @Auto!

    /**
     * <code>true</code> if tokenizing an end tag
     */
    protected boolean endTag;

    /**
     * The current tag token name.
     */
    private ElementName tagName = null;

    /**
     * The current attribute name.
     */
    protected AttributeName attributeName = null;

    // [NOCPP[

    /**
     * Whether comment tokens are emitted.
     */
    private boolean wantsComments = false;

    /**
     * <code>true</code> when HTML4-specific additional errors are requested.
     */
    protected boolean html4;

    /**
     * Whether the stream is past the first 1024 bytes. Set by
     * <code>notifyAboutMetaBoundary</code>.
     */
    private boolean metaBoundaryPassed;

    // ]NOCPP]

    /**
     * The name of the current doctype token.
     */
    private @Local String doctypeName;

    /**
     * The public id of the current doctype token.
     */
    private String publicIdentifier;

    /**
     * The system id of the current doctype token.
     */
    private String systemIdentifier;

    /**
     * The attribute holder.
     */
    private HtmlAttributes attributes;

    // [NOCPP[

    /**
     * The policy for vertical tab and form feed.
     */
    private XmlViolationPolicy contentSpacePolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * The policy for comments.
     */
    private XmlViolationPolicy commentPolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * Policy handed to <code>HtmlAttributes.addAttribute</code>; never
     * <code>FATAL</code> (rejected by <code>setXmlnsPolicy</code>).
     */
    private XmlViolationPolicy xmlnsPolicy = XmlViolationPolicy.ALTER_INFOSET;

    /**
     * Policy consulted when looking up attribute names: anything other than
     * <code>ALLOW</code> is passed as <code>true</code> to
     * <code>AttributeName.nameByBuffer</code>.
     */
    private XmlViolationPolicy namePolicy = XmlViolationPolicy.ALTER_INFOSET;

    private boolean html4ModeCompatibleWithXhtml1Schemata;

    /**
     * Holds <code>AttributeName.HTML_LANG</code> when lang-to-xml:lang mapping
     * is on and <code>AttributeName.HTML</code> when it is off (see
     * <code>setMappingLangToXmlLang</code>).
     */
    private int mappingLangToXmlLang;

    // ]NOCPP]

    /**
     * When <code>true</code>, the attribute holder is dropped after every tag
     * token instead of being cleared and reused.
     */
    private final boolean newAttributesEachTime;

    private boolean shouldSuspend;

    protected boolean confident;

    /**
     * Current line number, as reported through <code>getLineNumber</code>.
     */
    private int line;

    /*
     * The line number of the current attribute. First set to the line of the
     * attribute name and if there is a value, set to the line the value
     * started on.
     */
    // CPPONLY: private int attributeLine;

    private Interner interner;

    // CPPONLY: private boolean viewingXmlSource;

    // [NOCPP[

    protected LocatorImpl ampersandLocation;

    /**
     * Java-only constructor that lets the caller choose the attribute holder
     * policy.
     *
     * @param tokenHandler
     *            the handler for receiving tokens
     * @param newAttributesEachTime
     *            whether a fresh attribute holder is used for every tag
     *            instead of reusing one
     */
    public Tokenizer(TokenHandler tokenHandler, boolean newAttributesEachTime) {
        this.tokenHandler = tokenHandler;
        this.encodingDeclarationHandler = null;
        this.newAttributesEachTime = newAttributesEachTime;
        // &CounterClockwiseContourIntegral; is the longest valid char ref and
        // the semicolon never gets appended to the buffer, so 32 code units
        // ('&' plus 31 letters) are enough.
        this.charRefBuf = new char[32];
        this.bmpChar = new char[1];
        this.astralChar = new char[2];
        this.tagName = null;
        this.attributeName = null;
        this.doctypeName = null;
        this.publicIdentifier = null;
        this.systemIdentifier = null;
        this.attributes = null;
    }

    // ]NOCPP]

    /**
     * The constructor. In the Java build the attribute holder is reused
     * between tags (<code>newAttributesEachTime</code> is <code>false</code>).
     *
     * @param tokenHandler
     *            the handler for receiving tokens
     */
    public Tokenizer(TokenHandler tokenHandler
    // CPPONLY: , boolean viewingXmlSource
    ) {
        this.tokenHandler = tokenHandler;
        this.encodingDeclarationHandler = null;
        // [NOCPP[
        this.newAttributesEachTime = false;
        // ]NOCPP]
        // &CounterClockwiseContourIntegral; is the longest valid char ref and
        // the semicolon never gets appended to the buffer, so 32 code units
        // ('&' plus 31 letters) are enough.
        this.charRefBuf = new char[32];
        this.bmpChar = new char[1];
        this.astralChar = new char[2];
        this.tagName = null;
        this.attributeName = null;
        this.doctypeName = null;
        this.publicIdentifier = null;
        this.systemIdentifier = null;
        // [NOCPP[
        this.attributes = null;
        // ]NOCPP]
        // CPPONLY: this.attributes = tokenHandler.HasBuilder() ? new HtmlAttributes(mappingLangToXmlLang) : null;
        // CPPONLY: this.newAttributesEachTime = !tokenHandler.HasBuilder();
        // CPPONLY: this.viewingXmlSource = viewingXmlSource;
    }

    /**
     * Sets the interner passed to the element, attribute and local name
     * lookups.
     *
     * @param interner
     *            the interner to use
     */
    public void setInterner(Interner interner) {
        this.interner = interner;
    }

    /**
     * Sets the ids reported through the <code>Locator</code> interface.
     *
     * @param newPublicId
     *            the value later returned by <code>getPublicId</code>
     * @param newSystemId
     *            the value later returned by <code>getSystemId</code>
     */
    public void initLocation(String newPublicId, String newSystemId) {
        this.systemId = newSystemId;
        this.publicId = newPublicId;

    }

    // CPPONLY: boolean isViewingXmlSource() {
    // CPPONLY: return viewingXmlSource;
    // CPPONLY: }

    // [NOCPP[

    /**
     * Returns the mappingLangToXmlLang.
     *
     * @return <code>true</code> if the stored mode is
     *         <code>AttributeName.HTML_LANG</code>
     */
    public boolean isMappingLangToXmlLang() {
        return mappingLangToXmlLang == AttributeName.HTML_LANG;
    }

    /**
     * Sets the mappingLangToXmlLang.
     *
     * @param mappingLangToXmlLang
     *            <code>true</code> stores <code>AttributeName.HTML_LANG</code>,
     *            <code>false</code> stores <code>AttributeName.HTML</code>
     */
    public void setMappingLangToXmlLang(boolean mappingLangToXmlLang) {
        this.mappingLangToXmlLang = mappingLangToXmlLang ? AttributeName.HTML_LANG
                : AttributeName.HTML;
    }

    /**
     * Sets the error handler.
     *
     * @see org.xml.sax.XMLReader#setErrorHandler(org.xml.sax.ErrorHandler)
     */
    public void setErrorHandler(ErrorHandler eh) {
        this.errorHandler = eh;
    }

    /**
     * Returns the error handler.
     *
     * @return the handler set with <code>setErrorHandler</code>, possibly
     *         <code>null</code>
     */
    public ErrorHandler getErrorHandler() {
        return this.errorHandler;
    }

    /**
     * Sets the commentPolicy.
     *
     * @param commentPolicy
     *            the commentPolicy to set
     */
    public void setCommentPolicy(XmlViolationPolicy commentPolicy) {
        this.commentPolicy = commentPolicy;
    }

    /**
     * Sets the contentNonXmlCharPolicy. This class stores nothing for this
     * policy: the only accepted value is <code>ALLOW</code>.
     *
     * @param contentNonXmlCharPolicy
     *            the contentNonXmlCharPolicy to set
     * @throws IllegalArgumentException
     *             if the argument is anything other than
     *             <code>XmlViolationPolicy.ALLOW</code>
     */
    public void setContentNonXmlCharPolicy(
            XmlViolationPolicy contentNonXmlCharPolicy) {
        if (contentNonXmlCharPolicy != XmlViolationPolicy.ALLOW) {
            throw new IllegalArgumentException(
                    "Must use ErrorReportingTokenizer to set contentNonXmlCharPolicy to non-ALLOW.");
        }
    }

    /**
     * Sets the contentSpacePolicy.
     *
     * @param contentSpacePolicy
     *            the contentSpacePolicy to set
     */
    public void setContentSpacePolicy(XmlViolationPolicy contentSpacePolicy) {
        this.contentSpacePolicy = contentSpacePolicy;
    }

    /**
     * Sets the xmlnsPolicy.
     *
     * @param xmlnsPolicy
     *            the xmlnsPolicy to set
     * @throws IllegalArgumentException
     *             if the argument is <code>XmlViolationPolicy.FATAL</code>
     */
    public void setXmlnsPolicy(XmlViolationPolicy xmlnsPolicy) {
        if (xmlnsPolicy == XmlViolationPolicy.FATAL) {
            throw new IllegalArgumentException("Can't use FATAL here.");
        }
        this.xmlnsPolicy = xmlnsPolicy;
    }

    /**
     * Sets the namePolicy.
     *
     * @param namePolicy
     *            the namePolicy to set
     */
    public void setNamePolicy(XmlViolationPolicy namePolicy) {
        this.namePolicy = namePolicy;
    }

    /**
     * Sets the html4ModeCompatibleWithXhtml1Schemata.
     *
     * @param html4ModeCompatibleWithXhtml1Schemata
     *            the html4ModeCompatibleWithXhtml1Schemata to set
     */
    public void setHtml4ModeCompatibleWithXhtml1Schemata(
            boolean html4ModeCompatibleWithXhtml1Schemata) {
        this.html4ModeCompatibleWithXhtml1Schemata = html4ModeCompatibleWithXhtml1Schemata;
    }

    // ]NOCPP]

    // For the token handler to call
    /**
     * Sets the tokenizer state and the associated element name. This should
     * only ever be used to put the tokenizer into one of the states that have
     * a special end tag expectation.
     *
     * When the state is <code>DATA</code> only the state is stored; the name
     * is not looked up and the previous end tag expectation is left in place.
     *
     * @param specialTokenizerState
     *            the tokenizer state to set
     * @param endTagExpectation
     *            the expected end tag for transitioning back to normal
     */
    public void setStateAndEndTagExpectation(int specialTokenizerState,
            @Local String endTagExpectation) {
        this.stateSave = specialTokenizerState;
        if (specialTokenizerState == Tokenizer.DATA) {
            return;
        }
        // Resolve the local name to an ElementName through a temporary buffer.
        @Auto char[] asArray = Portability.newCharArrayFromLocal(endTagExpectation);
        this.endTagExpectation = ElementName.elementNameByBuffer(asArray, 0,
                asArray.length, interner);
        endTagExpectationToArray();
    }

    /**
     * Sets the tokenizer state and the associated element name. This should
     * only ever be used to put the tokenizer into one of the states that have
     * a special end tag expectation.
     *
     * @param specialTokenizerState
     *            the tokenizer state to set
     * @param endTagExpectation
     *            the expected end tag for transitioning back to normal
     */
    public void setStateAndEndTagExpectation(int specialTokenizerState,
            ElementName endTagExpectation) {
        this.stateSave = specialTokenizerState;
        this.endTagExpectation = endTagExpectation;
        endTagExpectationToArray();
    }

    /**
     * Points <code>endTagExpectationAsArray</code> at the shared char array
     * that spells the name of the current <code>endTagExpectation</code>,
     * selected by the element's group. For any other group the assertion
     * fires (when assertions are enabled) and the array is left unchanged.
     */
    private void endTagExpectationToArray() {
        switch (endTagExpectation.getGroup()) {
            case TreeBuilder.TITLE:
                endTagExpectationAsArray = TITLE_ARR;
                return;
            case TreeBuilder.SCRIPT:
                endTagExpectationAsArray = SCRIPT_ARR;
                return;
            case TreeBuilder.STYLE:
                endTagExpectationAsArray = STYLE_ARR;
                return;
            case TreeBuilder.PLAINTEXT:
                endTagExpectationAsArray = PLAINTEXT_ARR;
                return;
            case TreeBuilder.XMP:
                endTagExpectationAsArray = XMP_ARR;
                return;
            case TreeBuilder.TEXTAREA:
                endTagExpectationAsArray = TEXTAREA_ARR;
                return;
            case TreeBuilder.IFRAME:
                endTagExpectationAsArray = IFRAME_ARR;
                return;
            case TreeBuilder.NOEMBED:
                endTagExpectationAsArray = NOEMBED_ARR;
                return;
            case TreeBuilder.NOSCRIPT:
                endTagExpectationAsArray = NOSCRIPT_ARR;
                return;
            case TreeBuilder.NOFRAMES:
                endTagExpectationAsArray = NOFRAMES_ARR;
                return;
            default:
                assert false: "Bad end tag expectation.";
                return;
        }
    }

    /**
     * For C++ use only.
     *
     * @param line
     *            the line number to report from <code>getLineNumber</code>
     */
    public void setLineNumber(int line) {
        // CPPONLY: this.attributeLine = line; // XXX is this needed?
        this.line = line;
    }

    // start Locator impl

    /**
     * @see org.xml.sax.Locator#getLineNumber()
     */
    @Inline public int getLineNumber() {
        return line;
    }

    // [NOCPP[

    /**
     * Always returns -1: this class does not track columns.
     *
     * @see org.xml.sax.Locator#getColumnNumber()
     */
    @Inline public int getColumnNumber() {
        return -1;
    }

    /**
     * @see org.xml.sax.Locator#getPublicId()
     */
    public String getPublicId() {
        return publicId;
    }

    /**
     * @see org.xml.sax.Locator#getSystemId()
     */
    public String getSystemId() {
        return systemId;
    }

    // end Locator impl

    // end public API

    /**
     * Records that the meta boundary has been passed; from then on a
     * <code>charset</code> attribute on a <code>meta</code> element is
     * reported as an error when the attribute is added.
     */
    public void notifyAboutMetaBoundary() {
        metaBoundaryPassed = true;
    }

    /**
     * Turns on the HTML4-specific additional errors.
     */
    void turnOnAdditionalHtml4Errors() {
        html4 = true;
    }

    // ]NOCPP]

    /**
     * Returns an attribute holder with no attributes: a fresh instance when
     * <code>newAttributesEachTime</code> is set, otherwise the shared
     * <code>HtmlAttributes.EMPTY_ATTRIBUTES</code>.
     *
     * @return an empty attribute holder
     */
    HtmlAttributes emptyAttributes() {
        // [NOCPP[
        if (newAttributesEachTime) {
            return new HtmlAttributes(mappingLangToXmlLang);
        } else {
            // ]NOCPP]
            return HtmlAttributes.EMPTY_ATTRIBUTES;
            // [NOCPP[
        }
        // ]NOCPP]
    }

    /**
     * Appends one code unit to <code>charRefBuf</code>. There is no bounds
     * check in the Java build.
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    @Inline private void appendCharRefBuf(char c) {
        // CPPONLY: assert charRefBufLen < charRefBuf.length:
        // CPPONLY: "RELEASE: Attempted to overrun charRefBuf!";
        charRefBuf[charRefBufLen++] = c;
    }

    /**
     * Disposes of the pending contents of <code>charRefBuf</code> according
     * to the return state: for <code>DATA</code> and <code>RCDATA</code> (the
     * two states the mask maps to zero) the characters are emitted to the
     * token handler, for every other state they are appended to
     * <code>strBuf</code>. Either way <code>charRefBuf</code> ends up empty.
     *
     * @param returnState
     *            the state to return to after the character reference
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitOrAppendCharRefBuf(int returnState) throws SAXException {
        if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
            appendCharRefBufToStrBuf();
        } else {
            if (charRefBufLen > 0) {
                tokenHandler.characters(charRefBuf, 0, charRefBufLen);
                charRefBufLen = 0;
            }
        }
    }

    /**
     * Marks <code>strBuf</code> as empty once its contents have been consumed.
     */
    @Inline private void clearStrBufAfterUse() {
        strBufLen = 0;
    }

    /**
     * Marks <code>strBuf</code> as empty before filling it, asserting that the
     * previous user already cleared it.
     */
    @Inline private void clearStrBufBeforeUse() {
        assert strBufLen == 0: "strBufLen not reset after previous use!";
        strBufLen = 0; // no-op in the absence of bugs
    }

    /**
     * Marks <code>strBuf</code> as empty, asserting that it holds exactly one
     * hyphen.
     */
    @Inline private void clearStrBufAfterOneHyphen() {
        assert strBufLen == 1: "strBufLen length not one!";
        assert strBuf[0] == '-': "strBuf does not start with a hyphen!";
        strBufLen = 0;
    }

    /**
     * Appends to the buffer.
     *
     * @param c
     *            the UTF-16 code unit to append
     */
    @Inline private void appendStrBuf(char c) {
        // CPPONLY: assert strBufLen < strBuf.length: "Previous buffer length insufficient.";
        // CPPONLY: if (strBufLen == strBuf.length) {
        // CPPONLY: if (!EnsureBufferSpace(1)) {
        // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
        // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
        // CPPONLY: }
        strBuf[strBufLen++] = c;
    }

    /**
     * Returns the contents of the buffer as a String and empties the buffer.
     * Besides error reporting, <code>addAttributeWithValue</code> uses this
     * to obtain the attribute value.
     *
     * <p>
     * C++ memory note: The return value must be released.
     *
     * @return the buffer as a string
     */
    protected String strBufToString() {
        String str = Portability.newStringFromBuffer(strBuf, 0, strBufLen
        // CPPONLY: , tokenHandler
        );
        clearStrBufAfterUse();
        return str;
    }

    /**
     * Stores the contents of the buffer as a local name in
     * <code>doctypeName</code> and empties the buffer. The stored name is
     * released in emitDoctypeToken().
     */
    private void strBufToDoctypeName() {
        doctypeName = Portability.newLocalNameFromBuffer(strBuf, 0, strBufLen,
                interner);
        clearStrBufAfterUse();
    }

    /**
     * Emits the buffer as character tokens.
     *
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitStrBuf() throws SAXException {
        if (strBufLen > 0) {
            tokenHandler.characters(strBuf, 0, strBufLen);
            clearStrBufAfterUse();
        }
    }

    /**
     * Appends a hyphen to the comment buffer, applying the comment policy in
     * the Java build: <code>ALTER_INFOSET</code> puts a space in front of the
     * hyphen and warns, <code>ALLOW</code> only warns, <code>FATAL</code>
     * reports a fatal error and appends nothing.
     *
     * @throws SAXException
     *             if the error handler threw, or always under
     *             <code>FATAL</code>
     */
    @Inline private void appendSecondHyphenToBogusComment() throws SAXException {
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                appendStrBuf(' ');
                // FALLTHROUGH
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf('-');
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }

    // [NOCPP[
    /**
     * Handles a trailing hyphen according to the comment policy:
     * <code>ALTER_INFOSET</code> appends a space and warns,
     * <code>ALLOW</code> only warns, <code>FATAL</code> reports a fatal error.
     *
     * @throws SAXException
     *             if the error handler threw, or always under
     *             <code>FATAL</code>
     */
    private void maybeAppendSpaceToBogusComment() throws SAXException {
        switch (commentPolicy) {
            case ALTER_INFOSET:
                appendStrBuf(' ');
                // FALLTHROUGH
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to a trailing hyphen in a comment.");
                break;
        }
    }

    // ]NOCPP]

    /**
     * Reports consecutive hyphens and appends <code>c</code> to the comment
     * buffer. Under <code>ALTER_INFOSET</code> the last buffered code unit is
     * first replaced by a space followed by a hyphen, so the buffer grows by
     * one extra code unit; under <code>FATAL</code> nothing is appended.
     *
     * @param c
     *            the UTF-16 code unit to append
     * @throws SAXException
     *             if the error handler threw, or always under
     *             <code>FATAL</code>
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
            throws SAXException {
        errConsecutiveHyphens();
        // [NOCPP[
        switch (commentPolicy) {
            case ALTER_INFOSET:
                strBufLen--;
                // WARNING!!! This expands the worst case of the buffer length
                // given the length of input!
                appendStrBuf(' ');
                appendStrBuf('-');
                // FALLTHROUGH
            case ALLOW:
                warn("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                // ]NOCPP]
                appendStrBuf(c);
                // [NOCPP[
                break;
            case FATAL:
                fatal("The document is not mappable to XML 1.0 due to two consecutive hyphens in a comment.");
                break;
        }
        // ]NOCPP]
    }

    /**
     * Appends a range of code units to <code>strBuf</code>. The Java build
     * performs no capacity check here; <code>strBuf</code> must already be
     * large enough.
     *
     * @param buffer
     *            the source array
     * @param offset
     *            index of the first code unit to copy
     * @param length
     *            number of code units to copy
     */
    private void appendStrBuf(@NoLength char[] buffer, int offset, int length) {
        int newLen = strBufLen + length;
        // CPPONLY: assert newLen <= strBuf.length: "Previous buffer length insufficient.";
        // CPPONLY: if (strBuf.length < newLen) {
        // CPPONLY: if (!EnsureBufferSpace(length)) {
        // CPPONLY: assert false: "RELEASE: Unable to recover from buffer reallocation failure";
        // CPPONLY: } // TODO: Add telemetry when outer if fires but inner does not
        // CPPONLY: }
        System.arraycopy(buffer, offset, strBuf, strBufLen, length);
        strBufLen = newLen;
    }

    /**
     * Append the contents of the char reference buffer to the main one and
     * empty the char reference buffer.
     */
    @Inline private void appendCharRefBufToStrBuf() {
        appendStrBuf(charRefBuf, 0, charRefBufLen);
        charRefBufLen = 0;
    }

    /**
     * Emits the current comment token (in the Java build only when
     * <code>wantsComments</code> is set), empties the buffer and moves
     * <code>cstart</code> to <code>pos + 1</code>.
     *
     * @param provisionalHyphens
     *            number of code units at the end of the buffer that are left
     *            out of the emitted comment
     * @param pos
     *            position in the input buffer; character data resumes at the
     *            following index
     *
     * @throws SAXException
     *             if the token handler threw
     */
    private void emitComment(int provisionalHyphens, int pos)
            throws SAXException {
        // [NOCPP[
        if (wantsComments) {
            // ]NOCPP]
            tokenHandler.comment(strBuf, 0, strBufLen
                    - provisionalHyphens);
            // [NOCPP[
        }
        // ]NOCPP]
        clearStrBufAfterUse();
        cstart = pos + 1;
    }

    /**
     * Flushes coalesced character tokens: emits <code>buf[cstart, pos)</code>
     * when that range is non-empty and then sets <code>cstart</code> to
     * <code>Integer.MAX_VALUE</code>.
     *
     * @param buf
     *            the input buffer the pending run lives in
     * @param pos
     *            index just past the last code unit to emit
     *
     * @throws SAXException
     *             if the token handler threw
     */
    protected void flushChars(@NoLength char[] buf, int pos)
            throws SAXException {
        if (pos > cstart) {
            tokenHandler.characters(buf, cstart, pos - cstart);
        }
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Reports a condition that would make the infoset incompatible with XML
     * 1.0 as fatal.
- * - * @param message - * the message - * @throws SAXException - * @throws SAXParseException - */ - public void fatal(String message) throws SAXException { - SAXParseException spe = new SAXParseException(message, this); - if (errorHandler != null) { - errorHandler.fatalError(spe); - } - throw spe; - } - - /** - * Reports a Parse Error. - * - * @param message - * the message - * @throws SAXException - */ - public void err(String message) throws SAXException { - if (errorHandler == null) { - return; - } - SAXParseException spe = new SAXParseException(message, this); - errorHandler.error(spe); - } - - public void errTreeBuilder(String message) throws SAXException { - ErrorHandler eh = null; - if (tokenHandler instanceof TreeBuilder<?>) { - TreeBuilder<?> treeBuilder = (TreeBuilder<?>) tokenHandler; - eh = treeBuilder.getErrorHandler(); - } - if (eh == null) { - eh = errorHandler; - } - if (eh == null) { - return; - } - SAXParseException spe = new SAXParseException(message, this); - eh.error(spe); - } - - /** - * Reports a warning - * - * @param message - * the message - * @throws SAXException - */ - public void warn(String message) throws SAXException { - if (errorHandler == null) { - return; - } - SAXParseException spe = new SAXParseException(message, this); - errorHandler.warning(spe); - } - - private void strBufToElementNameString() { - tagName = ElementName.elementNameByBuffer(strBuf, 0, strBufLen, - interner); - clearStrBufAfterUse(); - } - - private int emitCurrentTagToken(boolean selfClosing, int pos) - throws SAXException { - cstart = pos + 1; - maybeErrSlashInEndTag(selfClosing); - stateSave = Tokenizer.DATA; - HtmlAttributes attrs = (attributes == null ? HtmlAttributes.EMPTY_ATTRIBUTES - : attributes); - if (endTag) { - /* - * When an end tag token is emitted, the content model flag must be - * switched to the PCDATA state. 
*/
            maybeErrAttributesOnEndTag(attrs);
            // CPPONLY: if (!viewingXmlSource) {
            tokenHandler.endTag(tagName);
            // CPPONLY: }
            // CPPONLY: if (newAttributesEachTime) {
            // CPPONLY: Portability.delete(attributes);
            // CPPONLY: attributes = null;
            // CPPONLY: }
        } else {
            // CPPONLY: if (viewingXmlSource) {
            // CPPONLY: assert newAttributesEachTime;
            // CPPONLY: Portability.delete(attributes);
            // CPPONLY: attributes = null;
            // CPPONLY: } else {
            tokenHandler.startTag(tagName, attrs, selfClosing);
            // CPPONLY: }
        }
        tagName.release();
        tagName = null;
        if (newAttributesEachTime) {
            attributes = null;
        } else {
            attributes.clear(mappingLangToXmlLang);
        }
        /*
         * The token handler may have called setStateAndEndTagExpectation
         * and changed stateSave since the start of this method.
         */
        return stateSave;
    }

    /**
     * Finishes the attribute name currently accumulated in {@code strBuf}.
     *
     * <p>Resolves the first {@code strBufLen} buffered characters to an
     * {@code AttributeName} via {@code AttributeName.nameByBuffer}, clears
     * the buffer, and creates the {@code attributes} holder if it is still
     * {@code null}. If the holder already contains the name, a
     * duplicate-attribute error is reported, the name is released and
     * {@code attributeName} is set to {@code null};
     * {@code addAttributeWithValue} and {@code addAttributeWithoutValue}
     * test for that {@code null} and clear the string buffer instead of
     * adding an attribute.
     *
     * <p>NOTE(review): the {@code [NOCPP[}, {@code ]NOCPP]} and
     * {@code CPPONLY:} marker comments in this file look like directives
     * for a Java-to-C++ translation step -- confirm before moving or
     * reformatting them.
     *
     * @throws SAXException propagated from the helpers called here
     */
    private void attributeNameComplete() throws SAXException {
        attributeName = AttributeName.nameByBuffer(strBuf, 0, strBufLen
        // [NOCPP[
                , namePolicy != XmlViolationPolicy.ALLOW
                // ]NOCPP]
                , interner);
        clearStrBufAfterUse();

        if (attributes == null) {
            attributes = new HtmlAttributes(mappingLangToXmlLang);
        }

        /*
         * When the user agent leaves the attribute name state (and before
         * emitting the tag token, if appropriate), the complete attribute's
         * name must be compared to the other attributes on the same token; if
         * there is already an attribute on the token with the exact same name,
         * then this is a parse error and the new attribute must be dropped,
         * along with the value that gets associated with it (if any).
         */
        if (attributes.contains(attributeName)) {
            errDuplicateAttribute();
            attributeName.release();
            attributeName = null;
        }
    }

    /**
     * Adds the pending attribute to {@code attributes} when the markup gave
     * it no value.
     *
     * <p>Always calls {@code noteAttributeWithoutValue()} first. Inside the
     * marked Java-only region it reports an error when
     * {@code metaBoundaryPassed} is set and the pending name/tag are
     * {@code AttributeName.CHARSET} on {@code ElementName.META}.
     *
     * <p>When {@code attributeName} is non-null the attribute is added and
     * {@code attributeName} is reset to {@code null}:
     * <ul>
     * <li>with {@code html4} set and a boolean attribute, the value is the
     * attribute's own local name if
     * {@code html4ModeCompatibleWithXhtml1Schemata} is set, otherwise the
     * empty string;</li>
     * <li>with {@code html4} set and a non-boolean attribute other than
     * {@code AttributeName.BORDER}, an error is reported and the attribute
     * is added with the empty string;</li>
     * <li>without {@code html4}, a warning is issued for {@code SRC} and
     * {@code HREF} and the attribute is added with
     * {@code Portability.newEmptyString()}.</li>
     * </ul>
     * When {@code attributeName} is {@code null} (set that way by
     * {@code attributeNameComplete} for a duplicate) only the string buffer
     * is cleared.
     *
     * @throws SAXException propagated from the error/warning reporting and
     *         attribute helpers called here
     */
    private void addAttributeWithoutValue() throws SAXException {
        noteAttributeWithoutValue();

        // [NOCPP[
        if (metaBoundaryPassed && AttributeName.CHARSET == attributeName
                && ElementName.META == tagName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            // [NOCPP[
            if (html4) {
                if (attributeName.isBoolean()) {
                    if (html4ModeCompatibleWithXhtml1Schemata) {
                        attributes.addAttribute(attributeName,
                                attributeName.getLocal(AttributeName.HTML),
                                xmlnsPolicy);
                    } else {
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                } else {
                    // NOTE(review): for AttributeName.BORDER this branch
                    // neither reports an error nor adds the attribute, so
                    // the valueless attribute is dropped -- confirm that
                    // this is intended.
                    if (AttributeName.BORDER != attributeName) {
                        err("Attribute value omitted for a non-boolean attribute. (HTML4-only error.)");
                        attributes.addAttribute(attributeName, "", xmlnsPolicy);
                    }
                }
            } else {
                if (AttributeName.SRC == attributeName
                        || AttributeName.HREF == attributeName) {
                    warn("Attribute \u201C"
                            + attributeName.getLocal(AttributeName.HTML)
                            + "\u201D without an explicit value seen. The attribute may be dropped by IE7.");
                }
                // ]NOCPP]
                attributes.addAttribute(attributeName,
                        Portability.newEmptyString()
                        // [NOCPP[
                        , xmlnsPolicy
                        // ]NOCPP]
                        // CPPONLY: , attributeLine
                );
                // [NOCPP[
            }
            // ]NOCPP]
            attributeName = null; // attributeName has been adopted by the
            // |attributes| object
        } else {
            clearStrBufAfterUse();
        }
    }

    /**
     * Adds the pending attribute to {@code attributes} using the value
     * accumulated in the string buffer.
     *
     * <p>Inside the marked Java-only region it reports an error when
     * {@code metaBoundaryPassed} is set and the pending name/tag are
     * {@code AttributeName.CHARSET} on {@code ElementName.META}.
     *
     * <p>When {@code attributeName} is non-null, the buffer is converted
     * with {@code strBufToString()}; if the token is not an end tag, both
     * {@code html4} and {@code html4ModeCompatibleWithXhtml1Schemata} are
     * set, and {@code attributeName.isCaseFolded()} is true, the value is
     * ASCII-lowercased first. The attribute is then added and
     * {@code attributeName} is reset to {@code null}. When
     * {@code attributeName} is {@code null} (a duplicate), the buffered
     * value is discarded.
     *
     * @throws SAXException propagated from the error reporting and
     *         attribute helpers called here
     */
    private void addAttributeWithValue() throws SAXException {
        // [NOCPP[
        if (metaBoundaryPassed && ElementName.META == tagName
                && AttributeName.CHARSET == attributeName) {
            err("A \u201Ccharset\u201D attribute on a \u201Cmeta\u201D element found after the first 512 bytes.");
        }
        // ]NOCPP]
        if (attributeName != null) {
            String val = strBufToString(); // Ownership transferred to
            // HtmlAttributes
            // CPPONLY: if (mViewSource) {
            // CPPONLY: mViewSource.MaybeLinkifyAttributeValue(attributeName, val);
            // CPPONLY: }
            // [NOCPP[
            if (!endTag && html4 && html4ModeCompatibleWithXhtml1Schemata
                    && attributeName.isCaseFolded()) {
                val = newAsciiLowerCaseStringFromString(val);
            }
            // ]NOCPP]
            attributes.addAttribute(attributeName, val
            // [NOCPP[
                    , xmlnsPolicy
                    // ]NOCPP]
                    // CPPONLY: , attributeLine
            );
            attributeName = null; // attributeName has been adopted by the
            // |attributes| object
        } else {
            // We have a duplicate attribute. Explicitly discard its value.
            clearStrBufAfterUse();
        }
    }

    // [NOCPP[

    /**
     * Returns a copy of {@code str} in which the ASCII letters {@code A}
     * through {@code Z} are replaced by their lowercase forms; every other
     * character is copied unchanged.
     *
     * @param str the string to convert; may be {@code null}
     * @return a new lowercased string, or {@code null} if {@code str} is
     *         {@code null}
     */
    private static String newAsciiLowerCaseStringFromString(String str) {
        if (str == null) {
            return null;
        }
        char[] buf = new char[str.length()];
        for (int i = 0; i < str.length(); i++) {
            char c = str.charAt(i);
            if (c >= 'A' && c <= 'Z') {
                // 'a' - 'A' == 0x20 in ASCII.
                c += 0x20;
            }
            buf[i] = c;
        }
        return new String(buf);
    }

    /**
     * Hook called by {@code start()} after the token handler has been
     * notified. The implementation here is empty; being {@code protected},
     * it is presumably meant for subclasses to override.
     *
     * @throws SAXException declared for overriding implementations; this
     *         empty body throws nothing
     */
    protected void startErrorReporting() throws SAXException {

    }

    // ]NOCPP]

    /**
     * Starts a tokenization run: calls {@code initializeWithoutStarting()},
     * notifies the token handler through {@code startTokenization(this)},
     * and then, inside the marked Java-only region, calls
     * {@code startErrorReporting()}.
     *
     * @throws SAXException propagated from the calls above
     */
    public void start() throws SAXException {
        initializeWithoutStarting();
        tokenHandler.startTokenization(this);
        // [NOCPP[
        startErrorReporting();
        // ]NOCPP]
    }

    /**
     * Tokenizes the characters of {@code buffer} between its start and end
     * indices.
     *
     * <p>Resumes from {@code stateSave} and {@code returnStateSave}, resets
     * {@code shouldSuspend} and {@code lastCR}, runs {@code stateLoop} over
     * the range, and finally moves the buffer's start index past the
     * characters that were consumed.
     *
     * @param buffer the UTF-16 input; its start index is updated before
     *        this method returns
     * @return the value of the {@code lastCR} field after the run
     * @throws SAXException propagated from {@code ensureBufferSpace} and
     *         {@code stateLoop}
     */
    public boolean tokenizeBuffer(UTF16Buffer buffer) throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;
        char c = '\u0000';
        shouldSuspend = false;
        lastCR = false;

        int start = buffer.getStart();
        int end = buffer.getEnd();

        // In C++, the caller of tokenizeBuffer needs to do this explicitly.
        // [NOCPP[
        ensureBufferSpace(end - start);
        // ]NOCPP]

        /**
         * The index of the last <code>char</code> read from <code>buf</code>.
         */
        int pos = start - 1;

        /**
         * The index of the first <code>char</code> in <code>buf</code> that is
         * part of a coalesced run of character tokens or
         * <code>Integer.MAX_VALUE</code> if there is not a current run being
         * coalesced.
         */
        // NOTE(review): no declaration follows the comment above; it
        // appears to describe the cstart field assigned in this switch.
        switch (state) {
            case DATA:
            case RCDATA:
            case SCRIPT_DATA:
            case PLAINTEXT:
            case RAWTEXT:
            case CDATA_SECTION:
            case SCRIPT_DATA_ESCAPED:
            case SCRIPT_DATA_ESCAPE_START:
            case SCRIPT_DATA_ESCAPE_START_DASH:
            case SCRIPT_DATA_ESCAPED_DASH:
            case SCRIPT_DATA_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_START:
            case SCRIPT_DATA_DOUBLE_ESCAPED:
            case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH:
            case SCRIPT_DATA_DOUBLE_ESCAPE_END:
                cstart = start;
                break;
            default:
                cstart = Integer.MAX_VALUE;
                break;
        }

        /**
         * The number of <code>char</code>s in <code>buf</code> that have
         * meaning. (The rest of the array is garbage and should not be
         * examined.)
         */
        // NOTE(review): no declaration follows the comment above either;
        // presumably it refers to the end index handed to stateLoop.
        // CPPONLY: if (mViewSource) {
        // CPPONLY: mViewSource.SetBuffer(buffer);
        // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: mViewSource.DropBuffer((pos == buffer.getEnd()) ? pos : pos + 1);
        // CPPONLY: } else {
        // CPPONLY: pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState, buffer.getEnd());
        // CPPONLY: }
        // [NOCPP[
        pos = stateLoop(state, c, pos, buffer.getBuffer(), false, returnState,
                end);
        // ]NOCPP]
        // pos is the index of the last char read, so when the loop stopped
        // before the end the next unread char is at pos + 1.
        if (pos == end) {
            // exiting due to end of buffer
            buffer.setStart(pos);
        } else {
            buffer.setStart(pos + 1);
        }
        return lastCR;
    }

    // [NOCPP[
    /**
     * Makes sure {@code strBuf} (and the token handler's buffer) can hold
     * the worst-case output for {@code inputLength} more input characters.
     *
     * <p>The worst case is {@code strBufLen + inputLength + charRefBufLen
     * + 2}; that figure is passed to {@code tokenHandler.ensureBufferSpace}
     * and then doubled for {@code strBuf} only when {@code commentPolicy}
     * is {@code XmlViolationPolicy.ALTER_INFOSET}. {@code strBuf} is
     * allocated if {@code null}, or replaced by a larger array holding the
     * first {@code strBufLen} existing characters if it is too small.
     *
     * @param inputLength number of input characters about to be processed
     *        ({@code tokenizeBuffer} passes {@code end - start})
     * @throws SAXException propagated from
     *         {@code tokenHandler.ensureBufferSpace}
     */
    private void ensureBufferSpace(int inputLength) throws SAXException {
        // Add 2 to account for emissions of LT_GT, LT_SOLIDUS and RSQB_RSQB.
        // Adding to the general worst case instead of only the
        // TreeBuilder-exposed worst case to avoid re-introducing a bug when
        // unifying the tokenizer and tree builder buffers in the future.
        int worstCase = strBufLen + inputLength + charRefBufLen + 2;
        tokenHandler.ensureBufferSpace(worstCase);
        if (commentPolicy == XmlViolationPolicy.ALTER_INFOSET) {
            // When altering infoset, if the comment contents are consecutive
            // hyphens, each hyphen generates a space, too. These buffer
            // contents never get emitted as characters() to the tokenHandler,
            // which is why this calculation happens after the call to
            // ensureBufferSpace on tokenHandler.
            worstCase *= 2;
        }
        if (strBuf == null) {
            // Add an arbitrary small value to avoid immediate reallocation
            // once there are a few characters in the buffer.
            strBuf = new char[worstCase + 128];
        } else if (worstCase > strBuf.length) {
            // HotSpot reportedly allocates memory with 8-byte accuracy, so
            // there's no point in trying to do math here to avoid slop.
            // Maybe we should add some small constant to worstCase here
            // but not doing that without profiling. In C++ with jemalloc,
            // the corresponding method should do math to round up here
            // to avoid slop.
            char[] newBuf = new char[worstCase];
            System.arraycopy(strBuf, 0, newBuf, 0, strBufLen);
            strBuf = newBuf;
        }
    }
    // ]NOCPP]

    @SuppressWarnings("unused") private int stateLoop(int state, char c,
            int pos, @NoLength char[] buf, boolean reconsume, int returnState,
            int endPos) throws SAXException {
        /*
         * Idioms used in this code:
         *
         *
         * Consuming the next input character
         *
         * To consume the next input character, the code does this: if (++pos ==
         * endPos) { break stateloop; } c = checkChar(buf, pos);
         *
         *
         * Staying in a state
         *
         * When there's a state that the tokenizer may stay in over multiple
         * input characters, the state has a wrapper |for(;;)| loop and staying
         * in the state continues the loop.
         *
         *
         * Switching to another state
         *
         * To switch to another state, the code sets the state variable to the
         * magic number of the new state.
Then it either continues stateloop or - * breaks out of the state's own wrapper loop if the target state is - * right after the current state in source order. (This is a partial - * workaround for Java's lack of goto.) - * - * - * Reconsume support - * - * The spec sometimes says that an input character is reconsumed in - * another state. If a state can ever be entered so that an input - * character can be reconsumed in it, the state's code starts with an - * |if (reconsume)| that sets reconsume to false and skips over the - * normal code for consuming a new character. - * - * To reconsume the current character in another state, the code sets - * |reconsume| to true and then switches to the other state. - * - * - * Emitting character tokens - * - * This method emits character tokens lazily. Whenever a new range of - * character tokens starts, the field cstart must be set to the start - * index of the range. The flushChars() method must be called at the end - * of a range to flush it. - * - * - * U+0000 handling - * - * The various states have to handle the replacement of U+0000 with - * U+FFFD. However, if U+0000 would be reconsumed in another state, the - * replacement doesn't need to happen, because it's handled by the - * reconsuming state. - * - * - * LF handling - * - * Every state needs to increment the line number upon LF unless the LF - * gets reconsumed by another state which increments the line number. - * - * - * CR handling - * - * Every state needs to handle CR unless the CR gets reconsumed and is - * handled by the reconsuming state. The CR needs to be handled as if it - * were an LF, the lastCR field must be set to true and then this - * method must return. The IO driver will then swallow the next - * character if it is an LF to coalesce CRLF. 
- */ - stateloop: for (;;) { - switch (state) { - case DATA: - dataloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - switch (c) { - case '&': - /* - * U+0026 AMPERSAND (&) Switch to the character - * reference in data state. - */ - flushChars(buf, pos); - assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; - appendCharRefBuf(c); - setAdditionalAndRememberAmpersandLocation('\u0000'); - returnState = state; - state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); - continue stateloop; - case '<': - /* - * U+003C LESS-THAN SIGN (<) Switch to the tag - * open state. - */ - flushChars(buf, pos); - - state = transition(state, Tokenizer.TAG_OPEN, reconsume, pos); - break dataloop; // FALL THROUGH continue - // stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - continue; - case '\r': - emitCarriageReturn(buf, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Anything else Emit the input character as a - * character token. - * - * Stay in the data state. - */ - continue; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case TAG_OPEN: - tagopenloop: for (;;) { - /* - * The behavior of this state depends on the content - * model flag. - */ - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * If the content model flag is set to the PCDATA state - * Consume the next input character: - */ - if (c >= 'A' && c <= 'Z') { - /* - * U+0041 LATIN CAPITAL LETTER A through to U+005A - * LATIN CAPITAL LETTER Z Create a new start tag - * token, - */ - endTag = false; - /* - * set its tag name to the lowercase version of the - * input character (add 0x0020 to the character's - * code point), - */ - clearStrBufBeforeUse(); - appendStrBuf((char) (c + 0x20)); - /* then switch to the tag name state. 
*/ - state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); - /* - * (Don't emit the token yet; further details will - * be filled in before it is emitted.) - */ - break tagopenloop; - // continue stateloop; - } else if (c >= 'a' && c <= 'z') { - /* - * U+0061 LATIN SMALL LETTER A through to U+007A - * LATIN SMALL LETTER Z Create a new start tag - * token, - */ - endTag = false; - /* - * set its tag name to the input character, - */ - clearStrBufBeforeUse(); - appendStrBuf(c); - /* then switch to the tag name state. */ - state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); - /* - * (Don't emit the token yet; further details will - * be filled in before it is emitted.) - */ - break tagopenloop; - // continue stateloop; - } - switch (c) { - case '!': - /* - * U+0021 EXCLAMATION MARK (!) Switch to the - * markup declaration open state. - */ - state = transition(state, Tokenizer.MARKUP_DECLARATION_OPEN, reconsume, pos); - continue stateloop; - case '/': - /* - * U+002F SOLIDUS (/) Switch to the close tag - * open state. - */ - state = transition(state, Tokenizer.CLOSE_TAG_OPEN, reconsume, pos); - continue stateloop; - case '?': - // CPPONLY: if (viewingXmlSource) { - // CPPONLY: state = transition(state, - // CPPONLY: Tokenizer.PROCESSING_INSTRUCTION, - // CPPONLY: reconsume, - // CPPONLY: pos); - // CPPONLY: continue stateloop; - // CPPONLY: } - /* - * U+003F QUESTION MARK (?) Parse error. - */ - errProcessingInstruction(); - /* - * Switch to the bogus comment state. - */ - clearStrBufBeforeUse(); - appendStrBuf(c); - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Parse error. - */ - errLtGt(); - /* - * Emit a U+003C LESS-THAN SIGN character token - * and a U+003E GREATER-THAN SIGN character - * token. - */ - tokenHandler.characters(Tokenizer.LT_GT, 0, 2); - /* Switch to the data state. 
*/ - cstart = pos + 1; - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - default: - /* - * Anything else Parse error. - */ - errBadCharAfterLt(c); - /* - * Emit a U+003C LESS-THAN SIGN character token - */ - tokenHandler.characters(Tokenizer.LT_GT, 0, 1); - /* - * and reconsume the current input character in - * the data state. - */ - cstart = pos; - reconsume = true; - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - } - } - // FALL THROUGH DON'T REORDER - case TAG_NAME: - tagnameloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - strBufToElementNameString(); - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * Switch to the before attribute name state. - */ - strBufToElementNameString(); - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - break tagnameloop; - // continue stateloop; - case '/': - /* - * U+002F SOLIDUS (/) Switch to the self-closing - * start tag state. - */ - strBufToElementNameString(); - state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * tag token. - */ - strBufToElementNameString(); - state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - /* - * Switch to the data state. 
- */ - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - if (c >= 'A' && c <= 'Z') { - /* - * U+0041 LATIN CAPITAL LETTER A through to - * U+005A LATIN CAPITAL LETTER Z Append the - * lowercase version of the current input - * character (add 0x0020 to the character's - * code point) to the current tag token's - * tag name. - */ - c += 0x20; - } - /* - * Anything else Append the current input - * character to the current tag token's tag - * name. - */ - appendStrBuf(c); - /* - * Stay in the tag name state. - */ - continue; - } - } - // FALLTHRU DON'T REORDER - case BEFORE_ATTRIBUTE_NAME: - beforeattributenameloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay - * in the before attribute name state. - */ - continue; - case '/': - /* - * U+002F SOLIDUS (/) Switch to the self-closing - * start tag state. - */ - state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * tag token. - */ - state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - /* - * Switch to the data state. - */ - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - case '\"': - case '\'': - case '<': - case '=': - /* - * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE - * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS - * SIGN (=) Parse error. - */ - errBadCharBeforeAttributeNameOrNull(c); - /* - * Treat it as per the "anything else" entry - * below. 
- */ - default: - /* - * Anything else Start a new attribute in the - * current tag token. - */ - if (c >= 'A' && c <= 'Z') { - /* - * U+0041 LATIN CAPITAL LETTER A through to - * U+005A LATIN CAPITAL LETTER Z Set that - * attribute's name to the lowercase version - * of the current input character (add - * 0x0020 to the character's code point) - */ - c += 0x20; - } - // CPPONLY: attributeLine = line; - /* - * Set that attribute's name to the current - * input character, - */ - clearStrBufBeforeUse(); - appendStrBuf(c); - /* - * and its value to the empty string. - */ - // Will do later. - /* - * Switch to the attribute name state. - */ - state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); - break beforeattributenameloop; - // continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case ATTRIBUTE_NAME: - attributenameloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - attributeNameComplete(); - state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * Switch to the after attribute name state. - */ - attributeNameComplete(); - state = transition(state, Tokenizer.AFTER_ATTRIBUTE_NAME, reconsume, pos); - continue stateloop; - case '/': - /* - * U+002F SOLIDUS (/) Switch to the self-closing - * start tag state. - */ - attributeNameComplete(); - addAttributeWithoutValue(); - state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); - continue stateloop; - case '=': - /* - * U+003D EQUALS SIGN (=) Switch to the before - * attribute value state. 
- */ - attributeNameComplete(); - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); - break attributenameloop; - // continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * tag token. - */ - attributeNameComplete(); - addAttributeWithoutValue(); - state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - /* - * Switch to the data state. - */ - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - case '\"': - case '\'': - case '<': - /* - * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE - * (') U+003C LESS-THAN SIGN (<) Parse error. - */ - errQuoteOrLtInAttributeNameOrNull(c); - /* - * Treat it as per the "anything else" entry - * below. - */ - default: - if (c >= 'A' && c <= 'Z') { - /* - * U+0041 LATIN CAPITAL LETTER A through to - * U+005A LATIN CAPITAL LETTER Z Append the - * lowercase version of the current input - * character (add 0x0020 to the character's - * code point) to the current attribute's - * name. - */ - c += 0x20; - } - /* - * Anything else Append the current input - * character to the current attribute's name. - */ - appendStrBuf(c); - /* - * Stay in the attribute name state. - */ - continue; - } - } - // FALLTHRU DON'T REORDER - case BEFORE_ATTRIBUTE_VALUE: - beforeattributevalueloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay - * in the before attribute value state. - */ - continue; - case '"': - /* - * U+0022 QUOTATION MARK (") Switch to the - * attribute value (double-quoted) state. 
- */ - // CPPONLY: attributeLine = line; - clearStrBufBeforeUse(); - state = transition(state, Tokenizer.ATTRIBUTE_VALUE_DOUBLE_QUOTED, reconsume, pos); - break beforeattributevalueloop; - // continue stateloop; - case '&': - /* - * U+0026 AMPERSAND (&) Switch to the attribute - * value (unquoted) state and reconsume this - * input character. - */ - // CPPONLY: attributeLine = line; - clearStrBufBeforeUse(); - reconsume = true; - state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); - noteUnquotedAttributeValue(); - continue stateloop; - case '\'': - /* - * U+0027 APOSTROPHE (') Switch to the attribute - * value (single-quoted) state. - */ - // CPPONLY: attributeLine = line; - clearStrBufBeforeUse(); - state = transition(state, Tokenizer.ATTRIBUTE_VALUE_SINGLE_QUOTED, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Parse error. - */ - errAttributeValueMissing(); - /* - * Emit the current tag token. - */ - addAttributeWithoutValue(); - state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - /* - * Switch to the data state. - */ - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - case '<': - case '=': - case '`': - /* - * U+003C LESS-THAN SIGN (<) U+003D EQUALS SIGN - * (=) U+0060 GRAVE ACCENT (`) - */ - errLtOrEqualsOrGraveInUnquotedAttributeOrNull(c); - /* - * Treat it as per the "anything else" entry - * below. - */ - default: - // [NOCPP[ - errHtml4NonNameInUnquotedAttribute(c); - // ]NOCPP] - /* - * Anything else Append the current input - * character to the current attribute's value. - */ - // CPPONLY: attributeLine = line; - clearStrBufBeforeUse(); - appendStrBuf(c); - /* - * Switch to the attribute value (unquoted) - * state. 
- */ - - state = transition(state, Tokenizer.ATTRIBUTE_VALUE_UNQUOTED, reconsume, pos); - noteUnquotedAttributeValue(); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case ATTRIBUTE_VALUE_DOUBLE_QUOTED: - attributevaluedoublequotedloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '"': - /* - * U+0022 QUOTATION MARK (") Switch to the after - * attribute value (quoted) state. - */ - addAttributeWithValue(); - - state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); - break attributevaluedoublequotedloop; - // continue stateloop; - case '&': - /* - * U+0026 AMPERSAND (&) Switch to the character - * reference in attribute value state, with the - * additional allowed character being U+0022 - * QUOTATION MARK ("). - */ - assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; - appendCharRefBuf(c); - setAdditionalAndRememberAmpersandLocation('\"'); - returnState = state; - state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); - continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append the current input - * character to the current attribute's value. - */ - appendStrBuf(c); - /* - * Stay in the attribute value (double-quoted) - * state. 
- */ - continue; - } - } - // FALLTHRU DON'T REORDER - case AFTER_ATTRIBUTE_VALUE_QUOTED: - afterattributevaluequotedloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * Switch to the before attribute name state. - */ - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - continue stateloop; - case '/': - /* - * U+002F SOLIDUS (/) Switch to the self-closing - * start tag state. - */ - state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); - break afterattributevaluequotedloop; - // continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * tag token. - */ - state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - /* - * Switch to the data state. - */ - continue stateloop; - default: - /* - * Anything else Parse error. - */ - errNoSpaceBetweenAttributes(); - /* - * Reconsume the character in the before - * attribute name state. - */ - reconsume = true; - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case SELF_CLOSING_START_TAG: - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Set the self-closing - * flag of the current tag token. Emit the current - * tag token. 
- */ - // [NOCPP[ - errHtml4XmlVoidSyntax(); - // ]NOCPP] - state = transition(state, emitCurrentTagToken(true, pos), reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - /* - * Switch to the data state. - */ - continue stateloop; - default: - /* Anything else Parse error. */ - errSlashNotFollowedByGt(); - /* - * Reconsume the character in the before attribute - * name state. - */ - reconsume = true; - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - continue stateloop; - } - // XXX reorder point - case ATTRIBUTE_VALUE_UNQUOTED: - for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - addAttributeWithValue(); - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * Switch to the before attribute name state. - */ - addAttributeWithValue(); - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - continue stateloop; - case '&': - /* - * U+0026 AMPERSAND (&) Switch to the character - * reference in attribute value state, with the - * additional allowed character being U+003E - * GREATER-THAN SIGN (>) - */ - assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; - appendCharRefBuf(c); - setAdditionalAndRememberAmpersandLocation('>'); - returnState = state; - state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * tag token. 
- */ - addAttributeWithValue(); - state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - /* - * Switch to the data state. - */ - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - case '<': - case '\"': - case '\'': - case '=': - case '`': - /* - * U+0022 QUOTATION MARK (") U+0027 APOSTROPHE - * (') U+003C LESS-THAN SIGN (<) U+003D EQUALS - * SIGN (=) U+0060 GRAVE ACCENT (`) Parse error. - */ - errUnquotedAttributeValOrNull(c); - /* - * Treat it as per the "anything else" entry - * below. - */ - // fall through - default: - // [NOCPP] - errHtml4NonNameInUnquotedAttribute(c); - // ]NOCPP] - /* - * Anything else Append the current input - * character to the current attribute's value. - */ - appendStrBuf(c); - /* - * Stay in the attribute value (unquoted) state. - */ - continue; - } - } - // XXX reorder point - case AFTER_ATTRIBUTE_NAME: - for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay - * in the after attribute name state. - */ - continue; - case '/': - /* - * U+002F SOLIDUS (/) Switch to the self-closing - * start tag state. - */ - addAttributeWithoutValue(); - state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); - continue stateloop; - case '=': - /* - * U+003D EQUALS SIGN (=) Switch to the before - * attribute value state. - */ - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_VALUE, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * tag token. 
- */ - addAttributeWithoutValue(); - state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - /* - * Switch to the data state. - */ - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - case '\"': - case '\'': - case '<': - errQuoteOrLtInAttributeNameOrNull(c); - /* - * Treat it as per the "anything else" entry - * below. - */ - default: - addAttributeWithoutValue(); - /* - * Anything else Start a new attribute in the - * current tag token. - */ - if (c >= 'A' && c <= 'Z') { - /* - * U+0041 LATIN CAPITAL LETTER A through to - * U+005A LATIN CAPITAL LETTER Z Set that - * attribute's name to the lowercase version - * of the current input character (add - * 0x0020 to the character's code point) - */ - c += 0x20; - } - /* - * Set that attribute's name to the current - * input character, - */ - clearStrBufBeforeUse(); - appendStrBuf(c); - /* - * and its value to the empty string. - */ - // Will do later. - /* - * Switch to the attribute name state. - */ - state = transition(state, Tokenizer.ATTRIBUTE_NAME, reconsume, pos); - continue stateloop; - } - } - // XXX reorder point - case MARKUP_DECLARATION_OPEN: - markupdeclarationopenloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * If the next two characters are both U+002D - * HYPHEN-MINUS characters (-), consume those two - * characters, create a comment token whose data is the - * empty string, and switch to the comment start state. - * - * Otherwise, if the next seven characters are an ASCII - * case-insensitive match for the word "DOCTYPE", then - * consume those characters and switch to the DOCTYPE - * state. 
- * - * Otherwise, if the insertion mode is - * "in foreign content" and the current node is not an - * element in the HTML namespace and the next seven - * characters are an case-sensitive match for the string - * "[CDATA[" (the five uppercase letters "CDATA" with a - * U+005B LEFT SQUARE BRACKET character before and - * after), then consume those characters and switch to - * the CDATA section state. - * - * Otherwise, is is a parse error. Switch to the bogus - * comment state. The next character that is consumed, - * if any, is the first character that will be in the - * comment. - */ - switch (c) { - case '-': - clearStrBufBeforeUse(); - appendStrBuf(c); - state = transition(state, Tokenizer.MARKUP_DECLARATION_HYPHEN, reconsume, pos); - break markupdeclarationopenloop; - // continue stateloop; - case 'd': - case 'D': - clearStrBufBeforeUse(); - appendStrBuf(c); - index = 0; - state = transition(state, Tokenizer.MARKUP_DECLARATION_OCTYPE, reconsume, pos); - continue stateloop; - case '[': - if (tokenHandler.cdataSectionAllowed()) { - clearStrBufBeforeUse(); - appendStrBuf(c); - index = 0; - state = transition(state, Tokenizer.CDATA_START, reconsume, pos); - continue stateloop; - } - // else fall through - default: - errBogusComment(); - clearStrBufBeforeUse(); - reconsume = true; - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case MARKUP_DECLARATION_HYPHEN: - markupdeclarationhyphenloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case '\u0000': - break stateloop; - case '-': - clearStrBufAfterOneHyphen(); - state = transition(state, Tokenizer.COMMENT_START, reconsume, pos); - break markupdeclarationhyphenloop; - // continue stateloop; - default: - errBogusComment(); - reconsume = true; - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case 
COMMENT_START: - commentstartloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Comment start state - * - * - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Switch to the comment - * start dash state. - */ - appendStrBuf(c); - state = transition(state, Tokenizer.COMMENT_START_DASH, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Parse error. - */ - errPrematureEndOfComment(); - /* Emit the comment token. */ - emitComment(0, pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - break stateloop; - case '\n': - appendStrBufLineFeed(); - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - break commentstartloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append the input character to - * the comment token's data. - */ - appendStrBuf(c); - /* - * Switch to the comment state. - */ - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - break commentstartloop; - // continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case COMMENT: - commentloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Comment state Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Switch to the comment - * end dash state - */ - appendStrBuf(c); - state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); - break commentloop; - // continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append the input character to - * the comment token's data. 
- */ - appendStrBuf(c); - /* - * Stay in the comment state. - */ - continue; - } - } - // FALLTHRU DON'T REORDER - case COMMENT_END_DASH: - commentenddashloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Comment end dash state Consume the next input - * character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Switch to the comment - * end state - */ - appendStrBuf(c); - state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); - break commentenddashloop; - // continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - break stateloop; - case '\n': - appendStrBufLineFeed(); - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append a U+002D HYPHEN-MINUS - * (-) character and the input character to the - * comment token's data. - */ - appendStrBuf(c); - /* - * Switch to the comment state. - */ - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case COMMENT_END: - commentendloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Comment end dash state Consume the next input - * character: - */ - switch (c) { - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the comment - * token. - */ - emitComment(2, pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '-': - /* U+002D HYPHEN-MINUS (-) Parse error. */ - /* - * Append a U+002D HYPHEN-MINUS (-) character to - * the comment token's data. - */ - adjustDoubleHyphenAndAppendToStrBufAndErr(c); - /* - * Stay in the comment end state. 
- */ - continue; - case '\r': - adjustDoubleHyphenAndAppendToStrBufCarriageReturn(); - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - break stateloop; - case '\n': - adjustDoubleHyphenAndAppendToStrBufLineFeed(); - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - continue stateloop; - case '!': - errHyphenHyphenBang(); - appendStrBuf(c); - state = transition(state, Tokenizer.COMMENT_END_BANG, reconsume, pos); - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Append two U+002D HYPHEN-MINUS (-) characters - * and the input character to the comment - * token's data. - */ - adjustDoubleHyphenAndAppendToStrBufAndErr(c); - /* - * Switch to the comment state. - */ - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - continue stateloop; - } - } - // XXX reorder point - case COMMENT_END_BANG: - for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Comment end bang state - * - * Consume the next input character: - */ - switch (c) { - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the comment - * token. - */ - emitComment(3, pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '-': - /* - * Append two U+002D HYPHEN-MINUS (-) characters - * and a U+0021 EXCLAMATION MARK (!) character - * to the comment token's data. - */ - appendStrBuf(c); - /* - * Switch to the comment end dash state. - */ - state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos); - continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append two U+002D HYPHEN-MINUS - * (-) characters, a U+0021 EXCLAMATION MARK (!) - * character, and the input character to the - * comment token's data. Switch to the comment - * state. 
- */ - appendStrBuf(c); - /* - * Switch to the comment state. - */ - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - continue stateloop; - } - } - // XXX reorder point - case COMMENT_START_DASH: - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Comment start dash state - * - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Switch to the comment end - * state - */ - appendStrBuf(c); - state = transition(state, Tokenizer.COMMENT_END, reconsume, pos); - continue stateloop; - case '>': - errPrematureEndOfComment(); - /* Emit the comment token. */ - emitComment(1, pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - break stateloop; - case '\n': - appendStrBufLineFeed(); - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Append a U+002D HYPHEN-MINUS character (-) and - * the current input character to the comment - * token's data. - */ - appendStrBuf(c); - /* - * Switch to the comment state. 
- */ - state = transition(state, Tokenizer.COMMENT, reconsume, pos); - continue stateloop; - } - // XXX reorder point - case CDATA_START: - for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - if (index < 6) { // CDATA_LSQB.length - if (c == Tokenizer.CDATA_LSQB[index]) { - appendStrBuf(c); - } else { - errBogusComment(); - reconsume = true; - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - continue stateloop; - } - index++; - continue; - } else { - clearStrBufAfterUse(); - cstart = pos; // start coalescing - reconsume = true; - state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); - break; // FALL THROUGH continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case CDATA_SECTION: - cdatasectionloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - switch (c) { - case ']': - flushChars(buf, pos); - state = transition(state, Tokenizer.CDATA_RSQB, reconsume, pos); - break cdatasectionloop; // FALL THROUGH - case '\u0000': - emitReplacementCharacter(buf, pos); - continue; - case '\r': - emitCarriageReturn(buf, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - default: - continue; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case CDATA_RSQB: - cdatarsqb: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case ']': - state = transition(state, Tokenizer.CDATA_RSQB_RSQB, reconsume, pos); - break cdatarsqb; - default: - tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, - 1); - cstart = pos; - reconsume = true; - state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case CDATA_RSQB_RSQB: - cdatarsqbrsqb: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch 
(c) { - case ']': - // Saw a third ]. Emit one ] (logically the - // first one) and stay in this state to - // remember that the last two characters seen - // have been ]]. - tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1); - continue; - case '>': - cstart = pos + 1; - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - default: - tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2); - cstart = pos; - reconsume = true; - state = transition(state, Tokenizer.CDATA_SECTION, reconsume, pos); - continue stateloop; - } - } - // XXX reorder point - case ATTRIBUTE_VALUE_SINGLE_QUOTED: - attributevaluesinglequotedloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '\'': - /* - * U+0027 APOSTROPHE (') Switch to the after - * attribute value (quoted) state. - */ - addAttributeWithValue(); - - state = transition(state, Tokenizer.AFTER_ATTRIBUTE_VALUE_QUOTED, reconsume, pos); - continue stateloop; - case '&': - /* - * U+0026 AMPERSAND (&) Switch to the character - * reference in attribute value state, with the - * + additional allowed character being U+0027 - * APOSTROPHE ('). - */ - assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; - appendCharRefBuf(c); - setAdditionalAndRememberAmpersandLocation('\''); - returnState = state; - state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); - break attributevaluesinglequotedloop; - // continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append the current input - * character to the current attribute's value. - */ - appendStrBuf(c); - /* - * Stay in the attribute value (double-quoted) - * state. 
- */ - continue; - } - } - // FALLTHRU DON'T REORDER - case CONSUME_CHARACTER_REFERENCE: - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - if (c == '\u0000') { - break stateloop; - } - /* - * Unlike the definition is the spec, this state does not - * return a value and never requires the caller to - * backtrack. This state takes care of emitting characters - * or appending to the current attribute value. It also - * takes care of that in the case when consuming the - * character reference fails. - */ - /* - * This section defines how to consume a character - * reference. This definition is used when parsing character - * references in text and in attributes. - * - * The behavior depends on the identity of the next - * character (the one immediately after the U+0026 AMPERSAND - * character): - */ - switch (c) { - case ' ': - case '\t': - case '\n': - case '\r': // we'll reconsume! - case '\u000C': - case '<': - case '&': - emitOrAppendCharRefBuf(returnState); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos; - } - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - case '#': - /* - * U+0023 NUMBER SIGN (#) Consume the U+0023 NUMBER - * SIGN. - */ - appendCharRefBuf('#'); - state = transition(state, Tokenizer.CONSUME_NCR, reconsume, pos); - continue stateloop; - default: - if (c == additional) { - emitOrAppendCharRefBuf(returnState); - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } - if (c >= 'a' && c <= 'z') { - firstCharKey = c - 'a' + 26; - } else if (c >= 'A' && c <= 'Z') { - firstCharKey = c - 'A'; - } else { - // No match - /* - * If no match can be made, then this is a parse - * error. 
- */ - errNoNamedCharacterMatch(); - emitOrAppendCharRefBuf(returnState); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos; - } - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } - // Didn't fail yet - appendCharRefBuf(c); - state = transition(state, Tokenizer.CHARACTER_REFERENCE_HILO_LOOKUP, reconsume, pos); - // FALL THROUGH continue stateloop; - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case CHARACTER_REFERENCE_HILO_LOOKUP: - { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - if (c == '\u0000') { - break stateloop; - } - /* - * The data structure is as follows: - * - * HILO_ACCEL is a two-dimensional int array whose major - * index corresponds to the second character of the - * character reference (code point as index) and the - * minor index corresponds to the first character of the - * character reference (packed so that A-Z runs from 0 - * to 25 and a-z runs from 26 to 51). This layout makes - * it easier to use the sparseness of the data structure - * to omit parts of it: The second dimension of the - * table is null when no character reference starts with - * the character corresponding to that row. - * - * The int value HILO_ACCEL (by these indeces) is zero - * if there exists no character reference starting with - * that two-letter prefix. Otherwise, the value is an - * int that packs two shorts so that the higher short is - * the index of the highest character reference name - * with that prefix in NAMES and the lower short - * corresponds to the index of the lowest character - * reference name with that prefix. (It happens that the - * first two character reference names share their - * prefix so the packed int cannot be 0 by packing the - * two shorts.) - * - * NAMES is an array of byte arrays where each byte - * array encodes the name of a character references as - * ASCII. The names omit the first two letters of the - * name. 
(Since storing the first two letters would be - * redundant with the data contained in HILO_ACCEL.) The - * entries are lexically sorted. - * - * For a given index in NAMES, the same index in VALUES - * contains the corresponding expansion as an array of - * two UTF-16 code units (either the character and - * U+0000 or a suggogate pair). - */ - int hilo = 0; - if (c <= 'z') { - @Const @NoLength int[] row = NamedCharactersAccel.HILO_ACCEL[c]; - if (row != null) { - hilo = row[firstCharKey]; - } - } - if (hilo == 0) { - /* - * If no match can be made, then this is a parse - * error. - */ - errNoNamedCharacterMatch(); - emitOrAppendCharRefBuf(returnState); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos; - } - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } - // Didn't fail yet - appendCharRefBuf(c); - lo = hilo & 0xFFFF; - hi = hilo >> 16; - entCol = -1; - candidate = -1; - charRefBufMark = 0; - state = transition(state, Tokenizer.CHARACTER_REFERENCE_TAIL, reconsume, pos); - // FALL THROUGH continue stateloop; - } - case CHARACTER_REFERENCE_TAIL: - outer: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - if (c == '\u0000') { - break stateloop; - } - entCol++; - /* - * Consume the maximum number of characters possible, - * with the consumed characters matching one of the - * identifiers in the first column of the named - * character references table (in a case-sensitive - * manner). 
- */ - loloop: for (;;) { - if (hi < lo) { - break outer; - } - if (entCol == NamedCharacters.NAMES[lo].length()) { - candidate = lo; - charRefBufMark = charRefBufLen; - lo++; - } else if (entCol > NamedCharacters.NAMES[lo].length()) { - break outer; - } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) { - lo++; - } else { - break loloop; - } - } - - hiloop: for (;;) { - if (hi < lo) { - break outer; - } - if (entCol == NamedCharacters.NAMES[hi].length()) { - break hiloop; - } - if (entCol > NamedCharacters.NAMES[hi].length()) { - break outer; - } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) { - hi--; - } else { - break hiloop; - } - } - - if (c == ';') { - // If we see a semicolon, there cannot be a - // longer match. Break the loop. However, before - // breaking, take the longest match so far as the - // candidate, if we are just about to complete a - // match. - if (entCol + 1 == NamedCharacters.NAMES[lo].length()) { - candidate = lo; - charRefBufMark = charRefBufLen; - } - break outer; - } - - if (hi < lo) { - break outer; - } - appendCharRefBuf(c); - continue; - } - - if (candidate == -1) { - // reconsume deals with CR, LF or nul - /* - * If no match can be made, then this is a parse error. - */ - errNoNamedCharacterMatch(); - emitOrAppendCharRefBuf(returnState); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos; - } - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } else { - // c can't be CR, LF or nul if we got here - @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate]; - if (candidateName.length() == 0 - || candidateName.charAt(candidateName.length() - 1) != ';') { - /* - * If the last character matched is not a U+003B - * SEMICOLON (;), there is a parse error. 
- */ - if ((returnState & DATA_AND_RCDATA_MASK) != 0) { - /* - * If the entity is being consumed as part of an - * attribute, and the last character matched is - * not a U+003B SEMICOLON (;), - */ - char ch; - if (charRefBufMark == charRefBufLen) { - ch = c; - } else { - ch = charRefBuf[charRefBufMark]; - } - if (ch == '=' || (ch >= '0' && ch <= '9') - || (ch >= 'A' && ch <= 'Z') - || (ch >= 'a' && ch <= 'z')) { - /* - * and the next character is either a U+003D - * EQUALS SIGN character (=) or in the range - * U+0030 DIGIT ZERO to U+0039 DIGIT NINE, - * U+0041 LATIN CAPITAL LETTER A to U+005A - * LATIN CAPITAL LETTER Z, or U+0061 LATIN - * SMALL LETTER A to U+007A LATIN SMALL - * LETTER Z, then, for historical reasons, - * all the characters that were matched - * after the U+0026 AMPERSAND (&) must be - * unconsumed, and nothing is returned. - */ - errNoNamedCharacterMatch(); - appendCharRefBufToStrBuf(); - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } - } - if ((returnState & DATA_AND_RCDATA_MASK) != 0) { - errUnescapedAmpersandInterpretedAsCharacterReference(); - } else { - errNotSemicolonTerminated(); - } - } - - /* - * Otherwise, return a character token for the character - * corresponding to the entity name (as given by the - * second column of the named character references - * table). - */ - // CPPONLY: completedNamedCharacterReference(); - @Const @NoLength char[] val = NamedCharacters.VALUES[candidate]; - if ( - // [NOCPP[ - val.length == 1 - // ]NOCPP] - // CPPONLY: val[1] == 0 - ) { - emitOrAppendOne(val, returnState); - } else { - emitOrAppendTwo(val, returnState); - } - // this is so complicated! 
- if (charRefBufMark < charRefBufLen) { - if ((returnState & DATA_AND_RCDATA_MASK) != 0) { - appendStrBuf(charRefBuf, charRefBufMark, - charRefBufLen - charRefBufMark); - } else { - tokenHandler.characters(charRefBuf, charRefBufMark, - charRefBufLen - charRefBufMark); - } - } - // charRefBufLen will be zeroed below! - - // Check if we broke out early with c being the last - // character that matched as opposed to being the - // first one that didn't match. In the case of an - // early break, the next run on text should start - // *after* the current character and the current - // character shouldn't be reconsumed. - boolean earlyBreak = (c == ';' && charRefBufMark == charRefBufLen); - charRefBufLen = 0; - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = earlyBreak ? pos + 1 : pos; - } - reconsume = !earlyBreak; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - /* - * If the markup contains I'm ¬it; I tell you, the - * entity is parsed as "not", as in, I'm ¬it; I tell - * you. But if the markup was I'm ∉ I tell you, - * the entity would be parsed as "notin;", resulting in - * I'm ∉ I tell you. - */ - } - // XXX reorder point - case CONSUME_NCR: - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - value = 0; - seenDigits = false; - /* - * The behavior further depends on the character after the - * U+0023 NUMBER SIGN: - */ - switch (c) { - case 'x': - case 'X': - - /* - * U+0078 LATIN SMALL LETTER X U+0058 LATIN CAPITAL - * LETTER X Consume the X. - * - * Follow the steps below, but using the range of - * characters U+0030 DIGIT ZERO through to U+0039 - * DIGIT NINE, U+0061 LATIN SMALL LETTER A through - * to U+0066 LATIN SMALL LETTER F, and U+0041 LATIN - * CAPITAL LETTER A, through to U+0046 LATIN CAPITAL - * LETTER F (in other words, 0-9, A-F, a-f). - * - * When it comes to interpreting the number, - * interpret it as a hexadecimal number. 
- */ - appendCharRefBuf(c); - state = transition(state, Tokenizer.HEX_NCR_LOOP, reconsume, pos); - continue stateloop; - default: - /* - * Anything else Follow the steps below, but using - * the range of characters U+0030 DIGIT ZERO through - * to U+0039 DIGIT NINE (i.e. just 0-9). - * - * When it comes to interpreting the number, - * interpret it as a decimal number. - */ - reconsume = true; - state = transition(state, Tokenizer.DECIMAL_NRC_LOOP, reconsume, pos); - // FALL THROUGH continue stateloop; - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case DECIMAL_NRC_LOOP: - decimalloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume as many characters as match the range of - * characters given above. - */ - assert value >= 0: "value must not become negative."; - if (c >= '0' && c <= '9') { - seenDigits = true; - // Avoid overflow - if (value <= 0x10FFFF) { - value *= 10; - value += c - '0'; - } - continue; - } else if (c == ';') { - if (seenDigits) { - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos + 1; - } - state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); - // FALL THROUGH continue stateloop; - break decimalloop; - } else { - errNoDigitsInNCR(); - appendCharRefBuf(';'); - emitOrAppendCharRefBuf(returnState); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos + 1; - } - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } - } else { - /* - * If no characters match the range, then don't - * consume any characters (and unconsume the U+0023 - * NUMBER SIGN character and, if appropriate, the X - * character). This is a parse error; nothing is - * returned. - * - * Otherwise, if the next character is a U+003B - * SEMICOLON, consume that too. If it isn't, there - * is a parse error. 
- */ - if (!seenDigits) { - errNoDigitsInNCR(); - emitOrAppendCharRefBuf(returnState); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos; - } - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } else { - errCharRefLacksSemicolon(); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos; - } - reconsume = true; - state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); - // FALL THROUGH continue stateloop; - break decimalloop; - } - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case HANDLE_NCR_VALUE: - // WARNING previous state sets reconsume - // We are not going to emit the contents of charRefBuf. - charRefBufLen = 0; - // XXX inline this case if the method size can take it - handleNcrValue(returnState); - state = transition(state, returnState, reconsume, pos); - continue stateloop; - // XXX reorder point - case HEX_NCR_LOOP: - for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume as many characters as match the range of - * characters given above. 
- */ - assert value >= 0: "value must not become negative."; - if (c >= '0' && c <= '9') { - seenDigits = true; - // Avoid overflow - if (value <= 0x10FFFF) { - value *= 16; - value += c - '0'; - } - continue; - } else if (c >= 'A' && c <= 'F') { - seenDigits = true; - // Avoid overflow - if (value <= 0x10FFFF) { - value *= 16; - value += c - 'A' + 10; - } - continue; - } else if (c >= 'a' && c <= 'f') { - seenDigits = true; - // Avoid overflow - if (value <= 0x10FFFF) { - value *= 16; - value += c - 'a' + 10; - } - continue; - } else if (c == ';') { - if (seenDigits) { - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos + 1; - } - state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); - continue stateloop; - } else { - errNoDigitsInNCR(); - appendCharRefBuf(';'); - emitOrAppendCharRefBuf(returnState); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos + 1; - } - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } - } else { - /* - * If no characters match the range, then don't - * consume any characters (and unconsume the U+0023 - * NUMBER SIGN character and, if appropriate, the X - * character). This is a parse error; nothing is - * returned. - * - * Otherwise, if the next character is a U+003B - * SEMICOLON, consume that too. If it isn't, there - * is a parse error. 
- */ - if (!seenDigits) { - errNoDigitsInNCR(); - emitOrAppendCharRefBuf(returnState); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos; - } - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } else { - errCharRefLacksSemicolon(); - if ((returnState & DATA_AND_RCDATA_MASK) == 0) { - cstart = pos; - } - reconsume = true; - state = transition(state, Tokenizer.HANDLE_NCR_VALUE, reconsume, pos); - continue stateloop; - } - } - } - // XXX reorder point - case PLAINTEXT: - plaintextloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - switch (c) { - case '\u0000': - emitPlaintextReplacementCharacter(buf, pos); - continue; - case '\r': - emitCarriageReturn(buf, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Anything else Emit the current input - * character as a character token. Stay in the - * RAWTEXT state. - */ - continue; - } - } - // XXX reorder point - case CLOSE_TAG_OPEN: - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Otherwise, if the content model flag is set to the PCDATA - * state, or if the next few characters do match that tag - * name, consume the next input character: - */ - switch (c) { - case '>': - /* U+003E GREATER-THAN SIGN (>) Parse error. */ - errLtSlashGt(); - /* - * Switch to the data state. - */ - cstart = pos + 1; - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\r': - silentCarriageReturn(); - /* Anything else Parse error. */ - errGarbageAfterLtSlash(); - /* - * Switch to the bogus comment state. - */ - clearStrBufBeforeUse(); - appendStrBuf('\n'); - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - /* Anything else Parse error. */ - errGarbageAfterLtSlash(); - /* - * Switch to the bogus comment state. 
- */ - clearStrBufBeforeUse(); - appendStrBuf(c); - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - if (c >= 'A' && c <= 'Z') { - c += 0x20; - } - if (c >= 'a' && c <= 'z') { - /* - * U+0061 LATIN SMALL LETTER A through to U+007A - * LATIN SMALL LETTER Z Create a new end tag - * token, - */ - endTag = true; - /* - * set its tag name to the input character, - */ - clearStrBufBeforeUse(); - appendStrBuf(c); - /* - * then switch to the tag name state. (Don't - * emit the token yet; further details will be - * filled in before it is emitted.) - */ - state = transition(state, Tokenizer.TAG_NAME, reconsume, pos); - continue stateloop; - } else { - /* Anything else Parse error. */ - errGarbageAfterLtSlash(); - /* - * Switch to the bogus comment state. - */ - clearStrBufBeforeUse(); - appendStrBuf(c); - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - continue stateloop; - } - } - // XXX reorder point - case RCDATA: - rcdataloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - switch (c) { - case '&': - /* - * U+0026 AMPERSAND (&) Switch to the character - * reference in RCDATA state. - */ - flushChars(buf, pos); - assert charRefBufLen == 0: "charRefBufLen not reset after previous use!"; - appendCharRefBuf(c); - setAdditionalAndRememberAmpersandLocation('\u0000'); - returnState = state; - state = transition(state, Tokenizer.CONSUME_CHARACTER_REFERENCE, reconsume, pos); - continue stateloop; - case '<': - /* - * U+003C LESS-THAN SIGN (<) Switch to the - * RCDATA less-than sign state. 
- */ - flushChars(buf, pos); - - returnState = state; - state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); - continue stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - continue; - case '\r': - emitCarriageReturn(buf, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Emit the current input character as a - * character token. Stay in the RCDATA state. - */ - continue; - } - } - // XXX reorder point - case RAWTEXT: - rawtextloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - switch (c) { - case '<': - /* - * U+003C LESS-THAN SIGN (<) Switch to the - * RAWTEXT less-than sign state. - */ - flushChars(buf, pos); - - returnState = state; - state = transition(state, Tokenizer.RAWTEXT_RCDATA_LESS_THAN_SIGN, reconsume, pos); - break rawtextloop; - // FALL THRU continue stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - continue; - case '\r': - emitCarriageReturn(buf, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Emit the current input character as a - * character token. Stay in the RAWTEXT state. - */ - continue; - } - } - // XXX fallthru don't reorder - case RAWTEXT_RCDATA_LESS_THAN_SIGN: - rawtextrcdatalessthansignloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case '/': - /* - * U+002F SOLIDUS (/) Set the temporary buffer - * to the empty string. Switch to the script - * data end tag open state. 
- */ - index = 0; - clearStrBufBeforeUse(); - state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); - break rawtextrcdatalessthansignloop; - // FALL THRU continue stateloop; - default: - /* - * Otherwise, emit a U+003C LESS-THAN SIGN - * character token - */ - tokenHandler.characters(Tokenizer.LT_GT, 0, 1); - /* - * and reconsume the current input character in - * the data state. - */ - cstart = pos; - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } - } - // XXX fall thru. don't reorder. - case NON_DATA_END_TAG_NAME: - for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * ASSERT! when entering this state, set index to 0 and - * call clearStrBufBeforeUse() assert (contentModelElement != - * null); Let's implement the above without lookahead. - * strBuf is the 'temporary buffer'. - */ - if (index < endTagExpectationAsArray.length) { - char e = endTagExpectationAsArray[index]; - char folded = c; - if (c >= 'A' && c <= 'Z') { - folded += 0x20; - } - if (folded != e) { - // [NOCPP[ - errHtml4LtSlashInRcdata(folded); - // ]NOCPP] - tokenHandler.characters(Tokenizer.LT_SOLIDUS, - 0, 2); - emitStrBuf(); - cstart = pos; - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } - appendStrBuf(c); - index++; - continue; - } else { - endTag = true; - // XXX replace contentModelElement with different - // type - tagName = endTagExpectation; - switch (c) { - case '\r': - silentCarriageReturn(); - clearStrBufAfterUse(); // strBuf not used - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE - * FEED (LF) U+000C FORM FEED (FF) U+0020 - * SPACE If the current end tag token is an - * appropriate end tag token, then switch to - * the before 
attribute name state. - */ - clearStrBufAfterUse(); // strBuf not used - state = transition(state, Tokenizer.BEFORE_ATTRIBUTE_NAME, reconsume, pos); - continue stateloop; - case '/': - /* - * U+002F SOLIDUS (/) If the current end tag - * token is an appropriate end tag token, - * then switch to the self-closing start tag - * state. - */ - clearStrBufAfterUse(); // strBuf not used - state = transition(state, Tokenizer.SELF_CLOSING_START_TAG, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) If the - * current end tag token is an appropriate - * end tag token, then emit the current tag - * token and switch to the data state. - */ - clearStrBufAfterUse(); // strBuf not used - state = transition(state, emitCurrentTagToken(false, pos), reconsume, pos); - if (shouldSuspend) { - break stateloop; - } - continue stateloop; - default: - /* - * Emit a U+003C LESS-THAN SIGN character - * token, a U+002F SOLIDUS character token, - * a character token for each of the - * characters in the temporary buffer (in - * the order they were added to the buffer), - * and reconsume the current input character - * in the RAWTEXT state. - */ - // [NOCPP[ - errWarnLtSlashInRcdata(); - // ]NOCPP] - tokenHandler.characters( - Tokenizer.LT_SOLIDUS, 0, 2); - emitStrBuf(); - cstart = pos; // don't drop the - // character - reconsume = true; - state = transition(state, returnState, reconsume, pos); - continue stateloop; - } - } - } - // XXX reorder point - // BEGIN HOTSPOT WORKAROUND - case BOGUS_COMMENT: - boguscommentloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume every character up to and including the first - * U+003E GREATER-THAN SIGN character (>) or the end of - * the file (EOF), whichever comes first. 
Emit a comment - * token whose data is the concatenation of all the - * characters starting from and including the character - * that caused the state machine to switch into the - * bogus comment state, up to and including the - * character immediately before the last consumed - * character (i.e. up to the character just before the - * U+003E or EOF character). (If the comment was started - * by the end of the file (EOF), the token is empty.) - * - * Switch to the data state. - * - * If the end of the file was reached, reconsume the EOF - * character. - */ - switch (c) { - case '>': - emitComment(0, pos); - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '-': - appendStrBuf(c); - state = transition(state, Tokenizer.BOGUS_COMMENT_HYPHEN, reconsume, pos); - break boguscommentloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - appendStrBuf(c); - continue; - } - } - // FALLTHRU DON'T REORDER - case BOGUS_COMMENT_HYPHEN: - boguscommenthyphenloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case '>': - // [NOCPP[ - maybeAppendSpaceToBogusComment(); - // ]NOCPP] - emitComment(0, pos); - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '-': - appendSecondHyphenToBogusComment(); - continue boguscommenthyphenloop; - case '\r': - appendStrBufCarriageReturn(); - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - break stateloop; - case '\n': - appendStrBufLineFeed(); - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - appendStrBuf(c); - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - continue stateloop; - } - } - // XXX reorder point - case SCRIPT_DATA: - scriptdataloop: for 
(;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - switch (c) { - case '<': - /* - * U+003C LESS-THAN SIGN (<) Switch to the - * script data less-than sign state. - */ - flushChars(buf, pos); - returnState = state; - state = transition(state, Tokenizer.SCRIPT_DATA_LESS_THAN_SIGN, reconsume, pos); - break scriptdataloop; // FALL THRU continue - // stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - continue; - case '\r': - emitCarriageReturn(buf, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Anything else Emit the current input - * character as a character token. Stay in the - * script data state. - */ - continue; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_LESS_THAN_SIGN: - scriptdatalessthansignloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case '/': - /* - * U+002F SOLIDUS (/) Set the temporary buffer - * to the empty string. Switch to the script - * data end tag open state. - */ - index = 0; - clearStrBufBeforeUse(); - state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); - continue stateloop; - case '!': - tokenHandler.characters(Tokenizer.LT_GT, 0, 1); - cstart = pos; - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START, reconsume, pos); - break scriptdatalessthansignloop; // FALL THRU - // continue - // stateloop; - default: - /* - * Otherwise, emit a U+003C LESS-THAN SIGN - * character token - */ - tokenHandler.characters(Tokenizer.LT_GT, 0, 1); - /* - * and reconsume the current input character in - * the data state. 
- */ - cstart = pos; - reconsume = true; - state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_ESCAPE_START: - scriptdataescapestartloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Emit a U+002D - * HYPHEN-MINUS character token. Switch to the - * script data escape start dash state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPE_START_DASH, reconsume, pos); - break scriptdataescapestartloop; // FALL THRU - // continue - // stateloop; - default: - /* - * Anything else Reconsume the current input - * character in the script data state. - */ - reconsume = true; - state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_ESCAPE_START_DASH: - scriptdataescapestartdashloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Emit a U+002D - * HYPHEN-MINUS character token. Switch to the - * script data escaped dash dash state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); - break scriptdataescapestartdashloop; - // continue stateloop; - default: - /* - * Anything else Reconsume the current input - * character in the script data state. 
- */ - reconsume = true; - state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_ESCAPED_DASH_DASH: - scriptdataescapeddashdashloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Emit a U+002D - * HYPHEN-MINUS character token. Stay in the - * script data escaped dash dash state. - */ - continue; - case '<': - /* - * U+003C LESS-THAN SIGN (<) Switch to the - * script data escaped less-than sign state. - */ - flushChars(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit a U+003E - * GREATER-THAN SIGN character token. Switch to - * the script data state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); - continue stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - break scriptdataescapeddashdashloop; - case '\r': - emitCarriageReturn(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Anything else Emit the current input - * character as a character token. Switch to the - * script data escaped state. 
- */ - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - break scriptdataescapeddashdashloop; - // continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_ESCAPED: - scriptdataescapedloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Emit a U+002D - * HYPHEN-MINUS character token. Switch to the - * script data escaped dash state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH, reconsume, pos); - break scriptdataescapedloop; // FALL THRU - // continue - // stateloop; - case '<': - /* - * U+003C LESS-THAN SIGN (<) Switch to the - * script data escaped less-than sign state. - */ - flushChars(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); - continue stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - continue; - case '\r': - emitCarriageReturn(buf, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Anything else Emit the current input - * character as a character token. Stay in the - * script data escaped state. - */ - continue; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_ESCAPED_DASH: - scriptdataescapeddashloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Emit a U+002D - * HYPHEN-MINUS character token. Switch to the - * script data escaped dash dash state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_DASH_DASH, reconsume, pos); - continue stateloop; - case '<': - /* - * U+003C LESS-THAN SIGN (<) Switch to the - * script data escaped less-than sign state. 
- */ - flushChars(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN, reconsume, pos); - break scriptdataescapeddashloop; - // continue stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - continue stateloop; - case '\r': - emitCarriageReturn(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Anything else Emit the current input - * character as a character token. Switch to the - * script data escaped state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN: - scriptdataescapedlessthanloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '/': - /* - * U+002F SOLIDUS (/) Set the temporary buffer - * to the empty string. Switch to the script - * data escaped end tag open state. - */ - index = 0; - clearStrBufBeforeUse(); - returnState = Tokenizer.SCRIPT_DATA_ESCAPED; - state = transition(state, Tokenizer.NON_DATA_END_TAG_NAME, reconsume, pos); - continue stateloop; - case 'S': - case 's': - /* - * U+0041 LATIN CAPITAL LETTER A through to - * U+005A LATIN CAPITAL LETTER Z Emit a U+003C - * LESS-THAN SIGN character token and the - * current input character as a character token. - */ - tokenHandler.characters(Tokenizer.LT_GT, 0, 1); - cstart = pos; - index = 1; - /* - * Set the temporary buffer to the empty string. - * Append the lowercase version of the current - * input character (add 0x0020 to the - * character's code point) to the temporary - * buffer. Switch to the script data double - * escape start state. 
- */ - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_START, reconsume, pos); - break scriptdataescapedlessthanloop; - // continue stateloop; - default: - /* - * Anything else Emit a U+003C LESS-THAN SIGN - * character token and reconsume the current - * input character in the script data escaped - * state. - */ - tokenHandler.characters(Tokenizer.LT_GT, 0, 1); - cstart = pos; - reconsume = true; - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_DOUBLE_ESCAPE_START: - scriptdatadoubleescapestartloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - assert index > 0; - if (index < 6) { // SCRIPT_ARR.length - char folded = c; - if (c >= 'A' && c <= 'Z') { - folded += 0x20; - } - if (folded != Tokenizer.SCRIPT_ARR[index]) { - reconsume = true; - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - continue stateloop; - } - index++; - continue; - } - switch (c) { - case '\r': - emitCarriageReturn(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - case ' ': - case '\t': - case '\u000C': - case '/': - case '>': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN - * (>) Emit the current input character as a - * character token. If the temporary buffer is - * the string "script", then switch to the - * script data double escaped state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - break scriptdatadoubleescapestartloop; - // continue stateloop; - default: - /* - * Anything else Reconsume the current input - * character in the script data escaped state. 
- */ - reconsume = true; - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_DOUBLE_ESCAPED: - scriptdatadoubleescapedloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Emit a U+002D - * HYPHEN-MINUS character token. Switch to the - * script data double escaped dash state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH, reconsume, pos); - break scriptdatadoubleescapedloop; // FALL THRU - // continue - // stateloop; - case '<': - /* - * U+003C LESS-THAN SIGN (<) Emit a U+003C - * LESS-THAN SIGN character token. Switch to the - * script data double escaped less-than sign - * state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); - continue stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - continue; - case '\r': - emitCarriageReturn(buf, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Anything else Emit the current input - * character as a character token. Stay in the - * script data double escaped state. - */ - continue; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_DOUBLE_ESCAPED_DASH: - scriptdatadoubleescapeddashloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Emit a U+002D - * HYPHEN-MINUS character token. Switch to the - * script data double escaped dash dash state. 
- */ - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH, reconsume, pos); - break scriptdatadoubleescapeddashloop; - // continue stateloop; - case '<': - /* - * U+003C LESS-THAN SIGN (<) Emit a U+003C - * LESS-THAN SIGN character token. Switch to the - * script data double escaped less-than sign - * state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); - continue stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - continue stateloop; - case '\r': - emitCarriageReturn(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Anything else Emit the current input - * character as a character token. Switch to the - * script data double escaped state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_DOUBLE_ESCAPED_DASH_DASH: - scriptdatadoubleescapeddashdashloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '-': - /* - * U+002D HYPHEN-MINUS (-) Emit a U+002D - * HYPHEN-MINUS character token. Stay in the - * script data double escaped dash dash state. - */ - continue; - case '<': - /* - * U+003C LESS-THAN SIGN (<) Emit a U+003C - * LESS-THAN SIGN character token. Switch to the - * script data double escaped less-than sign - * state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN, reconsume, pos); - break scriptdatadoubleescapeddashdashloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit a U+003E - * GREATER-THAN SIGN character token. Switch to - * the script data state. 
- */ - state = transition(state, Tokenizer.SCRIPT_DATA, reconsume, pos); - continue stateloop; - case '\u0000': - emitReplacementCharacter(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - continue stateloop; - case '\r': - emitCarriageReturn(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - default: - /* - * Anything else Emit the current input - * character as a character token. Switch to the - * script data double escaped state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_DOUBLE_ESCAPED_LESS_THAN_SIGN: - scriptdatadoubleescapedlessthanloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '/': - /* - * U+002F SOLIDUS (/) Emit a U+002F SOLIDUS - * character token. Set the temporary buffer to - * the empty string. Switch to the script data - * double escape end state. - */ - index = 0; - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPE_END, reconsume, pos); - break scriptdatadoubleescapedlessthanloop; - default: - /* - * Anything else Reconsume the current input - * character in the script data double escaped - * state. 
- */ - reconsume = true; - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - continue stateloop; - } - } - // WARNING FALLTHRU CASE TRANSITION: DON'T REORDER - case SCRIPT_DATA_DOUBLE_ESCAPE_END: - scriptdatadoubleescapeendloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - if (index < 6) { // SCRIPT_ARR.length - char folded = c; - if (c >= 'A' && c <= 'Z') { - folded += 0x20; - } - if (folded != Tokenizer.SCRIPT_ARR[index]) { - reconsume = true; - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - continue stateloop; - } - index++; - continue; - } - switch (c) { - case '\r': - emitCarriageReturn(buf, pos); - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - case ' ': - case '\t': - case '\u000C': - case '/': - case '>': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * U+002F SOLIDUS (/) U+003E GREATER-THAN SIGN - * (>) Emit the current input character as a - * character token. If the temporary buffer is - * the string "script", then switch to the - * script data escaped state. - */ - state = transition(state, Tokenizer.SCRIPT_DATA_ESCAPED, reconsume, pos); - continue stateloop; - default: - /* - * Reconsume the current input character in the - * script data double escaped state. 
- */ - reconsume = true; - state = transition(state, Tokenizer.SCRIPT_DATA_DOUBLE_ESCAPED, reconsume, pos); - continue stateloop; - } - } - // XXX reorder point - case MARKUP_DECLARATION_OCTYPE: - markupdeclarationdoctypeloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - if (index < 6) { // OCTYPE.length - char folded = c; - if (c >= 'A' && c <= 'Z') { - folded += 0x20; - } - if (folded == Tokenizer.OCTYPE[index]) { - appendStrBuf(c); - } else { - errBogusComment(); - reconsume = true; - state = transition(state, Tokenizer.BOGUS_COMMENT, reconsume, pos); - continue stateloop; - } - index++; - continue; - } else { - reconsume = true; - state = transition(state, Tokenizer.DOCTYPE, reconsume, pos); - break markupdeclarationdoctypeloop; - // continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case DOCTYPE: - doctypeloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - initDoctypeFields(); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * Switch to the before DOCTYPE name state. - */ - state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); - break doctypeloop; - // continue stateloop; - default: - /* - * Anything else Parse error. - */ - errMissingSpaceBeforeDoctypeName(); - /* - * Reconsume the current character in the before - * DOCTYPE name state. 
- */ - reconsume = true; - state = transition(state, Tokenizer.BEFORE_DOCTYPE_NAME, reconsume, pos); - break doctypeloop; - // continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case BEFORE_DOCTYPE_NAME: - beforedoctypenameloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay - * in the before DOCTYPE name state. - */ - continue; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Parse error. - */ - errNamelessDoctype(); - /* - * Create a new DOCTYPE token. Set its - * force-quirks flag to on. - */ - forceQuirks = true; - /* - * Emit the token. - */ - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - if (c >= 'A' && c <= 'Z') { - /* - * U+0041 LATIN CAPITAL LETTER A through to - * U+005A LATIN CAPITAL LETTER Z Create a - * new DOCTYPE token. Set the token's name - * to the lowercase version of the input - * character (add 0x0020 to the character's - * code point). - */ - c += 0x20; - } - /* Anything else Create a new DOCTYPE token. */ - /* - * Set the token's name name to the current - * input character. - */ - clearStrBufBeforeUse(); - appendStrBuf(c); - /* - * Switch to the DOCTYPE name state. 
- */ - state = transition(state, Tokenizer.DOCTYPE_NAME, reconsume, pos); - break beforedoctypenameloop; - // continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case DOCTYPE_NAME: - doctypenameloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - strBufToDoctypeName(); - state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * Switch to the after DOCTYPE name state. - */ - strBufToDoctypeName(); - state = transition(state, Tokenizer.AFTER_DOCTYPE_NAME, reconsume, pos); - break doctypenameloop; - // continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * DOCTYPE token. - */ - strBufToDoctypeName(); - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * U+0041 LATIN CAPITAL LETTER A through to - * U+005A LATIN CAPITAL LETTER Z Append the - * lowercase version of the input character (add - * 0x0020 to the character's code point) to the - * current DOCTYPE token's name. - */ - if (c >= 'A' && c <= 'Z') { - c += 0x0020; - } - /* - * Anything else Append the current input - * character to the current DOCTYPE token's - * name. - */ - appendStrBuf(c); - /* - * Stay in the DOCTYPE name state. 
- */ - continue; - } - } - // FALLTHRU DON'T REORDER - case AFTER_DOCTYPE_NAME: - afterdoctypenameloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay - * in the after DOCTYPE name state. - */ - continue; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * DOCTYPE token. - */ - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case 'p': - case 'P': - index = 0; - state = transition(state, Tokenizer.DOCTYPE_UBLIC, reconsume, pos); - break afterdoctypenameloop; - // continue stateloop; - case 's': - case 'S': - index = 0; - state = transition(state, Tokenizer.DOCTYPE_YSTEM, reconsume, pos); - continue stateloop; - default: - /* - * Otherwise, this is the parse error. - */ - bogusDoctype(); - - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - // done by bogusDoctype(); - /* - * Switch to the bogus DOCTYPE state. - */ - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case DOCTYPE_UBLIC: - doctypeublicloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * If the six characters starting from the current input - * character are an ASCII case-insensitive match for the - * word "PUBLIC", then consume those characters and - * switch to the before DOCTYPE public identifier state. 
- */ - if (index < 5) { // UBLIC.length - char folded = c; - if (c >= 'A' && c <= 'Z') { - folded += 0x20; - } - if (folded != Tokenizer.UBLIC[index]) { - bogusDoctype(); - // forceQuirks = true; - reconsume = true; - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - continue stateloop; - } - index++; - continue; - } else { - reconsume = true; - state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_KEYWORD, reconsume, pos); - break doctypeublicloop; - // continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case AFTER_DOCTYPE_PUBLIC_KEYWORD: - afterdoctypepublickeywordloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * Switch to the before DOCTYPE public - * identifier state. - */ - state = transition(state, Tokenizer.BEFORE_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); - break afterdoctypepublickeywordloop; - // FALL THROUGH continue stateloop - case '"': - /* - * U+0022 QUOTATION MARK (") Parse Error. - */ - errNoSpaceBetweenDoctypePublicKeywordAndQuote(); - /* - * Set the DOCTYPE token's public identifier to - * the empty string (not missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE public identifier - * (double-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); - continue stateloop; - case '\'': - /* - * U+0027 APOSTROPHE (') Parse Error. 
- */ - errNoSpaceBetweenDoctypePublicKeywordAndQuote(); - /* - * Set the DOCTYPE token's public identifier to - * the empty string (not missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE public identifier - * (single-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); - continue stateloop; - case '>': - /* U+003E GREATER-THAN SIGN (>) Parse error. */ - errExpectedPublicId(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - forceQuirks = true; - /* - * Emit that DOCTYPE token. - */ - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - default: - bogusDoctype(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - // done by bogusDoctype(); - /* - * Switch to the bogus DOCTYPE state. - */ - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER: - beforedoctypepublicidentifierloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay - * in the before DOCTYPE public identifier - * state. - */ - continue; - case '"': - /* - * U+0022 QUOTATION MARK (") Set the DOCTYPE - * token's public identifier to the empty string - * (not missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE public identifier - * (double-quoted) state. 
- */ - state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); - break beforedoctypepublicidentifierloop; - // continue stateloop; - case '\'': - /* - * U+0027 APOSTROPHE (') Set the DOCTYPE token's - * public identifier to the empty string (not - * missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE public identifier - * (single-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); - continue stateloop; - case '>': - /* U+003E GREATER-THAN SIGN (>) Parse error. */ - errExpectedPublicId(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - forceQuirks = true; - /* - * Emit that DOCTYPE token. - */ - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - default: - bogusDoctype(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - // done by bogusDoctype(); - /* - * Switch to the bogus DOCTYPE state. - */ - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED: - doctypepublicidentifierdoublequotedloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '"': - /* - * U+0022 QUOTATION MARK (") Switch to the after - * DOCTYPE public identifier state. - */ - publicIdentifier = strBufToString(); - state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); - break doctypepublicidentifierdoublequotedloop; - // continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Parse error. - */ - errGtInPublicId(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - forceQuirks = true; - /* - * Emit that DOCTYPE token. 
- */ - publicIdentifier = strBufToString(); - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append the current input - * character to the current DOCTYPE token's - * public identifier. - */ - appendStrBuf(c); - /* - * Stay in the DOCTYPE public identifier - * (double-quoted) state. - */ - continue; - } - } - // FALLTHRU DON'T REORDER - case AFTER_DOCTYPE_PUBLIC_IDENTIFIER: - afterdoctypepublicidentifierloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * Switch to the between DOCTYPE public and - * system identifiers state. - */ - state = transition(state, Tokenizer.BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS, reconsume, pos); - break afterdoctypepublicidentifierloop; - // continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * DOCTYPE token. - */ - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '"': - /* - * U+0022 QUOTATION MARK (") Parse error. 
- */ - errNoSpaceBetweenPublicAndSystemIds(); - /* - * Set the DOCTYPE token's system identifier to - * the empty string (not missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE system identifier - * (double-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); - continue stateloop; - case '\'': - /* - * U+0027 APOSTROPHE (') Parse error. - */ - errNoSpaceBetweenPublicAndSystemIds(); - /* - * Set the DOCTYPE token's system identifier to - * the empty string (not missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE system identifier - * (single-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); - continue stateloop; - default: - bogusDoctype(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - // done by bogusDoctype(); - /* - * Switch to the bogus DOCTYPE state. - */ - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS: - betweendoctypepublicandsystemidentifiersloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay - * in the between DOCTYPE public and system - * identifiers state. - */ - continue; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * DOCTYPE token. - */ - emitDoctypeToken(pos); - /* - * Switch to the data state. 
- */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '"': - /* - * U+0022 QUOTATION MARK (") Set the DOCTYPE - * token's system identifier to the empty string - * (not missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE system identifier - * (double-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); - break betweendoctypepublicandsystemidentifiersloop; - // continue stateloop; - case '\'': - /* - * U+0027 APOSTROPHE (') Set the DOCTYPE token's - * system identifier to the empty string (not - * missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE system identifier - * (single-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); - continue stateloop; - default: - bogusDoctype(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - // done by bogusDoctype(); - /* - * Switch to the bogus DOCTYPE state. - */ - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED: - doctypesystemidentifierdoublequotedloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '"': - /* - * U+0022 QUOTATION MARK (") Switch to the after - * DOCTYPE system identifier state. - */ - systemIdentifier = strBufToString(); - state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); - continue stateloop; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Parse error. - */ - errGtInSystemId(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - forceQuirks = true; - /* - * Emit that DOCTYPE token. - */ - systemIdentifier = strBufToString(); - emitDoctypeToken(pos); - /* - * Switch to the data state. 
- */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append the current input - * character to the current DOCTYPE token's - * system identifier. - */ - appendStrBuf(c); - /* - * Stay in the DOCTYPE system identifier - * (double-quoted) state. - */ - continue; - } - } - // FALLTHRU DON'T REORDER - case AFTER_DOCTYPE_SYSTEM_IDENTIFIER: - afterdoctypesystemidentifierloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay - * in the after DOCTYPE system identifier state. - */ - continue; - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit the current - * DOCTYPE token. - */ - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - default: - /* - * Switch to the bogus DOCTYPE state. (This does - * not set the DOCTYPE token's force-quirks flag - * to on.) - */ - bogusDoctypeWithoutQuirks(); - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - break afterdoctypesystemidentifierloop; - // continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case BOGUS_DOCTYPE: - for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '>': - /* - * U+003E GREATER-THAN SIGN (>) Emit that - * DOCTYPE token. 
- */ - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - default: - /* - * Anything else Stay in the bogus DOCTYPE - * state. - */ - continue; - } - } - // XXX reorder point - case DOCTYPE_YSTEM: - doctypeystemloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Otherwise, if the six characters starting from the - * current input character are an ASCII case-insensitive - * match for the word "SYSTEM", then consume those - * characters and switch to the before DOCTYPE system - * identifier state. - */ - if (index < 5) { // YSTEM.length - char folded = c; - if (c >= 'A' && c <= 'Z') { - folded += 0x20; - } - if (folded != Tokenizer.YSTEM[index]) { - bogusDoctype(); - reconsume = true; - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - continue stateloop; - } - index++; - continue stateloop; - } else { - reconsume = true; - state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_KEYWORD, reconsume, pos); - break doctypeystemloop; - // continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case AFTER_DOCTYPE_SYSTEM_KEYWORD: - afterdoctypesystemkeywordloop: for (;;) { - if (reconsume) { - reconsume = false; - } else { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - } - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE - * Switch to the before DOCTYPE public - * identifier state. 
- */ - state = transition(state, Tokenizer.BEFORE_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); - break afterdoctypesystemkeywordloop; - // FALL THROUGH continue stateloop - case '"': - /* - * U+0022 QUOTATION MARK (") Parse Error. - */ - errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); - /* - * Set the DOCTYPE token's system identifier to - * the empty string (not missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE public identifier - * (double-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); - continue stateloop; - case '\'': - /* - * U+0027 APOSTROPHE (') Parse Error. - */ - errNoSpaceBetweenDoctypeSystemKeywordAndQuote(); - /* - * Set the DOCTYPE token's public identifier to - * the empty string (not missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE public identifier - * (single-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); - continue stateloop; - case '>': - /* U+003E GREATER-THAN SIGN (>) Parse error. */ - errExpectedPublicId(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - forceQuirks = true; - /* - * Emit that DOCTYPE token. - */ - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - default: - bogusDoctype(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - // done by bogusDoctype(); - /* - * Switch to the bogus DOCTYPE state. 
- */ - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER: - beforedoctypesystemidentifierloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\r': - silentCarriageReturn(); - break stateloop; - case '\n': - silentLineFeed(); - // fall thru - case ' ': - case '\t': - case '\u000C': - /* - * U+0009 CHARACTER TABULATION U+000A LINE FEED - * (LF) U+000C FORM FEED (FF) U+0020 SPACE Stay - * in the before DOCTYPE system identifier - * state. - */ - continue; - case '"': - /* - * U+0022 QUOTATION MARK (") Set the DOCTYPE - * token's system identifier to the empty string - * (not missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE system identifier - * (double-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED, reconsume, pos); - continue stateloop; - case '\'': - /* - * U+0027 APOSTROPHE (') Set the DOCTYPE token's - * system identifier to the empty string (not - * missing), - */ - clearStrBufBeforeUse(); - /* - * then switch to the DOCTYPE system identifier - * (single-quoted) state. - */ - state = transition(state, Tokenizer.DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED, reconsume, pos); - break beforedoctypesystemidentifierloop; - // continue stateloop; - case '>': - /* U+003E GREATER-THAN SIGN (>) Parse error. */ - errExpectedSystemId(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - forceQuirks = true; - /* - * Emit that DOCTYPE token. - */ - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - default: - bogusDoctype(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - // done by bogusDoctype(); - /* - * Switch to the bogus DOCTYPE state. 
- */ - state = transition(state, Tokenizer.BOGUS_DOCTYPE, reconsume, pos); - continue stateloop; - } - } - // FALLTHRU DON'T REORDER - case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED: - for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\'': - /* - * U+0027 APOSTROPHE (') Switch to the after - * DOCTYPE system identifier state. - */ - systemIdentifier = strBufToString(); - state = transition(state, Tokenizer.AFTER_DOCTYPE_SYSTEM_IDENTIFIER, reconsume, pos); - continue stateloop; - case '>': - errGtInSystemId(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - forceQuirks = true; - /* - * Emit that DOCTYPE token. - */ - systemIdentifier = strBufToString(); - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append the current input - * character to the current DOCTYPE token's - * system identifier. - */ - appendStrBuf(c); - /* - * Stay in the DOCTYPE system identifier - * (double-quoted) state. - */ - continue; - } - } - // XXX reorder point - case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED: - for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - /* - * Consume the next input character: - */ - switch (c) { - case '\'': - /* - * U+0027 APOSTROPHE (') Switch to the after - * DOCTYPE public identifier state. - */ - publicIdentifier = strBufToString(); - state = transition(state, Tokenizer.AFTER_DOCTYPE_PUBLIC_IDENTIFIER, reconsume, pos); - continue stateloop; - case '>': - errGtInPublicId(); - /* - * Set the DOCTYPE token's force-quirks flag to - * on. - */ - forceQuirks = true; - /* - * Emit that DOCTYPE token. 
- */ - publicIdentifier = strBufToString(); - emitDoctypeToken(pos); - /* - * Switch to the data state. - */ - state = transition(state, Tokenizer.DATA, reconsume, pos); - continue stateloop; - case '\r': - appendStrBufCarriageReturn(); - break stateloop; - case '\n': - appendStrBufLineFeed(); - continue; - case '\u0000': - c = '\uFFFD'; - // fall thru - default: - /* - * Anything else Append the current input - * character to the current DOCTYPE token's - * public identifier. - */ - appendStrBuf(c); - /* - * Stay in the DOCTYPE public identifier - * (single-quoted) state. - */ - continue; - } - } - // XXX reorder point - case PROCESSING_INSTRUCTION: - processinginstructionloop: for (;;) { - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case '?': - state = transition( - state, - Tokenizer.PROCESSING_INSTRUCTION_QUESTION_MARK, - reconsume, pos); - break processinginstructionloop; - // continue stateloop; - default: - continue; - } - } - case PROCESSING_INSTRUCTION_QUESTION_MARK: - if (++pos == endPos) { - break stateloop; - } - c = checkChar(buf, pos); - switch (c) { - case '>': - state = transition(state, Tokenizer.DATA, - reconsume, pos); - continue stateloop; - default: - state = transition(state, - Tokenizer.PROCESSING_INSTRUCTION, - reconsume, pos); - continue stateloop; - } - // END HOTSPOT WORKAROUND - } - } - flushChars(buf, pos); - /* - * if (prevCR && pos != endPos) { // why is this needed? pos--; col--; } - */ - // Save locals - stateSave = state; - returnStateSave = returnState; - return pos; - } - - // HOTSPOT WORKAROUND INSERTION POINT - - // [NOCPP[ - - protected int transition(int from, int to, boolean reconsume, int pos) throws SAXException { - return to; - } - - // ]NOCPP] - - private void initDoctypeFields() { - // Discard the characters "DOCTYPE" accumulated as a potential bogus - // comment into strBuf. 
clearStrBufAfterUse();
        // Reset the per-DOCTYPE fields: empty name, identifiers released and
        // cleared, force-quirks off.
        doctypeName = "";
        if (systemIdentifier != null) {
            Portability.releaseString(systemIdentifier);
            systemIdentifier = null;
        }
        if (publicIdentifier != null) {
            Portability.releaseString(publicIdentifier);
            publicIdentifier = null;
        }
        forceQuirks = false;
    }

    /**
     * Counts a carriage return via {@link #silentCarriageReturn()} and hands a
     * line feed ({@code '\n'}, not {@code '\r'}) to
     * {@code adjustDoubleHyphenAndAppendToStrBufAndErr}.
     *
     * @throws SAXException
     *             propagated from the append helper
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufCarriageReturn()
            throws SAXException {
        silentCarriageReturn();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
    }

    /**
     * Counts a line feed via {@link #silentLineFeed()} and hands {@code '\n'}
     * to {@code adjustDoubleHyphenAndAppendToStrBufAndErr}.
     *
     * @throws SAXException
     *             propagated from the append helper
     */
    @Inline private void adjustDoubleHyphenAndAppendToStrBufLineFeed()
            throws SAXException {
        silentLineFeed();
        adjustDoubleHyphenAndAppendToStrBufAndErr('\n');
    }

    /**
     * Counts a line feed and appends {@code '\n'} to the string buffer.
     */
    @Inline private void appendStrBufLineFeed() {
        silentLineFeed();
        appendStrBuf('\n');
    }

    /**
     * Counts a carriage return and appends {@code '\n'} to the string buffer,
     * i.e. the CR is stored as a line feed.
     */
    @Inline private void appendStrBufCarriageReturn() {
        silentCarriageReturn();
        appendStrBuf('\n');
    }

    /**
     * Records a carriage return without emitting anything: increments
     * {@code line} and sets {@code lastCR}.
     */
    @Inline protected void silentCarriageReturn() {
        ++line;
        // NOTE(review): presumably lets the caller skip the LF of a CRLF pair
        // so the line is not counted twice -- confirm where lastCR is read.
        lastCR = true;
    }

    /**
     * Records a line feed without emitting anything: increments {@code line}.
     */
    @Inline protected void silentLineFeed() {
        ++line;
    }

    /**
     * Counts a carriage return, flushes the characters pending before
     * {@code pos}, and reports one character from {@code Tokenizer.LF} to the
     * token handler in place of the CR.
     *
     * @param buf
     *            the input buffer, passed through to {@code flushChars}
     * @param pos
     *            the current position in {@code buf}
     * @throws SAXException
     *             propagated from flushing or from the token handler
     */
    private void emitCarriageReturn(@NoLength char[] buf, int pos)
            throws SAXException {
        silentCarriageReturn();
        flushChars(buf, pos);
        tokenHandler.characters(Tokenizer.LF, 0, 1);
        // NOTE(review): MAX_VALUE presumably means "no pending character run
        // yet"; the caller is expected to reset cstart -- confirm.
        cstart = Integer.MAX_VALUE;
    }

    /**
     * Flushes the characters pending before {@code pos}, asks the token
     * handler for a zero-originating replacement character, and moves
     * {@code cstart} to the position after {@code pos}.
     *
     * @param buf
     *            the input buffer, passed through to {@code flushChars}
     * @param pos
     *            the position of the character being replaced
     * @throws SAXException
     *             propagated from flushing or from the token handler
     */
    private void emitReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.zeroOriginatingReplacementCharacter();
        cstart = pos + 1;
    }

    /**
     * Like {@link #emitReplacementCharacter(char[], int)}, but reports the
     * replacement through {@code tokenHandler.characters} using one character
     * of {@code REPLACEMENT_CHARACTER}.
     *
     * @param buf
     *            the input buffer, passed through to {@code flushChars}
     * @param pos
     *            the position of the character being replaced
     * @throws SAXException
     *             propagated from flushing or from the token handler
     */
    private void emitPlaintextReplacementCharacter(@NoLength char[] buf, int pos)
            throws SAXException {
        flushChars(buf, pos);
        tokenHandler.characters(REPLACEMENT_CHARACTER, 0, 1);
        cstart = pos + 1;
    }

    /**
     * Stores {@code add} in {@code additional} and, in the region marked
     * NOCPP, snapshots the current location into {@code ampersandLocation}.
     *
     * @param add
     *            the value to store in {@code additional}
     */
    private void setAdditionalAndRememberAmpersandLocation(char add) {
        additional = add;
        // [NOCPP[
        ampersandLocation = new LocatorImpl(this);
        // ]NOCPP]
    }

    /**
     * Reports a bogus DOCTYPE and turns the force-quirks flag on.
     *
     * @throws SAXException
     *             propagated from the error reporter
     */
    private void bogusDoctype() throws SAXException {
        errBogusDoctype();
        forceQuirks = true;
    }

    /**
     * Reports a bogus DOCTYPE and sets the force-quirks flag to
     * {@code false}.
     *
     * @throws SAXException
     *             propagated from the error reporter
     */
    private void bogusDoctypeWithoutQuirks() throws SAXException {
        errBogusDoctype();
        forceQuirks =
false;
    }

    /**
     * Turns the numeric character reference accumulated in {@code value} into
     * output and reports the applicable errors.
     *
     * <ul>
     * <li>{@code value <= 0xFFFF}: 0x80-0x9F is remapped through
     * {@code NamedCharacters.WINDOWS_1252}; zero and surrogate code points
     * become {@code Tokenizer.REPLACEMENT_CHARACTER}; everything else is
     * emitted as a single {@code char}.</li>
     * <li>{@code value <= 0x10FFFF}: emitted as two {@code char}s built in
     * {@code astralChar}.</li>
     * <li>otherwise: out of range, replaced by
     * {@code Tokenizer.REPLACEMENT_CHARACTER}.</li>
     * </ul>
     *
     * @param returnState
     *            the state the character reference was entered from; passed
     *            to {@code emitOrAppendOne}/{@code emitOrAppendTwo}, which
     *            use it to choose between appending to the string buffer and
     *            emitting characters
     * @throws SAXException
     *             propagated from error reporting or from the token handler
     */
    private void handleNcrValue(int returnState) throws SAXException {
        /*
         * If one or more characters match the range, then take them all and
         * interpret the string of characters as a number (either hexadecimal or
         * decimal as appropriate).
         */
        if (value <= 0xFFFF) {
            if (value >= 0x80 && value <= 0x9f) {
                /*
                 * If that number is one of the numbers in the first column of
                 * the following table, then this is a parse error.
                 */
                errNcrInC1Range();
                /*
                 * Find the row with that number in the first column, and return
                 * a character token for the Unicode character given in the
                 * second column of that row.
                 */
                @NoLength char[] val = NamedCharacters.WINDOWS_1252[value - 0x80];
                emitOrAppendOne(val, returnState);
                // [NOCPP[
            } else if (value == 0xC
                    && contentSpacePolicy != XmlViolationPolicy.ALLOW) {
                // Form feed under a non-ALLOW policy: replaced by a space,
                // fatal, or (for any other policy value) nothing is emitted.
                if (contentSpacePolicy == XmlViolationPolicy.ALTER_INFOSET) {
                    emitOrAppendOne(Tokenizer.SPACE, returnState);
                } else if (contentSpacePolicy == XmlViolationPolicy.FATAL) {
                    fatal("A character reference expanded to a form feed which is not legal XML 1.0 white space.");
                }
                // ]NOCPP]
            } else if (value == 0x0) {
                errNcrZero();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else if ((value & 0xF800) == 0xD800) {
                // 0xD800-0xDFFF: a lone surrogate code point.
                errNcrSurrogate();
                emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
            } else {
                /*
                 * Otherwise, return a character token for the Unicode character
                 * whose code point is that number.
                 */
                char ch = (char) value;
                // [NOCPP[
                if (value == 0x0D) {
                    errNcrCr();
                } else if ((value <= 0x0008) || (value == 0x000B)
                        || (value >= 0x000E && value <= 0x001F)) {
                    ch = errNcrControlChar(ch);
                } else if (value >= 0xFDD0 && value <= 0xFDEF) {
                    errNcrUnassigned();
                } else if ((value & 0xFFFE) == 0xFFFE) {
                    ch = errNcrNonCharacter(ch);
                } else if (value >= 0x007F && value <= 0x009F) {
                    // Only 0x7F can reach this branch: 0x80-0x9F was handled
                    // by the WINDOWS_1252 branch above.
                    errNcrControlChar();
                } else {
                    maybeWarnPrivateUse(ch);
                }
                // ]NOCPP]
                bmpChar[0] = ch;
                emitOrAppendOne(bmpChar, returnState);
            }
        } else if (value <= 0x10FFFF) {
            // [NOCPP[
            maybeWarnPrivateUseAstral();
            if ((value & 0xFFFE) == 0xFFFE) {
                errAstralNonCharacter(value);
            }
            // ]NOCPP]
            // Split the code point into two chars; the second is the low
            // surrogate (0xDC00 plus the low ten bits).
            // NOTE(review): LEAD_OFFSET is presumably 0xD800 - (0x10000 >> 10)
            // so the first char is the high surrogate -- confirm.
            astralChar[0] = (char) (Tokenizer.LEAD_OFFSET + (value >> 10));
            astralChar[1] = (char) (0xDC00 + (value & 0x3FF));
            emitOrAppendTwo(astralChar, returnState);
        } else {
            errNcrOutOfRange();
            emitOrAppendOne(Tokenizer.REPLACEMENT_CHARACTER, returnState);
        }
    }

    /**
     * Handles end of input. Starting from local copies of {@code stateSave}
     * and {@code returnStateSave}, performs the end-of-file action of the
     * current state (reporting errors and emitting any pending characters,
     * comment or DOCTYPE), then calls {@code tokenHandler.eof()}.
     *
     * <p>The character reference states finish the pending reference, set
     * {@code state} to {@code returnState} and loop again so that the return
     * state's own end-of-file action runs; every other state leaves the loop.
     *
     * @throws SAXException
     *             propagated from error reporting or from the token handler
     */
    public void eof() throws SAXException {
        int state = stateSave;
        int returnState = returnStateSave;

        eofloop: for (;;) {
            switch (state) {
                case SCRIPT_DATA_LESS_THAN_SIGN:
                case SCRIPT_DATA_ESCAPED_LESS_THAN_SIGN:
                    /*
                     * Otherwise, emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the data
                     * state.
                     */
                    break eofloop;
                case TAG_OPEN:
                    /*
                     * The behavior of this state depends on the content model
                     * flag.
                     */
                    /*
                     * Anything else Parse error.
                     */
                    errEofAfterLt();
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the data
                     * state.
                     */
                    break eofloop;
                case RAWTEXT_RCDATA_LESS_THAN_SIGN:
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token
                     */
                    tokenHandler.characters(Tokenizer.LT_GT, 0, 1);
                    /*
                     * and reconsume the current input character in the RCDATA
                     * state.
                     */
                    break eofloop;
                case NON_DATA_END_TAG_NAME:
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token, a U+002F
                     * SOLIDUS character token,
                     */
                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
                    /*
                     * a character token for each of the characters in the
                     * temporary buffer (in the order they were added to the
                     * buffer),
                     */
                    emitStrBuf();
                    /*
                     * and reconsume the current input character in the RCDATA
                     * state.
                     */
                    break eofloop;
                case CLOSE_TAG_OPEN:
                    /* EOF Parse error. */
                    errEofAfterLt();
                    /*
                     * Emit a U+003C LESS-THAN SIGN character token and a U+002F
                     * SOLIDUS character token.
                     */
                    tokenHandler.characters(Tokenizer.LT_SOLIDUS, 0, 2);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case TAG_NAME:
                    /*
                     * EOF Parse error.
                     */
                    errEofInTagName();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case BEFORE_ATTRIBUTE_NAME:
                case AFTER_ATTRIBUTE_VALUE_QUOTED:
                case SELF_CLOSING_START_TAG:
                    /* EOF Parse error. */
                    errEofWithoutGt();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case ATTRIBUTE_NAME:
                    /*
                     * EOF Parse error.
                     */
                    errEofInAttributeName();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case AFTER_ATTRIBUTE_NAME:
                case BEFORE_ATTRIBUTE_VALUE:
                    /* EOF Parse error. */
                    errEofWithoutGt();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case ATTRIBUTE_VALUE_DOUBLE_QUOTED:
                case ATTRIBUTE_VALUE_SINGLE_QUOTED:
                case ATTRIBUTE_VALUE_UNQUOTED:
                    /* EOF Parse error. */
                    errEofInAttributeValue();
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case BOGUS_COMMENT:
                    emitComment(0, 0);
                    break eofloop;
                case BOGUS_COMMENT_HYPHEN:
                    // [NOCPP[
                    maybeAppendSpaceToBogusComment();
                    // ]NOCPP]
                    emitComment(0, 0);
                    break eofloop;
                case MARKUP_DECLARATION_OPEN:
                    errBogusComment();
                    emitComment(0, 0);
                    break eofloop;
                case MARKUP_DECLARATION_HYPHEN:
                    errBogusComment();
                    emitComment(0, 0);
                    break eofloop;
                case MARKUP_DECLARATION_OCTYPE:
                    // index < 6: the keyword was not fully matched, so the
                    // input is emitted as a bogus comment; otherwise an
                    // empty, force-quirks DOCTYPE is emitted.
                    if (index < 6) {
                        errBogusComment();
                        emitComment(0, 0);
                    } else {
                        /* EOF Parse error. */
                        errEofInDoctype();
                        /*
                         * Create a new DOCTYPE token. Set its force-quirks flag
                         * to on.
                         */
                        doctypeName = "";
                        if (systemIdentifier != null) {
                            Portability.releaseString(systemIdentifier);
                            systemIdentifier = null;
                        }
                        if (publicIdentifier != null) {
                            Portability.releaseString(publicIdentifier);
                            publicIdentifier = null;
                        }
                        forceQuirks = true;
                        /*
                         * Emit the token.
                         */
                        emitDoctypeToken(0);
                        /*
                         * Reconsume the EOF character in the data state.
                         */
                        break eofloop;
                    }
                    break eofloop;
                case COMMENT_START:
                case COMMENT:
                    /*
                     * EOF Parse error.
                     */
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(0, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case COMMENT_END:
                    errEofInComment();
                    /* Emit the comment token. */
                    // NOTE(review): the first argument (2 here, 1 and 3 in
                    // the cases below) presumably is the number of trailing
                    // buffered characters ("--", "-", "--!") to leave out of
                    // the comment -- confirm against emitComment.
                    emitComment(2, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case COMMENT_END_DASH:
                case COMMENT_START_DASH:
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(1, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case COMMENT_END_BANG:
                    errEofInComment();
                    /* Emit the comment token. */
                    emitComment(3, 0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE:
                case BEFORE_DOCTYPE_NAME:
                    errEofInDoctype();
                    /*
                     * Create a new DOCTYPE token. Set its force-quirks flag to
                     * on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit the token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_NAME:
                    errEofInDoctype();
                    strBufToDoctypeName();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_UBLIC:
                case DOCTYPE_YSTEM:
                case AFTER_DOCTYPE_NAME:
                case AFTER_DOCTYPE_PUBLIC_KEYWORD:
                case AFTER_DOCTYPE_SYSTEM_KEYWORD:
                case BEFORE_DOCTYPE_PUBLIC_IDENTIFIER:
                    errEofInDoctype();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_PUBLIC_IDENTIFIER_DOUBLE_QUOTED:
                case DOCTYPE_PUBLIC_IDENTIFIER_SINGLE_QUOTED:
                    /* EOF Parse error. */
                    errEofInPublicId();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    publicIdentifier = strBufToString();
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case AFTER_DOCTYPE_PUBLIC_IDENTIFIER:
                case BEFORE_DOCTYPE_SYSTEM_IDENTIFIER:
                case BETWEEN_DOCTYPE_PUBLIC_AND_SYSTEM_IDENTIFIERS:
                    errEofInDoctype();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case DOCTYPE_SYSTEM_IDENTIFIER_DOUBLE_QUOTED:
                case DOCTYPE_SYSTEM_IDENTIFIER_SINGLE_QUOTED:
                    /* EOF Parse error. */
                    errEofInSystemId();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    systemIdentifier = strBufToString();
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case AFTER_DOCTYPE_SYSTEM_IDENTIFIER:
                    errEofInDoctype();
                    /*
                     * Set the DOCTYPE token's force-quirks flag to on.
                     */
                    forceQuirks = true;
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case BOGUS_DOCTYPE:
                    /*
                     * Emit that DOCTYPE token.
                     */
                    emitDoctypeToken(0);
                    /*
                     * Reconsume the EOF character in the data state.
                     */
                    break eofloop;
                case CONSUME_CHARACTER_REFERENCE:
                    /*
                     * Unlike the definition in the spec, this state does not
                     * return a value and never requires the caller to
                     * backtrack. This state takes care of emitting characters
                     * or appending to the current attribute value. It also
                     * takes care of that in the case when consuming the entity
                     * fails.
                     */
                    /*
                     * This section defines how to consume an entity. This
                     * definition is used when parsing entities in text and in
                     * attributes.
                     *
                     * The behavior depends on the identity of the next
                     * character (the one immediately after the U+0026 AMPERSAND
                     * character):
                     */

                    // Output what was buffered for the reference, then
                    // re-dispatch on the return state.
                    emitOrAppendCharRefBuf(returnState);
                    state = returnState;
                    continue;
                case CHARACTER_REFERENCE_HILO_LOOKUP:
                    errNoNamedCharacterMatch();
                    emitOrAppendCharRefBuf(returnState);
                    state = returnState;
                    continue;
                case CHARACTER_REFERENCE_TAIL:
                    outer: for (;;) {
                        // c is never reassigned in this loop: it stays
                        // U+0000 for every comparison below.
                        // NOTE(review): with c fixed at U+0000 the two loops
                        // appear to accept only names that end exactly at
                        // entCol -- confirm.
                        char c = '\u0000';
                        entCol++;
                        /*
                         * Consume the maximum number of characters possible,
                         * with the consumed characters matching one of the
                         * identifiers in the first column of the named
                         * character references table (in a case-sensitive
                         * manner).
                         */
                        hiloop: for (;;) {
                            if (hi == -1) {
                                break hiloop;
                            }
                            if (entCol == NamedCharacters.NAMES[hi].length()) {
                                break hiloop;
                            }
                            if (entCol > NamedCharacters.NAMES[hi].length()) {
                                break outer;
                            } else if (c < NamedCharacters.NAMES[hi].charAt(entCol)) {
                                hi--;
                            } else {
                                break hiloop;
                            }
                        }

                        loloop: for (;;) {
                            if (hi < lo) {
                                break outer;
                            }
                            if (entCol == NamedCharacters.NAMES[lo].length()) {
                                // A complete name: remember it and how much
                                // of charRefBuf it covers.
                                candidate = lo;
                                charRefBufMark = charRefBufLen;
                                lo++;
                            } else if (entCol > NamedCharacters.NAMES[lo].length()) {
                                break outer;
                            } else if (c > NamedCharacters.NAMES[lo].charAt(entCol)) {
                                lo++;
                            } else {
                                break loloop;
                            }
                        }
                        if (hi < lo) {
                            break outer;
                        }
                        continue;
                    }

                    if (candidate == -1) {
                        /*
                         * If no match can be made, then this is a parse error.
                         */
                        errNoNamedCharacterMatch();
                        emitOrAppendCharRefBuf(returnState);
                        state = returnState;
                        continue eofloop;
                    } else {
                        @Const @CharacterName String candidateName = NamedCharacters.NAMES[candidate];
                        if (candidateName.length() == 0
                                || candidateName.charAt(candidateName.length() - 1) != ';') {
                            /*
                             * If the last character matched is not a U+003B
                             * SEMICOLON (;), there is a parse error.
                             */
                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
                                /*
                                 * If the entity is being consumed as part of an
                                 * attribute, and the last character matched is
                                 * not a U+003B SEMICOLON (;),
                                 */
                                char ch;
                                if (charRefBufMark == charRefBufLen) {
                                    ch = '\u0000';
                                } else {
                                    ch = charRefBuf[charRefBufMark];
                                }
                                if ((ch >= '0' && ch <= '9')
                                        || (ch >= 'A' && ch <= 'Z')
                                        || (ch >= 'a' && ch <= 'z')) {
                                    /*
                                     * and the next character is in the range
                                     * U+0030 DIGIT ZERO to U+0039 DIGIT NINE,
                                     * U+0041 LATIN CAPITAL LETTER A to U+005A
                                     * LATIN CAPITAL LETTER Z, or U+0061 LATIN
                                     * SMALL LETTER A to U+007A LATIN SMALL
                                     * LETTER Z, then, for historical reasons,
                                     * all the characters that were matched
                                     * after the U+0026 AMPERSAND (&) must be
                                     * unconsumed, and nothing is returned.
                                     */
                                    errNoNamedCharacterMatch();
                                    appendCharRefBufToStrBuf();
                                    state = returnState;
                                    continue eofloop;
                                }
                            }
                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
                                errUnescapedAmpersandInterpretedAsCharacterReference();
                            } else {
                                errNotSemicolonTerminated();
                            }
                        }

                        /*
                         * Otherwise, return a character token for the character
                         * corresponding to the entity name (as given by the
                         * second column of the named character references
                         * table).
                         */
                        @Const @NoLength char[] val = NamedCharacters.VALUES[candidate];
                        if (
                        // [NOCPP[
                        val.length == 1
                        // ]NOCPP]
                        // CPPONLY: val[1] == 0
                        ) {
                            emitOrAppendOne(val, returnState);
                        } else {
                            emitOrAppendTwo(val, returnState);
                        }
                        // this is so complicated!
                        // Characters buffered after the matched name are
                        // passed through unchanged.
                        if (charRefBufMark < charRefBufLen) {
                            if ((returnState & DATA_AND_RCDATA_MASK) != 0) {
                                appendStrBuf(charRefBuf, charRefBufMark,
                                        charRefBufLen - charRefBufMark);
                            } else {
                                tokenHandler.characters(charRefBuf, charRefBufMark,
                                        charRefBufLen - charRefBufMark);
                            }
                        }
                        charRefBufLen = 0;
                        state = returnState;
                        continue eofloop;
                        /*
                         * If the markup contains I'm &notit; I tell you, the
                         * entity is parsed as "not", as in, I'm ¬it; I tell
                         * you. But if the markup was I'm &notin; I tell you,
                         * the entity would be parsed as "notin;", resulting in
                         * I'm ∉ I tell you.
                         */
                    }
                case CONSUME_NCR:
                case DECIMAL_NRC_LOOP:
                case HEX_NCR_LOOP:
                    /*
                     * If no characters match the range, then don't consume any
                     * characters (and unconsume the U+0023 NUMBER SIGN
                     * character and, if appropriate, the X character). This is
                     * a parse error; nothing is returned.
                     *
                     * Otherwise, if the next character is a U+003B SEMICOLON,
                     * consume that too. If it isn't, there is a parse error.
                     */
                    if (!seenDigits) {
                        errNoDigitsInNCR();
                        emitOrAppendCharRefBuf(returnState);
                        state = returnState;
                        continue;
                    } else {
                        errCharRefLacksSemicolon();
                    }
                    // WARNING previous state sets reconsume
                    handleNcrValue(returnState);
                    state = returnState;
                    continue;
                case CDATA_RSQB:
                    // One pending ']' is emitted as text.
                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 1);
                    break eofloop;
                case CDATA_RSQB_RSQB:
                    // Two pending ']' are emitted as text.
                    tokenHandler.characters(Tokenizer.RSQB_RSQB, 0, 2);
                    break eofloop;
                case DATA:
                default:
                    break eofloop;
            }
        }
        // case DATA:
        /*
         * EOF Emit an end-of-file token.
         */
        tokenHandler.eof();
        return;
    }

    /**
     * Reports the current DOCTYPE ({@code doctypeName},
     * {@code publicIdentifier}, {@code systemIdentifier},
     * {@code forceQuirks}) to the token handler, sets {@code cstart} to
     * {@code pos + 1}, and then clears the DOCTYPE fields.
     *
     * @param pos
     *            the position of the character that ended the DOCTYPE; the
     *            end-of-file handling passes 0
     * @throws SAXException
     *             propagated from the token handler
     */
    private void emitDoctypeToken(int pos) throws SAXException {
        cstart = pos + 1;
        tokenHandler.doctype(doctypeName, publicIdentifier, systemIdentifier,
                forceQuirks);
        // It is OK and sufficient to release these here, since
        // there's no way out of the doctype states than through paths
        // that call this method.
- doctypeName = null; - Portability.releaseString(publicIdentifier); - publicIdentifier = null; - Portability.releaseString(systemIdentifier); - systemIdentifier = null; - } - - @Inline protected char checkChar(@NoLength char[] buf, int pos) - throws SAXException { - return buf[pos]; - } - - public boolean internalEncodingDeclaration(String internalCharset) - throws SAXException { - if (encodingDeclarationHandler != null) { - return encodingDeclarationHandler.internalEncodingDeclaration(internalCharset); - } - return false; - } - - /** - * @param val - * @throws SAXException - */ - private void emitOrAppendTwo(@Const @NoLength char[] val, int returnState) - throws SAXException { - if ((returnState & DATA_AND_RCDATA_MASK) != 0) { - appendStrBuf(val[0]); - appendStrBuf(val[1]); - } else { - tokenHandler.characters(val, 0, 2); - } - } - - private void emitOrAppendOne(@Const @NoLength char[] val, int returnState) - throws SAXException { - if ((returnState & DATA_AND_RCDATA_MASK) != 0) { - appendStrBuf(val[0]); - } else { - tokenHandler.characters(val, 0, 1); - } - } - - public void end() throws SAXException { - strBuf = null; - doctypeName = null; - if (systemIdentifier != null) { - Portability.releaseString(systemIdentifier); - systemIdentifier = null; - } - if (publicIdentifier != null) { - Portability.releaseString(publicIdentifier); - publicIdentifier = null; - } - if (tagName != null) { - tagName.release(); - tagName = null; - } - if (attributeName != null) { - attributeName.release(); - attributeName = null; - } - tokenHandler.endTokenization(); - if (attributes != null) { - // [NOCPP[ - attributes = null; - // ]NOCPP] - // CPPONLY: attributes.clear(mappingLangToXmlLang); - } - } - - public void requestSuspension() { - shouldSuspend = true; - } - - // [NOCPP[ - - public void becomeConfident() { - confident = true; - } - - /** - * Returns the nextCharOnNewLine. 
- * - * @return the nextCharOnNewLine - */ - public boolean isNextCharOnNewLine() { - return false; - } - - public boolean isPrevCR() { - return lastCR; - } - - /** - * Returns the line. - * - * @return the line - */ - public int getLine() { - return -1; - } - - /** - * Returns the col. - * - * @return the col - */ - public int getCol() { - return -1; - } - - // ]NOCPP] - - public boolean isInDataState() { - return (stateSave == DATA); - } - - public void resetToDataState() { - clearStrBufAfterUse(); - charRefBufLen = 0; - stateSave = Tokenizer.DATA; - // line = 1; XXX line numbers - lastCR = false; - index = 0; - forceQuirks = false; - additional = '\u0000'; - entCol = -1; - firstCharKey = -1; - lo = 0; - hi = 0; // will always be overwritten before use anyway - candidate = -1; - charRefBufMark = 0; - value = 0; - seenDigits = false; - endTag = false; - shouldSuspend = false; - initDoctypeFields(); - if (tagName != null) { - tagName.release(); - tagName = null; - } - if (attributeName != null) { - attributeName.release(); - attributeName = null; - } - if (newAttributesEachTime) { - if (attributes != null) { - Portability.delete(attributes); - attributes = null; - } - } - } - - public void loadState(Tokenizer other) throws SAXException { - strBufLen = other.strBufLen; - if (strBufLen > strBuf.length) { - strBuf = new char[strBufLen]; - } - System.arraycopy(other.strBuf, 0, strBuf, 0, strBufLen); - - charRefBufLen = other.charRefBufLen; - System.arraycopy(other.charRefBuf, 0, charRefBuf, 0, charRefBufLen); - - stateSave = other.stateSave; - returnStateSave = other.returnStateSave; - endTagExpectation = other.endTagExpectation; - endTagExpectationAsArray = other.endTagExpectationAsArray; - // line = 1; XXX line numbers - lastCR = other.lastCR; - index = other.index; - forceQuirks = other.forceQuirks; - additional = other.additional; - entCol = other.entCol; - firstCharKey = other.firstCharKey; - lo = other.lo; - hi = other.hi; - candidate = other.candidate; - 
charRefBufMark = other.charRefBufMark; - value = other.value; - seenDigits = other.seenDigits; - endTag = other.endTag; - shouldSuspend = false; - - if (other.doctypeName == null) { - doctypeName = null; - } else { - doctypeName = Portability.newLocalFromLocal(other.doctypeName, - interner); - } - - Portability.releaseString(systemIdentifier); - if (other.systemIdentifier == null) { - systemIdentifier = null; - } else { - systemIdentifier = Portability.newStringFromString(other.systemIdentifier); - } - - Portability.releaseString(publicIdentifier); - if (other.publicIdentifier == null) { - publicIdentifier = null; - } else { - publicIdentifier = Portability.newStringFromString(other.publicIdentifier); - } - - if (tagName != null) { - tagName.release(); - } - if (other.tagName == null) { - tagName = null; - } else { - tagName = other.tagName.cloneElementName(interner); - } - - if (attributeName != null) { - attributeName.release(); - } - if (other.attributeName == null) { - attributeName = null; - } else { - attributeName = other.attributeName.cloneAttributeName(interner); - } - - Portability.delete(attributes); - if (other.attributes == null) { - attributes = null; - } else { - attributes = other.attributes.cloneAttributes(interner); - } - } - - public void initializeWithoutStarting() throws SAXException { - confident = false; - strBuf = null; - line = 1; - // CPPONLY: attributeLine = 1; - // [NOCPP[ - html4 = false; - metaBoundaryPassed = false; - wantsComments = tokenHandler.wantsComments(); - if (!newAttributesEachTime) { - attributes = new HtmlAttributes(mappingLangToXmlLang); - } - // ]NOCPP] - resetToDataState(); - } - - protected void errGarbageAfterLtSlash() throws SAXException { - } - - protected void errLtSlashGt() throws SAXException { - } - - protected void errWarnLtSlashInRcdata() throws SAXException { - } - - protected void errHtml4LtSlashInRcdata(char folded) throws SAXException { - } - - protected void errCharRefLacksSemicolon() throws SAXException 
{ - } - - protected void errNoDigitsInNCR() throws SAXException { - } - - protected void errGtInSystemId() throws SAXException { - } - - protected void errGtInPublicId() throws SAXException { - } - - protected void errNamelessDoctype() throws SAXException { - } - - protected void errConsecutiveHyphens() throws SAXException { - } - - protected void errPrematureEndOfComment() throws SAXException { - } - - protected void errBogusComment() throws SAXException { - } - - protected void errUnquotedAttributeValOrNull(char c) throws SAXException { - } - - protected void errSlashNotFollowedByGt() throws SAXException { - } - - protected void errHtml4XmlVoidSyntax() throws SAXException { - } - - protected void errNoSpaceBetweenAttributes() throws SAXException { - } - - protected void errHtml4NonNameInUnquotedAttribute(char c) - throws SAXException { - } - - protected void errLtOrEqualsOrGraveInUnquotedAttributeOrNull(char c) - throws SAXException { - } - - protected void errAttributeValueMissing() throws SAXException { - } - - protected void errBadCharBeforeAttributeNameOrNull(char c) - throws SAXException { - } - - protected void errEqualsSignBeforeAttributeName() throws SAXException { - } - - protected void errBadCharAfterLt(char c) throws SAXException { - } - - protected void errLtGt() throws SAXException { - } - - protected void errProcessingInstruction() throws SAXException { - } - - protected void errUnescapedAmpersandInterpretedAsCharacterReference() - throws SAXException { - } - - protected void errNotSemicolonTerminated() throws SAXException { - } - - protected void errNoNamedCharacterMatch() throws SAXException { - } - - protected void errQuoteBeforeAttributeName(char c) throws SAXException { - } - - protected void errQuoteOrLtInAttributeNameOrNull(char c) - throws SAXException { - } - - protected void errExpectedPublicId() throws SAXException { - } - - protected void errBogusDoctype() throws SAXException { - } - - protected void maybeWarnPrivateUseAstral() throws 
SAXException { - } - - protected void maybeWarnPrivateUse(char ch) throws SAXException { - } - - protected void maybeErrAttributesOnEndTag(HtmlAttributes attrs) - throws SAXException { - } - - protected void maybeErrSlashInEndTag(boolean selfClosing) - throws SAXException { - } - - protected char errNcrNonCharacter(char ch) throws SAXException { - return ch; - } - - protected void errAstralNonCharacter(int ch) throws SAXException { - } - - protected void errNcrSurrogate() throws SAXException { - } - - protected char errNcrControlChar(char ch) throws SAXException { - return ch; - } - - protected void errNcrCr() throws SAXException { - } - - protected void errNcrInC1Range() throws SAXException { - } - - protected void errEofInPublicId() throws SAXException { - } - - protected void errEofInComment() throws SAXException { - } - - protected void errEofInDoctype() throws SAXException { - } - - protected void errEofInAttributeValue() throws SAXException { - } - - protected void errEofInAttributeName() throws SAXException { - } - - protected void errEofWithoutGt() throws SAXException { - } - - protected void errEofInTagName() throws SAXException { - } - - protected void errEofInEndTag() throws SAXException { - } - - protected void errEofAfterLt() throws SAXException { - } - - protected void errNcrOutOfRange() throws SAXException { - } - - protected void errNcrUnassigned() throws SAXException { - } - - protected void errDuplicateAttribute() throws SAXException { - } - - protected void errEofInSystemId() throws SAXException { - } - - protected void errExpectedSystemId() throws SAXException { - } - - protected void errMissingSpaceBeforeDoctypeName() throws SAXException { - } - - protected void errHyphenHyphenBang() throws SAXException { - } - - protected void errNcrControlChar() throws SAXException { - } - - protected void errNcrZero() throws SAXException { - } - - protected void errNoSpaceBetweenDoctypeSystemKeywordAndQuote() - throws SAXException { - } - - protected void 
errNoSpaceBetweenPublicAndSystemIds() throws SAXException { - } - - protected void errNoSpaceBetweenDoctypePublicKeywordAndQuote() - throws SAXException { - } - - protected void noteAttributeWithoutValue() throws SAXException { - } - - protected void noteUnquotedAttributeValue() throws SAXException { - } - - /** - * Sets the encodingDeclarationHandler. - * - * @param encodingDeclarationHandler - * the encodingDeclarationHandler to set - */ - public void setEncodingDeclarationHandler( - EncodingDeclarationHandler encodingDeclarationHandler) { - this.encodingDeclarationHandler = encodingDeclarationHandler; - } - - void destructor() { - // The translator will write refcount tracing stuff here - Portability.delete(attributes); - attributes = null; - } - - // [NOCPP[ - - /** - * Sets an offset to be added to the position reported to - * <code>TransitionHandler</code>. - * - * @param offset the offset - */ - public void setTransitionBaseOffset(int offset) { - - } - - // ]NOCPP] - -} |