diff options
Diffstat (limited to 'components')
42 files changed, 8646 insertions, 0 deletions
diff --git a/components/htmlparser/moz.build b/components/htmlparser/moz.build new file mode 100644 index 000000000..ddcad7b1a --- /dev/null +++ b/components/htmlparser/moz.build @@ -0,0 +1,49 @@ +# -*- Mode: python; indent-tabs-mode: nil; tab-width: 40 -*- +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, You can obtain one at http://mozilla.org/MPL/2.0/. + +XPIDL_SOURCES += [ + 'public/nsIExpatSink.idl', + 'public/nsIExtendedExpatSink.idl', +] + +EXPORTS += [ + 'src/nsElementTable.h', + 'src/nsHTMLTagList.h', + 'src/nsHTMLTags.h', + 'src/nsIContentSink.h', + 'src/nsIDTD.h', + 'src/nsIFragmentContentSink.h', + 'src/nsIHTMLContentSink.h', + 'src/nsIParser.h', + 'src/nsIParserService.h', + 'src/nsITokenizer.h', + 'src/nsParserBase.h', + 'src/nsParserCIID.h', + 'src/nsParserConstants.h', + 'src/nsScannerString.h', + 'src/nsToken.h', +] + +SOURCES += [ + 'src/CNavDTD.cpp', + 'src/CParserContext.cpp', + 'src/nsElementTable.cpp', + 'src/nsExpatDriver.cpp', + 'src/nsHTMLEntities.cpp', + 'src/nsHTMLTags.cpp', + 'src/nsHTMLTokenizer.cpp', + 'src/nsParser.cpp', + 'src/nsParserModule.cpp', + 'src/nsParserMsgUtils.cpp', + 'src/nsParserService.cpp', + 'src/nsScanner.cpp', + 'src/nsScannerString.cpp', +] + +if CONFIG['GNU_CXX']: + CXXFLAGS += ['-Wno-error=shadow'] + +XPIDL_MODULE = 'htmlparser' +FINAL_LIBRARY = 'xul' diff --git a/components/htmlparser/public/nsIExpatSink.idl b/components/htmlparser/public/nsIExpatSink.idl new file mode 100644 index 000000000..df0b2d869 --- /dev/null +++ b/components/htmlparser/public/nsIExpatSink.idl @@ -0,0 +1,109 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsISupports.idl" +interface nsIScriptError; + +/** + * This interface should be implemented by any content sink that wants + * to get output from expat and do something with it; in other words, + * by any sink that handles some sort of XML dialect. + */ + +[scriptable, uuid(01f681af-0f22-4725-a914-0d396114daf0)] +interface nsIExpatSink : nsISupports +{ + /** + * Called to handle the opening tag of an element. + * @param aName the fully qualified tagname of the element + * @param aAtts the array of attribute names and values. There are + * aAttsCount/2 names and aAttsCount/2 values, so the total number of + * elements in the array is aAttsCount. The names and values + * alternate. Thus, if we number attributes starting with 0, + * aAtts[2*k] is the name of the k-th attribute and aAtts[2*k+1] is + * the value of that attribute. Both explicitly specified attributes + * and attributes that are defined to have default values in a DTD are + * present in aAtts. + * @param aAttsCount the number of elements in aAtts. + * @param aLineNumber the line number of the start tag in the data stream. + */ + void HandleStartElement(in wstring aName, + [array, size_is(aAttsCount)] in wstring aAtts, + in unsigned long aAttsCount, + in unsigned long aLineNumber); + + /** + * Called to handle the closing tag of an element. + * @param aName the fully qualified tagname of the element + */ + void HandleEndElement(in wstring aName); + + /** + * Called to handle a comment + * @param aCommentText the text of the comment (not including the + * "<!--" and "-->") + */ + void HandleComment(in wstring aCommentText); + + /** + * Called to handle a CDATA section + * @param aData the text in the CDATA section. This is null-terminated. 
+ * @param aLength the length of the aData string + */ + void HandleCDataSection([size_is(aLength)] in wstring aData, + in unsigned long aLength); + + /** + * Called to handle the doctype declaration + */ + void HandleDoctypeDecl(in AString aSubset, + in AString aName, + in AString aSystemId, + in AString aPublicId, + in nsISupports aCatalogData); + + /** + * Called to handle character data. Note that this does NOT get + * called for the contents of CDATA sections. + * @param aData the data to handle. aData is NOT NULL-TERMINATED. + * @param aLength the length of the aData string + */ + void HandleCharacterData([size_is(aLength)] in wstring aData, + in unsigned long aLength); + + /** + * Called to handle a processing instruction + * @param aTarget the PI target (e.g. xml-stylesheet) + * @param aData all the rest of the data in the PI + */ + void HandleProcessingInstruction(in wstring aTarget, + in wstring aData); + + /** + * Handle the XML Declaration. + * + * @param aVersion The version string, can be null if not specified. + * @param aEncoding The encoding string, can be null if not specified. + * @param aStandalone -1, 0, or 1 indicating respectively that there was no + * standalone parameter in the declaration, that it was + * given as no, or that it was given as yes. + */ + void HandleXMLDeclaration(in wstring aVersion, + in wstring aEncoding, + in long aStandalone); + + /** + * Ask the content sink if the expat driver should log an error to the console. + * + * @param aErrorText Error message to pass to content sink. + * @param aSourceText Source text of the document we're parsing. + * @param aError Script error object with line number & column number + * + * @retval True if the expat driver should report the error. 
+ */ + boolean ReportError(in wstring aErrorText, + in wstring aSourceText, + in nsIScriptError aError); +}; diff --git a/components/htmlparser/public/nsIExtendedExpatSink.idl b/components/htmlparser/public/nsIExtendedExpatSink.idl new file mode 100644 index 000000000..d88f0d974 --- /dev/null +++ b/components/htmlparser/public/nsIExtendedExpatSink.idl @@ -0,0 +1,72 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsIExpatSink.idl" + +/** + * This interface provides notification of syntax-level events. + */ +[scriptable, uuid(5e3e4f0c-7b77-47ca-a7c5-a3d87f2a9c82)] +interface nsIExtendedExpatSink : nsIExpatSink +{ + /** + * Called at the beginning of the DTD, before any entity or notation + * events. + * @param aDoctypeName The document type name. + * @param aSysid The declared system identifier for the external DTD subset, + * or null if none was declared. + * @param aPubid The declared public identifier for the external DTD subset, + * or null if none was declared. + */ + void handleStartDTD(in wstring aDoctypeName, + in wstring aSysid, + in wstring aPubid); + + /** + * Called when a prefix mapping starts to be in-scope, before any + * startElement events. + * @param aPrefix The Namespace prefix being declared. An empty string + * is used for the default element namespace, which has + * no prefix. + * @param aUri The Namespace URI the prefix is mapped to. + */ + void handleStartNamespaceDecl(in wstring aPrefix, + in wstring aUri); + + /** + * Called when a prefix mapping is no longer in-scope, after any + * endElement events. + * @param aPrefix The prefix that was being mapped. This is the empty string + * when a default mapping scope ends. 
+ */ + void handleEndNamespaceDecl(in wstring aPrefix); + + /** + * This is called for a declaration of notation. The base argument is + * whatever was set by XML_SetBase. aNotationName will never be + * null. The other arguments can be. + * @param aNotationName The notation name. + * @param aSysId The notation's system identifier, or null if none was given. + * @param aPubId The notation's public identifier, or null if none was given. + */ + void handleNotationDecl(in wstring aNotationName, + in wstring aSysid, + in wstring aPubid); + + /** + * This is called for a declaration of an unparsed (NDATA) entity. + * aName, aSysid and aNotationName arguments will never be + * null. The other arguments may be. + * @param aName The unparsed entity's name. + * @param aSysId The unparsed entity's system identifier. + * @param aPubId The unparsed entity's public identifier, or null if none was given. + * @param aNotationName The name of the associated notation. + */ + void handleUnparsedEntityDecl(in wstring aName, + in wstring aSysid, + in wstring aPubid, + in wstring aNotationName); + +}; diff --git a/components/htmlparser/src/CNavDTD.cpp b/components/htmlparser/src/CNavDTD.cpp new file mode 100644 index 000000000..decc6a963 --- /dev/null +++ b/components/htmlparser/src/CNavDTD.cpp @@ -0,0 +1,90 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsISupports.h" +#include "nsISupportsImpl.h" +#include "nsIParser.h" +#include "CNavDTD.h" +#include "nsIHTMLContentSink.h" + +NS_IMPL_ISUPPORTS(CNavDTD, nsIDTD); + +CNavDTD::CNavDTD() +{ +} + +CNavDTD::~CNavDTD() +{ +} + +NS_IMETHODIMP +CNavDTD::WillBuildModel(const CParserContext& aParserContext, + nsITokenizer* aTokenizer, + nsIContentSink* aSink) +{ + return NS_OK; +} + +NS_IMETHODIMP +CNavDTD::BuildModel(nsITokenizer* aTokenizer, + nsIContentSink* aSink) +{ + // NB: It is important to throw STOPPARSING if the sink is the wrong type in + // order to make sure nsParser cleans up properly after itself. + nsCOMPtr<nsIHTMLContentSink> sink = do_QueryInterface(aSink); + if (!sink) { + return NS_ERROR_HTMLPARSER_STOPPARSING; + } + + nsresult rv = sink->OpenContainer(nsIHTMLContentSink::eHTML); + NS_ENSURE_SUCCESS(rv, rv); + rv = sink->OpenContainer(nsIHTMLContentSink::eBody); + NS_ENSURE_SUCCESS(rv, rv); + + rv = sink->CloseContainer(nsIHTMLContentSink::eBody); + MOZ_ASSERT(NS_SUCCEEDED(rv)); + rv = sink->CloseContainer(nsIHTMLContentSink::eHTML); + MOZ_ASSERT(NS_SUCCEEDED(rv)); + + return NS_OK; +} + +NS_IMETHODIMP +CNavDTD::DidBuildModel(nsresult anErrorCode) +{ + return NS_OK; +} + +NS_IMETHODIMP_(void) +CNavDTD::Terminate() +{ +} + + +NS_IMETHODIMP_(int32_t) +CNavDTD::GetType() +{ + return NS_IPARSER_FLAG_HTML; +} + +NS_IMETHODIMP_(nsDTDMode) +CNavDTD::GetMode() const +{ + return eDTDMode_quirks; +} + +NS_IMETHODIMP_(bool) +CNavDTD::CanContain(int32_t aParent,int32_t aChild) const +{ + MOZ_CRASH("nobody calls this"); + return false; +} + +NS_IMETHODIMP_(bool) +CNavDTD::IsContainer(int32_t aTag) const +{ + MOZ_CRASH("nobody calls this"); + return false; +} diff --git a/components/htmlparser/src/CNavDTD.h b/components/htmlparser/src/CNavDTD.h new file mode 100644 index 000000000..b3c557e81 --- /dev/null +++ b/components/htmlparser/src/CNavDTD.h @@ -0,0 +1,35 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This 
Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef NS_NAVHTMLDTD__ +#define NS_NAVHTMLDTD__ + +#include "nsIDTD.h" +#include "nsISupports.h" +#include "nsCOMPtr.h" + +#ifdef _MSC_VER +#pragma warning( disable : 4275 ) +#endif + +class CNavDTD : public nsIDTD +{ +#ifdef _MSC_VER +#pragma warning( default : 4275 ) +#endif + + virtual ~CNavDTD(); + +public: + CNavDTD(); + + NS_DECL_ISUPPORTS + NS_DECL_NSIDTD +}; + +#endif + + + diff --git a/components/htmlparser/src/CParserContext.cpp b/components/htmlparser/src/CParserContext.cpp new file mode 100644 index 000000000..3b764d7e4 --- /dev/null +++ b/components/htmlparser/src/CParserContext.cpp @@ -0,0 +1,85 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + + +#include "nsIAtom.h" +#include "CParserContext.h" +#include "nsToken.h" +#include "prenv.h" +#include "nsIHTMLContentSink.h" +#include "nsHTMLTokenizer.h" +#include "nsMimeTypes.h" +#include "nsHTMLTokenizer.h" + +CParserContext::CParserContext(CParserContext* aPrevContext, + nsScanner* aScanner, + void *aKey, + eParserCommands aCommand, + nsIRequestObserver* aListener, + eAutoDetectResult aStatus, + bool aCopyUnused) + : mListener(aListener), + mKey(aKey), + mPrevContext(aPrevContext), + mScanner(aScanner), + mDTDMode(eDTDMode_unknown), + mStreamListenerState(eNone), + mContextType(eCTNone), + mAutoDetectStatus(aStatus), + mParserCommand(aCommand), + mMultipart(true), + mCopyUnused(aCopyUnused) +{ + MOZ_COUNT_CTOR(CParserContext); +} + +CParserContext::~CParserContext() +{ + // It's ok to simply ignore the PrevContext. 
+ MOZ_COUNT_DTOR(CParserContext); +} + +void +CParserContext::SetMimeType(const nsACString& aMimeType) +{ + mMimeType.Assign(aMimeType); + + mDocType = ePlainText; + + if (mMimeType.EqualsLiteral(TEXT_HTML)) + mDocType = eHTML_Strict; + else if (mMimeType.EqualsLiteral(TEXT_XML) || + mMimeType.EqualsLiteral(APPLICATION_XML) || + mMimeType.EqualsLiteral(APPLICATION_XHTML_XML) || + mMimeType.EqualsLiteral(TEXT_XUL) || + mMimeType.EqualsLiteral(IMAGE_SVG_XML) || + mMimeType.EqualsLiteral(APPLICATION_MATHML_XML) || + mMimeType.EqualsLiteral(APPLICATION_RDF_XML) || + mMimeType.EqualsLiteral(APPLICATION_WAPXHTML_XML) || + mMimeType.EqualsLiteral(TEXT_RDF)) + mDocType = eXML; +} + +nsresult +CParserContext::GetTokenizer(nsIDTD* aDTD, + nsIContentSink* aSink, + nsITokenizer*& aTokenizer) +{ + nsresult result = NS_OK; + int32_t type = aDTD ? aDTD->GetType() : NS_IPARSER_FLAG_HTML; + + if (!mTokenizer) { + if (type == NS_IPARSER_FLAG_HTML || mParserCommand == eViewSource) { + mTokenizer = new nsHTMLTokenizer; + } + else if (type == NS_IPARSER_FLAG_XML) { + mTokenizer = do_QueryInterface(aDTD, &result); + } + } + + aTokenizer = mTokenizer; + + return result; +} diff --git a/components/htmlparser/src/CParserContext.h b/components/htmlparser/src/CParserContext.h new file mode 100644 index 000000000..8850b83d5 --- /dev/null +++ b/components/htmlparser/src/CParserContext.h @@ -0,0 +1,70 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +/** + * MODULE NOTES: + * @update gess 4/1/98 + * + */ + +#ifndef __CParserContext +#define __CParserContext + +#include "nsIParser.h" +#include "nsIURL.h" +#include "nsIDTD.h" +#include "nsIStreamListener.h" +#include "nsIRequest.h" +#include "nsScanner.h" +#include "nsString.h" +#include "nsCOMPtr.h" +#include "nsAutoPtr.h" + +/** + * Note that the parser is given FULL access to all + * data in a parsercontext. Hey, that's what it's for! + */ + +class CParserContext { +public: + enum eContextType {eCTNone,eCTURL,eCTString,eCTStream}; + + CParserContext(CParserContext* aPrevContext, + nsScanner* aScanner, + void* aKey = 0, + eParserCommands aCommand = eViewNormal, + nsIRequestObserver* aListener = 0, + eAutoDetectResult aStatus = eUnknownDetect, + bool aCopyUnused = false); + + ~CParserContext(); + + nsresult GetTokenizer(nsIDTD* aDTD, + nsIContentSink* aSink, + nsITokenizer*& aTokenizer); + void SetMimeType(const nsACString& aMimeType); + + nsCOMPtr<nsIRequest> mRequest; // provided by necko to differentiate different input streams + // why is mRequest strongly referenced? see bug 102376. + nsCOMPtr<nsIRequestObserver> mListener; + void* const mKey; + nsCOMPtr<nsITokenizer> mTokenizer; + CParserContext* const mPrevContext; + nsAutoPtr<nsScanner> mScanner; + + nsCString mMimeType; + nsDTDMode mDTDMode; + + eParserDocType mDocType; + eStreamState mStreamListenerState; + eContextType mContextType; + eAutoDetectResult mAutoDetectStatus; + eParserCommands mParserCommand; + + bool mMultipart; + bool mCopyUnused; +}; + +#endif diff --git a/components/htmlparser/src/nsElementTable.cpp b/components/htmlparser/src/nsElementTable.cpp new file mode 100644 index 000000000..7ab4c48b1 --- /dev/null +++ b/components/htmlparser/src/nsElementTable.cpp @@ -0,0 +1,210 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#include "nsElementTable.h" + +struct HTMLElement +{ +#ifdef DEBUG + nsHTMLTag mTagID; +#endif + bool mIsBlock; + bool mIsContainer; +}; + +#ifdef DEBUG +#define ELEM(tag, block, container) { eHTMLTag_##tag, block, container }, +#else +#define ELEM(tag, block, container) { block, container }, +#endif + +#define ____ false // This makes the table easier to read. + +// Note that the mIsBlock field disagrees with +// https://developer.mozilla.org/en-US/docs/Web/HTML/Block-level_elements for +// the following elements: center, details, dialog, dir, dt, figcaption, +// listing, menu, multicol, noscript, output, summary, tfoot, video. +// +// mrbkap thinks that the field values were pulled from the old HTML4 DTD and +// then got modified in mostly random ways to make the old parser's behavior +// compatible with the web. So it might make sense to change the mIsBlock +// values for the abovementioned tags at some point. 
+// + +static const HTMLElement gHTMLElements[] = { + ELEM(unknown, ____, ____) + ELEM(a, ____, true) + ELEM(abbr, ____, true) + ELEM(acronym, ____, true) + ELEM(address, true, true) + ELEM(applet, ____, true) + ELEM(area, ____, ____) + ELEM(article, true, true) + ELEM(aside, true, true) + ELEM(audio, ____, true) + ELEM(b, ____, true) + ELEM(base, ____, ____) + ELEM(basefont, ____, ____) + ELEM(bdo, ____, true) + ELEM(bgsound, ____, ____) + ELEM(big, ____, true) + ELEM(blockquote, true, true) + ELEM(body, ____, true) + ELEM(br, ____, ____) + ELEM(button, ____, true) + ELEM(canvas, ____, true) + ELEM(caption, ____, true) + ELEM(center, true, true) + ELEM(cite, ____, true) + ELEM(code, ____, true) + ELEM(col, ____, ____) + ELEM(colgroup, ____, true) + ELEM(data, ____, true) + ELEM(datalist, ____, true) + ELEM(dd, ____, true) + ELEM(del, ____, true) + ELEM(details, true, true) + ELEM(dfn, ____, true) + ELEM(dialog, true, true) + ELEM(dir, true, true) + ELEM(div, true, true) + ELEM(dl, true, true) + ELEM(dt, ____, true) + ELEM(em, ____, true) + ELEM(embed, ____, ____) + ELEM(fieldset, true, true) + ELEM(figcaption, ____, true) + ELEM(figure, true, true) + ELEM(font, ____, true) + ELEM(footer, true, true) + ELEM(form, true, true) + ELEM(frame, ____, ____) + ELEM(frameset, ____, true) + ELEM(h1, true, true) + ELEM(h2, true, true) + ELEM(h3, true, true) + ELEM(h4, true, true) + ELEM(h5, true, true) + ELEM(h6, true, true) + ELEM(head, ____, true) + ELEM(header, true, true) + ELEM(hgroup, true, true) + ELEM(hr, true, ____) + ELEM(html, ____, true) + ELEM(i, ____, true) + ELEM(iframe, ____, true) + ELEM(image, ____, ____) + ELEM(img, ____, ____) + ELEM(input, ____, ____) + ELEM(ins, ____, true) + ELEM(kbd, ____, true) + ELEM(keygen, ____, ____) + ELEM(label, ____, true) + ELEM(legend, ____, true) + ELEM(li, true, true) + ELEM(link, ____, ____) + ELEM(listing, true, true) + ELEM(main, true, true) + ELEM(map, ____, true) + ELEM(mark, ____, true) + ELEM(menu, true, true) + 
ELEM(menuitem, ____, true) + ELEM(meta, ____, ____) + ELEM(meter, ____, true) + ELEM(multicol, true, true) + ELEM(nav, true, true) + ELEM(nobr, ____, true) + ELEM(noembed, ____, true) + ELEM(noframes, ____, true) + ELEM(noscript, ____, true) + ELEM(object, ____, true) + ELEM(ol, true, true) + ELEM(optgroup, ____, true) + ELEM(option, ____, true) + ELEM(output, ____, true) + ELEM(p, true, true) + ELEM(param, ____, ____) + ELEM(picture, ____, true) + ELEM(plaintext, ____, true) + ELEM(pre, true, true) + ELEM(progress, ____, true) + ELEM(q, ____, true) + ELEM(rb, ____, true) + ELEM(rp, ____, true) + ELEM(rt, ____, true) + ELEM(rtc, ____, true) + ELEM(ruby, ____, true) + ELEM(s, ____, true) + ELEM(samp, ____, true) + ELEM(script, ____, true) + ELEM(section, true, true) + ELEM(select, ____, true) + ELEM(small, ____, true) + ELEM(slot, ____, true) + ELEM(source, ____, ____) + ELEM(span, ____, true) + ELEM(strike, ____, true) + ELEM(strong, ____, true) + ELEM(style, ____, true) + ELEM(sub, ____, true) + ELEM(summary, true, true) + ELEM(sup, ____, true) + ELEM(table, true, true) + ELEM(tbody, ____, true) + ELEM(td, ____, true) + ELEM(textarea, ____, true) + ELEM(tfoot, ____, true) + ELEM(th, ____, true) + ELEM(thead, ____, true) + ELEM(template, ____, true) + ELEM(time, ____, true) + ELEM(title, ____, true) + ELEM(tr, ____, true) + ELEM(track, ____, ____) + ELEM(tt, ____, true) + ELEM(u, ____, true) + ELEM(ul, true, true) + ELEM(var, ____, true) + ELEM(video, ____, true) + ELEM(wbr, ____, ____) + ELEM(xmp, ____, true) + ELEM(text, ____, ____) + ELEM(whitespace, ____, ____) + ELEM(newline, ____, ____) + ELEM(comment, ____, true) + ELEM(entity, ____, true) + ELEM(doctypeDecl, ____, true) + ELEM(markupDecl, ____, true) + ELEM(instruction, ____, true) + ELEM(userdefined, ____, true) +}; + +#undef ELEM +#undef ____ + +bool +nsHTMLElement::IsContainer(nsHTMLTag aId) +{ + return gHTMLElements[aId].mIsContainer; +} + +bool +nsHTMLElement::IsBlock(nsHTMLTag aId) +{ + return 
gHTMLElements[aId].mIsBlock; +} + +#ifdef DEBUG +void +CheckElementTable() +{ + for (nsHTMLTag t = eHTMLTag_unknown; + t <= eHTMLTag_userdefined; + t = nsHTMLTag(t + 1)) { + MOZ_ASSERT(gHTMLElements[t].mTagID == t, + "gHTMLElements entries does match tag list."); + } +} +#endif diff --git a/components/htmlparser/src/nsElementTable.h b/components/htmlparser/src/nsElementTable.h new file mode 100644 index 000000000..b456b5989 --- /dev/null +++ b/components/htmlparser/src/nsElementTable.h @@ -0,0 +1,21 @@ +/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsElementTable_h +#define nsElementTable_h + +#include "nsHTMLTags.h" + +#ifdef DEBUG +void CheckElementTable(); +#endif + +struct nsHTMLElement +{ + static bool IsContainer(nsHTMLTag aTag); + static bool IsBlock(nsHTMLTag aTag); +}; + +#endif // nsElementTable_h diff --git a/components/htmlparser/src/nsExpatDriver.cpp b/components/htmlparser/src/nsExpatDriver.cpp new file mode 100644 index 000000000..e35a1da25 --- /dev/null +++ b/components/htmlparser/src/nsExpatDriver.cpp @@ -0,0 +1,1412 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsExpatDriver.h" +#include "nsCOMPtr.h" +#include "nsParserCIID.h" +#include "CParserContext.h" +#include "nsIExpatSink.h" +#include "nsIExtendedExpatSink.h" +#include "nsIContentSink.h" +#include "nsParserMsgUtils.h" +#include "nsIURL.h" +#include "nsIUnicharInputStream.h" +#include "nsIProtocolHandler.h" +#include "nsNetUtil.h" +#include "prprf.h" +#include "prmem.h" +#include "nsTextFormatter.h" +#include "nsDirectoryServiceDefs.h" +#include "nsCRT.h" +#include "nsIConsoleService.h" +#include "nsIScriptError.h" +#include "nsIContentPolicy.h" +#include "nsContentPolicyUtils.h" +#include "nsError.h" +#include "nsXPCOMCIDInternal.h" +#include "nsUnicharInputStream.h" +#include "nsContentUtils.h" +#include "nsNullPrincipal.h" + +#include "mozilla/IntegerTypeTraits.h" +#include "mozilla/Logging.h" + +using mozilla::fallible; +using mozilla::LogLevel; + +#define kExpatSeparatorChar 0xFFFF + +static const char16_t kUTF16[] = { 'U', 'T', 'F', '-', '1', '6', '\0' }; + +static mozilla::LazyLogModule gExpatDriverLog("expatdriver"); + +// The maximum tree depth used for XML-based files (xml/svg/etc.) +static const uint16_t sMaxXMLDepth = 2048; + +/***************************** EXPAT CALL BACKS ******************************/ +// The callback handlers that get called from the expat parser. 
+ +static void +Driver_HandleXMLDeclaration(void *aUserData, + const XML_Char *aVersion, + const XML_Char *aEncoding, + int aStandalone) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + nsExpatDriver* driver = static_cast<nsExpatDriver*>(aUserData); + driver->HandleXMLDeclaration(aVersion, aEncoding, aStandalone); + } +} + +static void +Driver_HandleStartElement(void *aUserData, + const XML_Char *aName, + const XML_Char **aAtts) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + static_cast<nsExpatDriver*>(aUserData)->HandleStartElement(aName, + aAtts); + } +} + +static void +Driver_HandleEndElement(void *aUserData, + const XML_Char *aName) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + static_cast<nsExpatDriver*>(aUserData)->HandleEndElement(aName); + } +} + +static void +Driver_HandleCharacterData(void *aUserData, + const XML_Char *aData, + int aLength) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + nsExpatDriver* driver = static_cast<nsExpatDriver*>(aUserData); + driver->HandleCharacterData(aData, uint32_t(aLength)); + } +} + +static void +Driver_HandleComment(void *aUserData, + const XML_Char *aName) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if(aUserData) { + static_cast<nsExpatDriver*>(aUserData)->HandleComment(aName); + } +} + +static void +Driver_HandleProcessingInstruction(void *aUserData, + const XML_Char *aTarget, + const XML_Char *aData) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + nsExpatDriver* driver = static_cast<nsExpatDriver*>(aUserData); + driver->HandleProcessingInstruction(aTarget, aData); + } +} + +static void +Driver_HandleDefault(void *aUserData, + const XML_Char *aData, + int aLength) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + nsExpatDriver* driver = static_cast<nsExpatDriver*>(aUserData); + driver->HandleDefault(aData, 
uint32_t(aLength)); + } +} + +static void +Driver_HandleStartCdataSection(void *aUserData) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + static_cast<nsExpatDriver*>(aUserData)->HandleStartCdataSection(); + } +} + +static void +Driver_HandleEndCdataSection(void *aUserData) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + static_cast<nsExpatDriver*>(aUserData)->HandleEndCdataSection(); + } +} + +static void +Driver_HandleStartDoctypeDecl(void *aUserData, + const XML_Char *aDoctypeName, + const XML_Char *aSysid, + const XML_Char *aPubid, + int aHasInternalSubset) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + static_cast<nsExpatDriver*>(aUserData)-> + HandleStartDoctypeDecl(aDoctypeName, aSysid, aPubid, !!aHasInternalSubset); + } +} + +static void +Driver_HandleEndDoctypeDecl(void *aUserData) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + static_cast<nsExpatDriver*>(aUserData)->HandleEndDoctypeDecl(); + } +} + +static int +Driver_HandleExternalEntityRef(void *aExternalEntityRefHandler, + const XML_Char *aOpenEntityNames, + const XML_Char *aBase, + const XML_Char *aSystemId, + const XML_Char *aPublicId) +{ + NS_ASSERTION(aExternalEntityRefHandler, "expat driver should exist"); + if (!aExternalEntityRefHandler) { + return 1; + } + + nsExpatDriver* driver = static_cast<nsExpatDriver*> + (aExternalEntityRefHandler); + + return driver->HandleExternalEntityRef(aOpenEntityNames, aBase, aSystemId, + aPublicId); +} + +static void +Driver_HandleStartNamespaceDecl(void *aUserData, + const XML_Char *aPrefix, + const XML_Char *aUri) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + static_cast<nsExpatDriver*>(aUserData)-> + HandleStartNamespaceDecl(aPrefix, aUri); + } +} + +static void +Driver_HandleEndNamespaceDecl(void *aUserData, + const XML_Char *aPrefix) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if 
(aUserData) { + static_cast<nsExpatDriver*>(aUserData)-> + HandleEndNamespaceDecl(aPrefix); + } +} + +static void +Driver_HandleNotationDecl(void *aUserData, + const XML_Char *aNotationName, + const XML_Char *aBase, + const XML_Char *aSysid, + const XML_Char *aPubid) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + static_cast<nsExpatDriver*>(aUserData)-> + HandleNotationDecl(aNotationName, aBase, aSysid, aPubid); + } +} + +static void +Driver_HandleUnparsedEntityDecl(void *aUserData, + const XML_Char *aEntityName, + const XML_Char *aBase, + const XML_Char *aSysid, + const XML_Char *aPubid, + const XML_Char *aNotationName) +{ + NS_ASSERTION(aUserData, "expat driver should exist"); + if (aUserData) { + static_cast<nsExpatDriver*>(aUserData)-> + HandleUnparsedEntityDecl(aEntityName, aBase, aSysid, aPubid, + aNotationName); + } +} + + +/***************************** END CALL BACKS ********************************/ + +/***************************** CATALOG UTILS *********************************/ + +// Initially added for bug 113400 to switch from the remote "XHTML 1.0 plus +// MathML 2.0" DTD to the lightweight customized version that Mozilla uses. +// Since Mozilla is not validating, no need to fetch a *huge* file at each +// click. +// XXX The cleanest solution here would be to fix Bug 98413: Implement XML +// Catalogs. 
+struct nsCatalogData { + const char* mPublicID; + const char* mLocalDTD; + const char* mAgentSheet; +}; + +// The order of this table is guestimated to be in the optimum order +static const nsCatalogData kCatalogTable[] = { + { "-//W3C//DTD XHTML 1.0 Transitional//EN", "htmlmathml-f.ent", nullptr }, + { "-//W3C//DTD XHTML 1.1//EN", "htmlmathml-f.ent", nullptr }, + { "-//W3C//DTD XHTML 1.0 Strict//EN", "htmlmathml-f.ent", nullptr }, + { "-//W3C//DTD XHTML 1.0 Frameset//EN", "htmlmathml-f.ent", nullptr }, + { "-//W3C//DTD XHTML Basic 1.0//EN", "htmlmathml-f.ent", nullptr }, + { "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN", "htmlmathml-f.ent", nullptr }, + { "-//W3C//DTD XHTML 1.1 plus MathML 2.0 plus SVG 1.1//EN", "htmlmathml-f.ent", nullptr }, + { "-//W3C//DTD MathML 2.0//EN", "htmlmathml-f.ent", nullptr }, + { "-//WAPFORUM//DTD XHTML Mobile 1.0//EN", "htmlmathml-f.ent", nullptr }, + { nullptr, nullptr, nullptr } +}; + +static const nsCatalogData* +LookupCatalogData(const char16_t* aPublicID) +{ + nsDependentString publicID(aPublicID); + + // linear search for now since the number of entries is going to + // be negligible, and the fix for bug 98413 would get rid of this + // code anyway + const nsCatalogData* data = kCatalogTable; + while (data->mPublicID) { + if (publicID.EqualsASCII(data->mPublicID)) { + return data; + } + ++data; + } + + return nullptr; +} + +// This function provides a resource URI to a local DTD +// in resource://gre/res/dtd/ which may or may not exist. +// If aCatalogData is provided, it is used to remap the +// DTD instead of taking the filename from the URI. 
+static void +GetLocalDTDURI(const nsCatalogData* aCatalogData, nsIURI* aDTD, + nsIURI** aResult) +{ + NS_ASSERTION(aDTD, "Null parameter."); + + nsAutoCString fileName; + if (aCatalogData) { + // remap the DTD to a known local DTD + fileName.Assign(aCatalogData->mLocalDTD); + } + + if (fileName.IsEmpty()) { + // Try to see if the user has installed the DTD file -- we extract the + // filename.ext of the DTD here. Hence, for any DTD for which we have + // no predefined mapping, users just have to copy the DTD file to our + // special DTD directory and it will be picked. + nsCOMPtr<nsIURL> dtdURL = do_QueryInterface(aDTD); + if (!dtdURL) { + return; + } + + dtdURL->GetFileName(fileName); + if (fileName.IsEmpty()) { + return; + } + } + + nsAutoCString respath("resource://gre/res/dtd/"); + respath += fileName; + // The nsresult of NS_NewURI is not checked here; the caller in this file + // tests the returned URI for null instead. + NS_NewURI(aResult, respath); +} + +/***************************** END CATALOG UTILS *****************************/ + +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsExpatDriver) + NS_INTERFACE_MAP_ENTRY(nsITokenizer) + NS_INTERFACE_MAP_ENTRY(nsIDTD) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIDTD) +NS_INTERFACE_MAP_END + +NS_IMPL_CYCLE_COLLECTING_ADDREF(nsExpatDriver) +NS_IMPL_CYCLE_COLLECTING_RELEASE(nsExpatDriver) + +NS_IMPL_CYCLE_COLLECTION(nsExpatDriver, mSink, mExtendedSink) + +// Starts with no Expat parser, every state flag cleared and +// mInternalState == NS_OK. +nsExpatDriver::nsExpatDriver() + : mExpatParser(nullptr), + mInCData(false), + mInInternalSubset(false), + mInExternalDTD(false), + mMadeFinalCallToExpat(false), + mIsFinalChunk(false), + mTagDepth(0), + mInternalState(NS_OK), + mExpatBuffered(0), + mCatalogData(nullptr), + mInnerWindowID(0) +{ +} + +// Frees the Expat parser if WillBuildModel created one. +nsExpatDriver::~nsExpatDriver() +{ + if (mExpatParser) { + XML_ParserFree(mExpatParser); + } +} + +// Start-tag handler. Works out the full length of aAtts, applies the +// sMaxXMLDepth nesting limit (stopping the parser with +// NS_ERROR_HTMLPARSER_HIERARCHYTOODEEP once it is reached) and forwards the +// element to mSink together with Expat's current line number. +void +nsExpatDriver::HandleStartElement(const char16_t *aValue, + const char16_t **aAtts) +{ + NS_ASSERTION(mSink, "content sink not found!"); + + // Calculate the total number of elements in aAtts.
+ // XML_GetSpecifiedAttributeCount will only give us the number of specified + // attrs (twice that number, actually), so we have to check for default attrs + // ourselves. + uint32_t attrArrayLength; + for (attrArrayLength = XML_GetSpecifiedAttributeCount(mExpatParser); + aAtts[attrArrayLength]; + attrArrayLength += 2) { + // Just looping till we find out what the length is + } + + if (mSink) { + // Sanity check: Make sure the limit fits in the type the tag depth tracker + // was declared as. + static_assert(sMaxXMLDepth <= mozilla::MaxValue<decltype(nsExpatDriver::mTagDepth)>::value, + "Maximum XML parsing depth type mismatch: value too large."); + + if (++mTagDepth >= sMaxXMLDepth) { + MaybeStopParser(NS_ERROR_HTMLPARSER_HIERARCHYTOODEEP); + return; + } + + nsresult rv = mSink-> + HandleStartElement(aValue, aAtts, attrArrayLength, + XML_GetCurrentLineNumber(mExpatParser)); + MaybeStopParser(rv); + } +} + +// Forwards an end tag to mSink and decrements mTagDepth, unless parsing has +// already been stopped (NS_ERROR_HTMLPARSER_STOPPARSING). A sink failure is +// routed through MaybeStopParser; the method itself always returns NS_OK. +nsresult +nsExpatDriver::HandleEndElement(const char16_t *aValue) +{ + NS_ASSERTION(mSink, "content sink not found!"); + NS_ASSERTION(mInternalState != NS_ERROR_HTMLPARSER_BLOCK, + "Shouldn't block from HandleStartElement."); + + if (mSink && mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) { + nsresult rv = mSink->HandleEndElement(aValue); + --mTagDepth; + MaybeStopParser(rv); + } + + return NS_OK; +} + +// Text inside a CDATA section is accumulated in mCDataText (a failed append +// stops the parser with NS_ERROR_OUT_OF_MEMORY); all other text goes straight +// to mSink. +nsresult +nsExpatDriver::HandleCharacterData(const char16_t *aValue, + const uint32_t aLength) +{ + NS_ASSERTION(mSink, "content sink not found!"); + + if (mInCData) { + if (!mCDataText.Append(aValue, aLength, fallible)) { + MaybeStopParser(NS_ERROR_OUT_OF_MEMORY); + } + } + else if (mSink) { + nsresult rv = mSink->HandleCharacterData(aValue, aLength); + MaybeStopParser(rv); + } + + return NS_OK; +} + +// Comments are dropped while in an external DTD, re-serialized into +// mInternalSubset while in the internal subset, and otherwise handed to mSink. +nsresult +nsExpatDriver::HandleComment(const char16_t *aValue) +{ + NS_ASSERTION(mSink, "content sink not found!"); + + if (mInExternalDTD) { + // Ignore comments from external DTDs + return NS_OK; + } + + if (mInInternalSubset) { +
mInternalSubset.AppendLiteral("<!--"); + mInternalSubset.Append(aValue); + mInternalSubset.AppendLiteral("-->"); + } + else if (mSink) { + nsresult rv = mSink->HandleComment(aValue); + MaybeStopParser(rv); + } + + return NS_OK; +} + +nsresult +nsExpatDriver::HandleProcessingInstruction(const char16_t *aTarget, + const char16_t *aData) +{ + NS_ASSERTION(mSink, "content sink not found!"); + + if (mInExternalDTD) { + // Ignore PIs in external DTDs for now. Eventually we want to + // pass them to the sink in a way that doesn't put them in the DOM + return NS_OK; + } + + if (mInInternalSubset) { + mInternalSubset.AppendLiteral("<?"); + mInternalSubset.Append(aTarget); + mInternalSubset.Append(' '); + mInternalSubset.Append(aData); + mInternalSubset.AppendLiteral("?>"); + } + else if (mSink) { + nsresult rv = mSink->HandleProcessingInstruction(aTarget, aData); + MaybeStopParser(rv); + } + + return NS_OK; +} + +// Passes the XML declaration through to mSink unchanged. +nsresult +nsExpatDriver::HandleXMLDeclaration(const char16_t *aVersion, + const char16_t *aEncoding, + int32_t aStandalone) +{ + if (mSink) { + nsresult rv = mSink->HandleXMLDeclaration(aVersion, aEncoding, aStandalone); + MaybeStopParser(rv); + } + + return NS_OK; +} + +// Default handler. Input is ignored inside an external DTD and appended +// verbatim to mInternalSubset inside the internal subset. Otherwise only the +// '\n' and '\r' characters of aValue are forwarded to mSink, one character +// per call, for as long as no error is pending. +nsresult +nsExpatDriver::HandleDefault(const char16_t *aValue, + const uint32_t aLength) +{ + NS_ASSERTION(mSink, "content sink not found!"); + + if (mInExternalDTD) { + // Ignore newlines in external DTDs + return NS_OK; + } + + if (mInInternalSubset) { + mInternalSubset.Append(aValue, aLength); + } + else if (mSink) { + uint32_t i; + nsresult rv = mInternalState; + for (i = 0; i < aLength && NS_SUCCEEDED(rv); ++i) { + if (aValue[i] == '\n' || aValue[i] == '\r') { + rv = mSink->HandleCharacterData(&aValue[i], 1); + } + } + MaybeStopParser(rv); + } + + return NS_OK; +} + +// Marks the start of a CDATA section; HandleCharacterData buffers text in +// mCDataText while the flag is set. +nsresult +nsExpatDriver::HandleStartCdataSection() +{ + mInCData = true; + + return NS_OK; +} + +// Hands the buffered CDATA text to mSink as one section and clears the buffer. +nsresult +nsExpatDriver::HandleEndCdataSection() +{ + NS_ASSERTION(mSink, "content sink not found!"); + + mInCData = false; + if
(mSink) { + nsresult rv = mSink->HandleCDataSection(mCDataText.get(), + mCDataText.Length()); + MaybeStopParser(rv); + } + mCDataText.Truncate(); + + return NS_OK; +} + +nsresult +nsExpatDriver::HandleStartNamespaceDecl(const char16_t* aPrefix, + const char16_t* aUri) +{ + if (mExtendedSink) { + nsresult rv = mExtendedSink->HandleStartNamespaceDecl(aPrefix, aUri); + MaybeStopParser(rv); + } + return NS_OK; +} + +nsresult +nsExpatDriver::HandleEndNamespaceDecl(const char16_t* aPrefix) +{ + if (mExtendedSink && mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) { + nsresult rv = mExtendedSink->HandleEndNamespaceDecl(aPrefix); + MaybeStopParser(rv); + } + return NS_OK; +} + +nsresult +nsExpatDriver::HandleNotationDecl(const char16_t* aNotationName, + const char16_t* aBase, + const char16_t* aSysid, + const char16_t* aPubid) +{ + if (mExtendedSink) { + nsresult rv = mExtendedSink->HandleNotationDecl(aNotationName, aSysid, + aPubid); + MaybeStopParser(rv); + } + return NS_OK; +} + +nsresult +nsExpatDriver::HandleUnparsedEntityDecl(const char16_t* aEntityName, + const char16_t* aBase, + const char16_t* aSysid, + const char16_t* aPubid, + const char16_t* aNotationName) +{ + if (mExtendedSink) { + nsresult rv = mExtendedSink->HandleUnparsedEntityDecl(aEntityName, + aSysid, + aPubid, + aNotationName); + MaybeStopParser(rv); + } + return NS_OK; +} + +nsresult +nsExpatDriver::HandleStartDoctypeDecl(const char16_t* aDoctypeName, + const char16_t* aSysid, + const char16_t* aPubid, + bool aHasInternalSubset) +{ + mDoctypeName = aDoctypeName; + mSystemID = aSysid; + mPublicID = aPubid; + + if (mExtendedSink) { + nsresult rv = mExtendedSink->HandleStartDTD(aDoctypeName, aSysid, aPubid); + MaybeStopParser(rv); + } + + if (aHasInternalSubset) { + // Consuming a huge internal subset translates to numerous + // allocations. In an effort to avoid too many allocations + // setting mInternalSubset's capacity to be 1K ( just a guesstimate! ).
+ mInInternalSubset = true; + mInternalSubset.SetCapacity(1024); + } else { + // Distinguish missing internal subset from an empty one + mInternalSubset.SetIsVoid(true); + } + + return NS_OK; +} + +// Ends internal-subset collection and reports the doctype (name, system id, +// public id and the collected internal subset) to mSink. The catalog style +// sheet argument is always null because the code that would fill it in is +// compiled out. +nsresult +nsExpatDriver::HandleEndDoctypeDecl() +{ + NS_ASSERTION(mSink, "content sink not found!"); + + mInInternalSubset = false; + + if (mSink) { + // let the sink know any additional knowledge that we have about the + // document (currently, from bug 124570, we only expect to pass additional + // agent sheets needed to layout the XML vocabulary of the document) + nsCOMPtr<nsIURI> data; +#if 0 + if (mCatalogData && mCatalogData->mAgentSheet) { + NS_NewURI(getter_AddRefs(data), mCatalogData->mAgentSheet); + } +#endif + + // The unused support for "catalog style sheets" was removed. It doesn't + // look like we'll ever fix bug 98413 either. + MOZ_ASSERT(!mCatalogData || !mCatalogData->mAgentSheet, + "Need to add back support for catalog style sheets"); + + // Note: mInternalSubset already doesn't include the [] around it. + nsresult rv = mSink->HandleDoctypeDecl(mInternalSubset, mDoctypeName, + mSystemID, mPublicID, data); + MaybeStopParser(rv); + } + + mInternalSubset.SetCapacity(0); + + return NS_OK; +} + +// ReadSegments callback used for external DTDs. aClosure is the XML_Parser +// created for the external entity; the segment is passed to it as a non-final +// chunk and is reported as fully consumed when Expat accepts it. +static nsresult +ExternalDTDStreamReaderFunc(nsIUnicharInputStream* aIn, + void* aClosure, + const char16_t* aFromSegment, + uint32_t aToOffset, + uint32_t aCount, + uint32_t *aWriteCount) +{ + // Pass the buffer to expat for parsing.
+ if (XML_Parse((XML_Parser)aClosure, (const char *)aFromSegment, + aCount * sizeof(char16_t), 0) == XML_STATUS_OK) { + *aWriteCount = aCount; + + return NS_OK; + } + + *aWriteCount = 0; + + return NS_ERROR_FAILURE; +} + +int +nsExpatDriver::HandleExternalEntityRef(const char16_t *openEntityNames, + const char16_t *base, + const char16_t *systemId, + const char16_t *publicId) +{ + if (mInInternalSubset && !mInExternalDTD && openEntityNames) { + mInternalSubset.Append(char16_t('%')); + mInternalSubset.Append(nsDependentString(openEntityNames)); + mInternalSubset.Append(char16_t(';')); + } + + // Load the external entity into a buffer. + nsCOMPtr<nsIInputStream> in; + nsAutoString absURL; + nsresult rv = OpenInputStreamFromExternalDTD(publicId, systemId, base, + getter_AddRefs(in), absURL); + if (NS_FAILED(rv)) { +#ifdef DEBUG + nsCString message("Failed to open external DTD: publicId \""); + AppendUTF16toUTF8(publicId, message); + message += "\" systemId \""; + AppendUTF16toUTF8(systemId, message); + message += "\" base \""; + AppendUTF16toUTF8(base, message); + message += "\" URL \""; + AppendUTF16toUTF8(absURL, message); + message += "\""; + NS_WARNING(message.get()); +#endif + // A non-zero result lets Expat carry on, so an external DTD that cannot be + // opened is skipped rather than reported as a fatal error. + return 1; + } + + nsCOMPtr<nsIUnicharInputStream> uniIn; + rv = NS_NewUnicharInputStream(in, getter_AddRefs(uniIn)); + NS_ENSURE_SUCCESS(rv, 1); + + int result = 1; + if (uniIn) { + XML_Parser entParser = XML_ExternalEntityParserCreate(mExpatParser, 0, + kUTF16); + if (entParser) { + XML_SetBase(entParser, absURL.get()); + + mInExternalDTD = true; + + uint32_t totalRead; + do { + rv = uniIn->ReadSegments(ExternalDTDStreamReaderFunc, entParser, + uint32_t(-1), &totalRead); + } while (NS_SUCCEEDED(rv) && totalRead > 0); + + result = XML_Parse(entParser, nullptr, 0, 1); + + mInExternalDTD = false; + + XML_ParserFree(entParser); + } + } + + return result; +} + +// Opens a synchronous stream for an external DTD. aURLStr is resolved against +// aBaseURL. A URI that is not a UI resource is only loaded if it can be +// remapped to a file under resource://gre/res/dtd/ (which requires a non-null +// aFPIStr); otherwise NS_ERROR_NOT_IMPLEMENTED is returned. The spec of the +// URI that is actually opened is written to aAbsURL. +nsresult +nsExpatDriver::OpenInputStreamFromExternalDTD(const char16_t* aFPIStr, + const char16_t* aURLStr, + const char16_t* aBaseURL,
+ nsIInputStream** aStream, + nsAString& aAbsURL) +{ + nsCOMPtr<nsIURI> baseURI; + nsresult rv = NS_NewURI(getter_AddRefs(baseURI), + NS_ConvertUTF16toUTF8(aBaseURL)); + NS_ENSURE_SUCCESS(rv, rv); + + nsCOMPtr<nsIURI> uri; + rv = NS_NewURI(getter_AddRefs(uri), NS_ConvertUTF16toUTF8(aURLStr), nullptr, + baseURI); + NS_ENSURE_SUCCESS(rv, rv); + + // make sure the URI is allowed to be loaded in sync + bool isUIResource = false; + rv = NS_URIChainHasFlags(uri, nsIProtocolHandler::URI_IS_UI_RESOURCE, + &isUIResource); + NS_ENSURE_SUCCESS(rv, rv); + + nsCOMPtr<nsIURI> localURI; + if (!isUIResource) { + // Check to see if we can map the DTD to a known local DTD, or if a DTD + // file of the same name exists in the special DTD directory + if (aFPIStr) { + // see if the Formal Public Identifier (FPI) maps to a catalog entry + mCatalogData = LookupCatalogData(aFPIStr); + GetLocalDTDURI(mCatalogData, uri, getter_AddRefs(localURI)); + } + if (!localURI) { + return NS_ERROR_NOT_IMPLEMENTED; + } + } + + nsCOMPtr<nsIChannel> channel; + if (localURI) { + localURI.swap(uri); + rv = NS_NewChannel(getter_AddRefs(channel), + uri, + nsContentUtils::GetSystemPrincipal(), + nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_DATA_IS_NULL, + nsIContentPolicy::TYPE_DTD); + } + else { + NS_ASSERTION(mSink == nsCOMPtr<nsIExpatSink>(do_QueryInterface(mOriginalSink)), + "In nsExpatDriver::OpenInputStreamFromExternalDTD: " + "mOriginalSink not the same object as mSink?"); + nsCOMPtr<nsIPrincipal> loadingPrincipal; + if (mOriginalSink) { + nsCOMPtr<nsIDocument> doc; + doc = do_QueryInterface(mOriginalSink->GetTarget()); + if (doc) { + loadingPrincipal = doc->NodePrincipal(); + } + } + if (!loadingPrincipal) { + loadingPrincipal = nsNullPrincipal::Create(); + } + rv = NS_NewChannel(getter_AddRefs(channel), + uri, + loadingPrincipal, + nsILoadInfo::SEC_ALLOW_CROSS_ORIGIN_DATA_INHERITS | + nsILoadInfo::SEC_ALLOW_CHROME, + nsIContentPolicy::TYPE_DTD); + } + NS_ENSURE_SUCCESS(rv, rv); + + nsAutoCString absURL; + rv
= uri->GetSpec(absURL); + NS_ENSURE_SUCCESS(rv, rv); + CopyUTF8toUTF16(absURL, aAbsURL); + + channel->SetContentType(NS_LITERAL_CSTRING("application/xml")); + return channel->Open2(aStream); +} + +static nsresult +CreateErrorText(const char16_t* aDescription, + const char16_t* aSourceURL, + const uint32_t aLineNumber, + const uint32_t aColNumber, + nsString& aErrorString) +{ + aErrorString.Truncate(); + + nsAutoString msg; + nsresult rv = + nsParserMsgUtils::GetLocalizedStringByName(XMLPARSER_PROPERTIES, + "XMLParsingError", msg); + NS_ENSURE_SUCCESS(rv, rv); + + // XML Parsing Error: %1$S\nLocation: %2$S\nLine Number %3$u, Column %4$u: + char16_t *message = nsTextFormatter::smprintf(msg.get(), aDescription, + aSourceURL, aLineNumber, + aColNumber); + if (!message) { + return NS_ERROR_OUT_OF_MEMORY; + } + + aErrorString.Assign(message); + nsTextFormatter::smprintf_free(message); + + return NS_OK; +} + +// Appends a newline and a row of '-' characters ending in '^' to +// aSourceString, so that the '^' falls under column aColNumber of aSourceLine. +// A tab in aSourceLine widens the row to the next multiple of 8. +// NOTE(review): aSourceLine[0 .. aColNumber - 2] is read without checking the +// length of the line -- presumably the column always lies within the line; +// confirm against the caller. +static nsresult +AppendErrorPointer(const int32_t aColNumber, + const char16_t *aSourceLine, + nsString& aSourceString) +{ + aSourceString.Append(char16_t('\n')); + + // Last character will be '^'. + int32_t last = aColNumber - 1; + int32_t i; + uint32_t minuses = 0; + for (i = 0; i < last; ++i) { + if (aSourceLine[i] == '\t') { + // Since this uses |white-space: pre;| a tab stop equals 8 spaces. + uint32_t add = 8 - (minuses % 8); + aSourceString.AppendASCII("--------", add); + minuses += add; + } + else { + aSourceString.Append(char16_t('-')); + ++minuses; + } + } + aSourceString.Append(char16_t('^')); + + return NS_OK; +} + +// Turns Expat's current error into a localized message (naming the expected +// end tag for XML_ERROR_TAG_MISMATCH), wraps it in an nsIScriptError, offers +// it to mSink and, unless suppressed, logs it to the console service. +// Returns NS_ERROR_HTMLPARSER_STOPPARSING, or NS_ERROR_OUT_OF_MEMORY if the +// tag-mismatch text could not be formatted. +nsresult +nsExpatDriver::HandleError() +{ + int32_t code = XML_GetErrorCode(mExpatParser); + NS_ASSERTION(code > XML_ERROR_NONE, "unexpected XML error code"); + + // Map Expat error code to an error string + // XXX Deal with error returns.
+ nsAutoString description; + nsParserMsgUtils::GetLocalizedStringByID(XMLPARSER_PROPERTIES, code, + description); + + if (code == XML_ERROR_TAG_MISMATCH) { + /** + * Expat can send the following: + * localName + * namespaceURI<separator>localName + * namespaceURI<separator>localName<separator>prefix + * + * and we use 0xFFFF for the <separator>. + * + */ + const char16_t *mismatch = MOZ_XML_GetMismatchedTag(mExpatParser); + const char16_t *uriEnd = nullptr; + const char16_t *nameEnd = nullptr; + const char16_t *pos; + for (pos = mismatch; *pos; ++pos) { + if (*pos == kExpatSeparatorChar) { + if (uriEnd) { + nameEnd = pos; + } + else { + uriEnd = pos; + } + } + } + + nsAutoString tagName; + if (uriEnd && nameEnd) { + // We have a prefix. + tagName.Append(nameEnd + 1, pos - nameEnd - 1); + tagName.Append(char16_t(':')); + } + const char16_t *nameStart = uriEnd ? uriEnd + 1 : mismatch; + tagName.Append(nameStart, (nameEnd ? nameEnd : pos) - nameStart); + + nsAutoString msg; + nsParserMsgUtils::GetLocalizedStringByName(XMLPARSER_PROPERTIES, + "Expected", msg); + + // . Expected: </%S>. + char16_t *message = nsTextFormatter::smprintf(msg.get(), tagName.get()); + if (!message) { + return NS_ERROR_OUT_OF_MEMORY; + } + + description.Append(message); + + nsTextFormatter::smprintf_free(message); + } + + // Adjust the column number so that it is one based rather than zero based. + uint32_t colNumber = XML_GetCurrentColumnNumber(mExpatParser) + 1; + uint32_t lineNumber = XML_GetCurrentLineNumber(mExpatParser); + + nsAutoString errorText; + // The result of CreateErrorText is ignored; if it fails errorText is left + // empty. + CreateErrorText(description.get(), XML_GetBase(mExpatParser), lineNumber, + colNumber, errorText); + + NS_ASSERTION(mSink, "no sink?"); + + nsAutoString sourceText(mLastLine); + AppendErrorPointer(colNumber, mLastLine.get(), sourceText); + + // Try to create and initialize the script error.
+ nsCOMPtr<nsIScriptError> serr(do_CreateInstance(NS_SCRIPTERROR_CONTRACTID)); + nsresult rv = NS_ERROR_FAILURE; + if (serr) { + rv = serr->InitWithWindowID(errorText, + mURISpec, + mLastLine, + lineNumber, colNumber, + nsIScriptError::errorFlag, "malformed-xml", + mInnerWindowID); + } + + // If it didn't initialize, we can't do any logging. + bool shouldReportError = NS_SUCCEEDED(rv); + + if (mSink && shouldReportError) { + rv = mSink->ReportError(errorText.get(), + sourceText.get(), + serr, + &shouldReportError); + if (NS_FAILED(rv)) { + shouldReportError = true; + } + } + + if (mOriginalSink) { + nsCOMPtr<nsIDocument> doc = do_QueryInterface(mOriginalSink->GetTarget()); + if (doc && doc->SuppressParserErrorConsoleMessages()) { + shouldReportError = false; + } + } + + if (shouldReportError) { + nsCOMPtr<nsIConsoleService> cs + (do_GetService(NS_CONSOLESERVICE_CONTRACTID)); + if (cs) { + cs->LogMessage(serr); + } + } + + return NS_ERROR_HTMLPARSER_STOPPARSING; +} + +// The contract is described in nsExpatDriver.h. Either resumes a suspended +// Expat parse or feeds aBuffer to Expat; *aConsumed is derived from how far +// XML_GetCurrentByteIndex advanced. An XML_STATUS_ERROR result sets +// mInternalState to NS_ERROR_HTMLPARSER_STOPPARSING. +void +nsExpatDriver::ParseBuffer(const char16_t *aBuffer, + uint32_t aLength, + bool aIsFinal, + uint32_t *aConsumed) +{ + NS_ASSERTION((aBuffer && aLength != 0) || (!aBuffer && aLength == 0), "?"); + NS_ASSERTION(mInternalState != NS_OK || aIsFinal || aBuffer, + "Useless call, we won't call Expat"); + NS_PRECONDITION(!BlockedOrInterrupted() || !aBuffer, + "Non-null buffer when resuming"); + NS_PRECONDITION(XML_GetCurrentByteIndex(mExpatParser) % sizeof(char16_t) == 0, + "Consumed part of a char16_t?"); + + if (mExpatParser && (mInternalState == NS_OK || BlockedOrInterrupted())) { + int32_t parserBytesBefore = XML_GetCurrentByteIndex(mExpatParser); + NS_ASSERTION(parserBytesBefore >= 0, "Unexpected value"); + + XML_Status status; + if (BlockedOrInterrupted()) { + mInternalState = NS_OK; // Resume in case we're blocked.
+ status = XML_ResumeParser(mExpatParser); + } + else { + status = XML_Parse(mExpatParser, + reinterpret_cast<const char*>(aBuffer), + aLength * sizeof(char16_t), aIsFinal); + } + + int32_t parserBytesConsumed = XML_GetCurrentByteIndex(mExpatParser); + + NS_ASSERTION(parserBytesConsumed >= 0, "Unexpected value"); + NS_ASSERTION(parserBytesConsumed >= parserBytesBefore, + "How'd this happen?"); + NS_ASSERTION(parserBytesConsumed % sizeof(char16_t) == 0, + "Consumed part of a char16_t?"); + + // Consumed something. + *aConsumed = (parserBytesConsumed - parserBytesBefore) / sizeof(char16_t); + NS_ASSERTION(*aConsumed <= aLength + mExpatBuffered, + "Too many bytes consumed?"); + + NS_ASSERTION(status != XML_STATUS_SUSPENDED || BlockedOrInterrupted(), + "Inconsistent expat suspension state."); + + if (status == XML_STATUS_ERROR) { + mInternalState = NS_ERROR_HTMLPARSER_STOPPARSING; + } + } + else { + *aConsumed = 0; + } +} + +// Tokenizer entry point: passes the scanner's unread data to Expat (or +// resumes a suspended parse), keeps mExpatBuffered and mLastLine up to date, +// and leaves the scanner positioned where Expat will start parsing next. +NS_IMETHODIMP +nsExpatDriver::ConsumeToken(nsScanner& aScanner, bool& aFlushTokens) +{ + // We keep the scanner pointing to the position where Expat will start + // parsing. + nsScannerIterator currentExpatPosition; + aScanner.CurrentPosition(currentExpatPosition); + + // This is the start of the first buffer that we need to pass to Expat. + nsScannerIterator start = currentExpatPosition; + start.advance(mExpatBuffered); + + // This is the end of the last buffer (at this point, more data could come in + // later). + nsScannerIterator end; + aScanner.EndReading(end); + + MOZ_LOG(gExpatDriverLog, LogLevel::Debug, + ("Remaining in expat's buffer: %i, remaining in scanner: %i.", + mExpatBuffered, Distance(start, end))); + + // We want to call Expat if we have more buffers, or if we know there won't + // be more buffers (and so we want to flush the remaining data), or if we're + // currently blocked and there's data in Expat's buffer.
+ while (start != end || (mIsFinalChunk && !mMadeFinalCallToExpat) || + (BlockedOrInterrupted() && mExpatBuffered > 0)) { + bool noMoreBuffers = start == end && mIsFinalChunk; + bool blocked = BlockedOrInterrupted(); + + const char16_t *buffer; + uint32_t length; + if (blocked || noMoreBuffers) { + // If we're blocked we just resume Expat so we don't need a buffer, if + // there aren't any more buffers we pass a null buffer to Expat. + buffer = nullptr; + length = 0; + + if (blocked) { + MOZ_LOG(gExpatDriverLog, LogLevel::Debug, + ("Resuming Expat, will parse data remaining in Expat's " + "buffer.\nContent of Expat's buffer:\n-----\n%s\n-----\n", + NS_ConvertUTF16toUTF8(currentExpatPosition.get(), + mExpatBuffered).get())); + } + else { + NS_ASSERTION(mExpatBuffered == Distance(currentExpatPosition, end), + "Didn't pass all the data to Expat?"); + MOZ_LOG(gExpatDriverLog, LogLevel::Debug, + ("Last call to Expat, will parse data remaining in Expat's " + "buffer.\nContent of Expat's buffer:\n-----\n%s\n-----\n", + NS_ConvertUTF16toUTF8(currentExpatPosition.get(), + mExpatBuffered).get())); + } + } + else { + buffer = start.get(); + length = uint32_t(start.size_forward()); + + MOZ_LOG(gExpatDriverLog, LogLevel::Debug, + ("Calling Expat, will parse data remaining in Expat's buffer and " + "new data.\nContent of Expat's buffer:\n-----\n%s\n-----\nNew " + "data:\n-----\n%s\n-----\n", + NS_ConvertUTF16toUTF8(currentExpatPosition.get(), + mExpatBuffered).get(), + NS_ConvertUTF16toUTF8(start.get(), length).get())); + } + + uint32_t consumed; + ParseBuffer(buffer, length, noMoreBuffers, &consumed); + if (consumed > 0) { + nsScannerIterator oldExpatPosition = currentExpatPosition; + currentExpatPosition.advance(consumed); + + // We consumed some data, we want to store the last line of data that + // was consumed in case we run into an error (to show the line in which + // the error occurred). + + // The length of the last line that Expat has parsed.
+ XML_Size lastLineLength = XML_GetCurrentColumnNumber(mExpatParser); + + if (lastLineLength <= consumed) { + // The length of the last line was less than what expat consumed, so + // there was at least one line break in the consumed data. Store the + // last line until the point where we stopped parsing. + nsScannerIterator startLastLine = currentExpatPosition; + startLastLine.advance(-((ptrdiff_t)lastLineLength)); + if (!CopyUnicodeTo(startLastLine, currentExpatPosition, mLastLine)) { + return (mInternalState = NS_ERROR_OUT_OF_MEMORY); + } + } + else { + // There was no line break in the consumed data, append the consumed + // data. + if (!AppendUnicodeTo(oldExpatPosition, + currentExpatPosition, + mLastLine)) { + return (mInternalState = NS_ERROR_OUT_OF_MEMORY); + } + } + } + + mExpatBuffered += length - consumed; + + if (BlockedOrInterrupted()) { + MOZ_LOG(gExpatDriverLog, LogLevel::Debug, + ("Blocked or interrupted parser (probably for loading linked " + "stylesheets or scripts).")); + + aScanner.SetPosition(currentExpatPosition, true); + aScanner.Mark(); + + return mInternalState; + } + + if (noMoreBuffers && mExpatBuffered == 0) { + mMadeFinalCallToExpat = true; + } + + if (NS_FAILED(mInternalState)) { + if (XML_GetErrorCode(mExpatParser) != XML_ERROR_NONE) { + NS_ASSERTION(mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING, + "Unexpected error"); + + // Look for the next newline after the last one we consumed + nsScannerIterator lastLine = currentExpatPosition; + while (lastLine != end) { + length = uint32_t(lastLine.size_forward()); + uint32_t endOffset = 0; + const char16_t *buffer = lastLine.get(); + while (endOffset < length && buffer[endOffset] != '\n' && + buffer[endOffset] != '\r') { + ++endOffset; + } + mLastLine.Append(Substring(buffer, buffer + endOffset)); + if (endOffset < length) { + // We found a newline.
+ break; + } + + lastLine.advance(length); + } + + HandleError(); + } + + return mInternalState; + } + + // Either we have more buffers, or we were blocked (and we'll flush in the + // next iteration), or we should have emptied Expat's buffer. + NS_ASSERTION(!noMoreBuffers || blocked || + (mExpatBuffered == 0 && currentExpatPosition == end), + "Unreachable data left in Expat's buffer"); + + start.advance(length); + + // It's possible for start to have passed end if we received more data + // (e.g. if we spun the event loop in an inline script). Reload end now + // to compensate. + aScanner.EndReading(end); + } + + aScanner.SetPosition(currentExpatPosition, true); + aScanner.Mark(); + + MOZ_LOG(gExpatDriverLog, LogLevel::Debug, + ("Remaining in expat's buffer: %i, remaining in scanner: %i.", + mExpatBuffered, Distance(currentExpatPosition, end))); + + // NOTE(review): a successful state is reported as kEOF here and a failed one + // as NS_OK; failures met inside the loop return earlier, so the NS_OK arm is + // only reached when the loop never ran -- confirm this mapping is intended. + return NS_SUCCEEDED(mInternalState) ? kEOF : NS_OK; +} + +// Prepares the driver for a parse: requires aSink to implement nsIExpatSink +// (NS_ERROR_UNEXPECTED otherwise), creates the UTF-16 Expat parser with +// kExpatSeparatorChar as namespace separator, records the scanner's file name +// as base URI, looks up the inner window id used for error reporting and +// installs the Driver_* callbacks. +NS_IMETHODIMP +nsExpatDriver::WillBuildModel(const CParserContext& aParserContext, + nsITokenizer* aTokenizer, + nsIContentSink* aSink) +{ + mSink = do_QueryInterface(aSink); + if (!mSink) { + NS_ERROR("nsExpatDriver didn't get an nsIExpatSink"); + // Make sure future calls to us bail out as needed + mInternalState = NS_ERROR_UNEXPECTED; + return mInternalState; + } + + mOriginalSink = aSink; + + static const XML_Memory_Handling_Suite memsuite = + { + (void *(*)(size_t))PR_Malloc, + (void *(*)(void *, size_t))PR_Realloc, + PR_Free + }; + + static const char16_t kExpatSeparator[] = { kExpatSeparatorChar, '\0' }; + + mExpatParser = XML_ParserCreate_MM(kUTF16, &memsuite, kExpatSeparator); + NS_ENSURE_TRUE(mExpatParser, NS_ERROR_FAILURE); + + XML_SetReturnNSTriplet(mExpatParser, XML_TRUE); + +#ifdef XML_DTD + XML_SetParamEntityParsing(mExpatParser, XML_PARAM_ENTITY_PARSING_ALWAYS); +#endif + + mURISpec = aParserContext.mScanner->GetFilename(); + + XML_SetBase(mExpatParser, mURISpec.get()); + + nsCOMPtr<nsIDocument> doc =
do_QueryInterface(mOriginalSink->GetTarget()); + if (doc) { + nsCOMPtr<nsPIDOMWindowOuter> win = doc->GetWindow(); + nsCOMPtr<nsPIDOMWindowInner> inner; + if (win) { + inner = win->GetCurrentInnerWindow(); + } else { + bool aHasHadScriptHandlingObject; + nsIScriptGlobalObject *global = + doc->GetScriptHandlingObject(aHasHadScriptHandlingObject); + if (global) { + inner = do_QueryInterface(global); + } + } + if (inner) { + mInnerWindowID = inner->WindowID(); + } + } + + // Set up the callbacks + XML_SetXmlDeclHandler(mExpatParser, Driver_HandleXMLDeclaration); + XML_SetElementHandler(mExpatParser, Driver_HandleStartElement, + Driver_HandleEndElement); + XML_SetCharacterDataHandler(mExpatParser, Driver_HandleCharacterData); + XML_SetProcessingInstructionHandler(mExpatParser, + Driver_HandleProcessingInstruction); + XML_SetDefaultHandlerExpand(mExpatParser, Driver_HandleDefault); + XML_SetExternalEntityRefHandler(mExpatParser, + (XML_ExternalEntityRefHandler) + Driver_HandleExternalEntityRef); + XML_SetExternalEntityRefHandlerArg(mExpatParser, this); + XML_SetCommentHandler(mExpatParser, Driver_HandleComment); + XML_SetCdataSectionHandler(mExpatParser, Driver_HandleStartCdataSection, + Driver_HandleEndCdataSection); + + // NOTE(review): this call comes after the XML_PARAM_ENTITY_PARSING_ALWAYS + // call made earlier under #ifdef XML_DTD and presumably replaces that + // setting -- confirm the earlier call is still wanted. + XML_SetParamEntityParsing(mExpatParser, + XML_PARAM_ENTITY_PARSING_UNLESS_STANDALONE); + XML_SetDoctypeDeclHandler(mExpatParser, Driver_HandleStartDoctypeDecl, + Driver_HandleEndDoctypeDecl); + + // If the sink is an nsIExtendedExpatSink, + // register some additional handlers. + mExtendedSink = do_QueryInterface(mSink); + if (mExtendedSink) { + XML_SetNamespaceDeclHandler(mExpatParser, + Driver_HandleStartNamespaceDecl, + Driver_HandleEndNamespaceDecl); + XML_SetUnparsedEntityDeclHandler(mExpatParser, + Driver_HandleUnparsedEntityDecl); + XML_SetNotationDeclHandler(mExpatParser, + Driver_HandleNotationDecl); + } + + // Set up the user data.
+ XML_SetUserData(mExpatParser, this); + + return mInternalState; +} + +NS_IMETHODIMP +nsExpatDriver::BuildModel(nsITokenizer* aTokenizer, nsIContentSink* aSink) +{ + return mInternalState; +} + +NS_IMETHODIMP +nsExpatDriver::DidBuildModel(nsresult anErrorCode) +{ + mOriginalSink = nullptr; + mSink = nullptr; + mExtendedSink = nullptr; + return NS_OK; +} + +NS_IMETHODIMP +nsExpatDriver::WillTokenize(bool aIsFinalChunk) +{ + mIsFinalChunk = aIsFinalChunk; + return NS_OK; +} + +NS_IMETHODIMP_(void) +nsExpatDriver::Terminate() +{ + // XXX - not sure what happens to the unparsed data. + if (mExpatParser) { + XML_StopParser(mExpatParser, XML_FALSE); + } + mInternalState = NS_ERROR_HTMLPARSER_STOPPARSING; +} + +NS_IMETHODIMP_(int32_t) +nsExpatDriver::GetType() +{ + return NS_IPARSER_FLAG_XML; +} + +NS_IMETHODIMP_(nsDTDMode) +nsExpatDriver::GetMode() const +{ + return eDTDMode_full_standards; +} + +/*************************** Unused methods **********************************/ + +NS_IMETHODIMP_(bool) +nsExpatDriver::IsContainer(int32_t aTag) const +{ + return true; +} + +NS_IMETHODIMP_(bool) +nsExpatDriver::CanContain(int32_t aParent,int32_t aChild) const +{ + return true; +} + +// Folds the result of a sink call (aState) into mInternalState. On failure +// Expat is stopped, or only suspended when the resulting state is +// NS_ERROR_HTMLPARSER_BLOCK or NS_ERROR_HTMLPARSER_INTERRUPTED. +void +nsExpatDriver::MaybeStopParser(nsresult aState) +{ + if (NS_FAILED(aState)) { + // If we had a failure we want to override NS_ERROR_HTMLPARSER_INTERRUPTED + // and we want to override NS_ERROR_HTMLPARSER_BLOCK but not with + // NS_ERROR_HTMLPARSER_INTERRUPTED. + if (NS_SUCCEEDED(mInternalState) || + mInternalState == NS_ERROR_HTMLPARSER_INTERRUPTED || + (mInternalState == NS_ERROR_HTMLPARSER_BLOCK && + aState != NS_ERROR_HTMLPARSER_INTERRUPTED)) { + mInternalState = (aState == NS_ERROR_HTMLPARSER_INTERRUPTED || + aState == NS_ERROR_HTMLPARSER_BLOCK) ? + aState : + NS_ERROR_HTMLPARSER_STOPPARSING; + } + + // If we get an error then we need to stop Expat (by calling XML_StopParser + // with false as the last argument).
If the parser should be blocked or + // interrupted we need to pause Expat (by calling XML_StopParser with + // true as the last argument). + XML_StopParser(mExpatParser, BlockedOrInterrupted()); + } + else if (NS_SUCCEEDED(mInternalState)) { + // Only clobber mInternalState with the success code if we didn't block or + // interrupt before. + mInternalState = aState; + } +} diff --git a/components/htmlparser/src/nsExpatDriver.h b/components/htmlparser/src/nsExpatDriver.h new file mode 100644 index 000000000..988409cfe --- /dev/null +++ b/components/htmlparser/src/nsExpatDriver.h @@ -0,0 +1,145 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef NS_EXPAT_DRIVER__ +#define NS_EXPAT_DRIVER__ + +#include "expat_config.h" +#include "expat.h" +#include "nsCOMPtr.h" +#include "nsString.h" +#include "nsIDTD.h" +#include "nsITokenizer.h" +#include "nsIInputStream.h" +#include "nsIParser.h" +#include "nsCycleCollectionParticipant.h" + +class nsIExpatSink; +class nsIExtendedExpatSink; +struct nsCatalogData; + +// Expat-based XML parser driver. It acts as both the nsIDTD and the +// nsITokenizer of a parse and relays Expat's callbacks to an nsIExpatSink +// (and, when the sink supports it, an nsIExtendedExpatSink). +class nsExpatDriver : public nsIDTD, + public nsITokenizer +{ + virtual ~nsExpatDriver(); + +public: + NS_DECL_CYCLE_COLLECTING_ISUPPORTS + NS_DECL_NSIDTD + NS_DECL_NSITOKENIZER + NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsExpatDriver, nsIDTD) + + nsExpatDriver(); + + int HandleExternalEntityRef(const char16_t *aOpenEntityNames, + const char16_t *aBase, + const char16_t *aSystemId, + const char16_t *aPublicId); + void HandleStartElement(const char16_t *aName, const char16_t **aAtts); + nsresult HandleEndElement(const char16_t *aName); + nsresult HandleCharacterData(const char16_t *aCData, const uint32_t aLength); + nsresult HandleComment(const char16_t *aName); + nsresult HandleProcessingInstruction(const char16_t *aTarget,
+ const char16_t *aData); + nsresult HandleXMLDeclaration(const char16_t *aVersion, + const char16_t *aEncoding, + int32_t aStandalone); + nsresult HandleDefault(const char16_t *aData, const uint32_t aLength); + nsresult HandleStartCdataSection(); + nsresult HandleEndCdataSection(); + nsresult HandleStartDoctypeDecl(const char16_t* aDoctypeName, + const char16_t* aSysid, + const char16_t* aPubid, + bool aHasInternalSubset); + nsresult HandleEndDoctypeDecl(); + nsresult HandleStartNamespaceDecl(const char16_t* aPrefix, + const char16_t* aUri); + nsresult HandleEndNamespaceDecl(const char16_t* aPrefix); + nsresult HandleNotationDecl(const char16_t* aNotationName, + const char16_t* aBase, + const char16_t* aSysid, + const char16_t* aPubid); + nsresult HandleUnparsedEntityDecl(const char16_t* aEntityName, + const char16_t* aBase, + const char16_t* aSysid, + const char16_t* aPubid, + const char16_t* aNotationName); + +private: + // Load up an external stream to get external entity information + nsresult OpenInputStreamFromExternalDTD(const char16_t* aFPIStr, + const char16_t* aURLStr, + const char16_t* aBaseURL, + nsIInputStream** aStream, + nsAString& aAbsURL); + + /** + * Pass a buffer to Expat. If Expat is blocked aBuffer should be null and + * aLength should be 0. The result of the call will be stored in + * mInternalState. Expat will parse as much of the buffer as it can and store + * the rest in its internal buffer. + * + * @param aBuffer the buffer to pass to Expat. May be null. + * @param aLength the length of the buffer to pass to Expat (in number of + * char16_t's). Must be 0 if aBuffer is null and > 0 if + * aBuffer is not null. + * @param aIsFinal whether there will definitely not be any more new buffers + * passed in to ParseBuffer + * @param aConsumed [out] the number of char16_t's that Expat consumed. This + * doesn't include the char16_t's that Expat stored in + * its buffer but didn't parse yet.
+ */ + void ParseBuffer(const char16_t *aBuffer, uint32_t aLength, bool aIsFinal, + uint32_t *aConsumed); + nsresult HandleError(); + + void MaybeStopParser(nsresult aState); + + bool BlockedOrInterrupted() + { + return mInternalState == NS_ERROR_HTMLPARSER_BLOCK || + mInternalState == NS_ERROR_HTMLPARSER_INTERRUPTED; + } + + XML_Parser mExpatParser; + nsString mLastLine; + nsString mCDataText; + // Various parts of a doctype + nsString mDoctypeName; + nsString mSystemID; + nsString mPublicID; + nsString mInternalSubset; + bool mInCData; + bool mInInternalSubset; + bool mInExternalDTD; + bool mMadeFinalCallToExpat; + + // Whether we're sure that we won't be getting more buffers to parse from + // Necko + bool mIsFinalChunk; + + // The depth of nested parsing we are currently at + uint16_t mTagDepth; + + nsresult mInternalState; + + // The length of the data in Expat's buffer (in number of char16_t's). + uint32_t mExpatBuffered; + + // These sinks all refer the same conceptual object. mOriginalSink is + // identical with the nsIContentSink* passed to WillBuildModel, and exists + // only to avoid QI-ing back to nsIContentSink*. + nsCOMPtr<nsIContentSink> mOriginalSink; + nsCOMPtr<nsIExpatSink> mSink; + nsCOMPtr<nsIExtendedExpatSink> mExtendedSink; + + const nsCatalogData* mCatalogData; // weak + nsString mURISpec; + + // Used for error reporting. + uint64_t mInnerWindowID; +}; + +#endif diff --git a/components/htmlparser/src/nsHTMLEntities.cpp b/components/htmlparser/src/nsHTMLEntities.cpp new file mode 100644 index 000000000..e8365c21f --- /dev/null +++ b/components/htmlparser/src/nsHTMLEntities.cpp @@ -0,0 +1,205 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "mozilla/ArrayUtils.h" + +#include "nsHTMLEntities.h" + +#include "nsString.h" +#include "nsCRT.h" +#include "PLDHashTable.h" + +using namespace mozilla; + +struct EntityNode { + const char* mStr; // never owns buffer + int32_t mUnicode; +}; + +struct EntityNodeEntry : public PLDHashEntryHdr +{ + const EntityNode* node; +}; + +static bool matchNodeString(const PLDHashEntryHdr* aHdr, const void* key) +{ + const EntityNodeEntry* entry = static_cast<const EntityNodeEntry*>(aHdr); + const char* str = static_cast<const char*>(key); + return (nsCRT::strcmp(entry->node->mStr, str) == 0); +} + +static bool matchNodeUnicode(const PLDHashEntryHdr* aHdr, const void* key) +{ + const EntityNodeEntry* entry = static_cast<const EntityNodeEntry*>(aHdr); + const int32_t ucode = NS_PTR_TO_INT32(key); + return (entry->node->mUnicode == ucode); +} + +static PLDHashNumber hashUnicodeValue(const void* key) +{ + // key is actually the unicode value + return PLDHashNumber(NS_PTR_TO_INT32(key)); +} + + +static const PLDHashTableOps EntityToUnicodeOps = { + PLDHashTable::HashStringKey, + matchNodeString, + PLDHashTable::MoveEntryStub, + PLDHashTable::ClearEntryStub, + nullptr, +}; + +static const PLDHashTableOps UnicodeToEntityOps = { + hashUnicodeValue, + matchNodeUnicode, + PLDHashTable::MoveEntryStub, + PLDHashTable::ClearEntryStub, + nullptr, +}; + +static PLDHashTable* gEntityToUnicode; +static PLDHashTable* gUnicodeToEntity; +static nsrefcnt gTableRefCnt = 0; + +#define HTML_ENTITY(_name, _value) { #_name, _value }, +static const EntityNode gEntityArray[] = { +#include "nsHTMLEntityList.h" +}; +#undef HTML_ENTITY + +#define NS_HTML_ENTITY_COUNT ((int32_t)ArrayLength(gEntityArray)) + +nsresult +nsHTMLEntities::AddRefTable(void) +{ + if (!gTableRefCnt) { + gEntityToUnicode = new PLDHashTable(&EntityToUnicodeOps, + sizeof(EntityNodeEntry), + NS_HTML_ENTITY_COUNT); + gUnicodeToEntity = new PLDHashTable(&UnicodeToEntityOps, + sizeof(EntityNodeEntry), + NS_HTML_ENTITY_COUNT); 
+ for (const EntityNode *node = gEntityArray, + *node_end = ArrayEnd(gEntityArray); + node < node_end; ++node) { + + // add to Entity->Unicode table + auto entry = static_cast<EntityNodeEntry*> + (gEntityToUnicode->Add(node->mStr, fallible)); + NS_ASSERTION(entry, "Error adding an entry"); + // Prefer earlier entries when we have duplication. + if (!entry->node) + entry->node = node; + + // add to Unicode->Entity table + entry = static_cast<EntityNodeEntry*> + (gUnicodeToEntity->Add(NS_INT32_TO_PTR(node->mUnicode), + fallible)); + NS_ASSERTION(entry, "Error adding an entry"); + // Prefer earlier entries when we have duplication. + if (!entry->node) + entry->node = node; + } +#ifdef DEBUG + gUnicodeToEntity->MarkImmutable(); + gEntityToUnicode->MarkImmutable(); +#endif + } + ++gTableRefCnt; + return NS_OK; +} + +void +nsHTMLEntities::ReleaseTable(void) +{ + if (--gTableRefCnt != 0) { + return; + } + + delete gEntityToUnicode; + delete gUnicodeToEntity; + gEntityToUnicode = nullptr; + gUnicodeToEntity = nullptr; +} + +int32_t +nsHTMLEntities::EntityToUnicode(const nsCString& aEntity) +{ + NS_ASSERTION(gEntityToUnicode, "no lookup table, needs addref"); + if (!gEntityToUnicode) { + return -1; + } + + //this little piece of code exists because entities may or may not have the terminating ';'. + //if we see it, strip if off for this test... + + if(';'==aEntity.Last()) { + nsAutoCString temp(aEntity); + temp.Truncate(aEntity.Length()-1); + return EntityToUnicode(temp); + } + + auto entry = + static_cast<EntityNodeEntry*>(gEntityToUnicode->Search(aEntity.get())); + + return entry ? 
entry->node->mUnicode : -1; +} + + +int32_t +nsHTMLEntities::EntityToUnicode(const nsAString& aEntity) { + nsAutoCString theEntity; theEntity.AssignWithConversion(aEntity); + if(';'==theEntity.Last()) { + theEntity.Truncate(theEntity.Length()-1); + } + + return EntityToUnicode(theEntity); +} + + +const char* +nsHTMLEntities::UnicodeToEntity(int32_t aUnicode) +{ + NS_ASSERTION(gUnicodeToEntity, "no lookup table, needs addref"); + auto entry = + static_cast<EntityNodeEntry*> + (gUnicodeToEntity->Search(NS_INT32_TO_PTR(aUnicode))); + + return entry ? entry->node->mStr : nullptr; +} + +#ifdef DEBUG +#include <stdio.h> + +class nsTestEntityTable { +public: + nsTestEntityTable() { + int32_t value; + nsHTMLEntities::AddRefTable(); + + // Make sure we can find everything we are supposed to + for (int i = 0; i < NS_HTML_ENTITY_COUNT; ++i) { + nsAutoString entity; entity.AssignWithConversion(gEntityArray[i].mStr); + + value = nsHTMLEntities::EntityToUnicode(entity); + NS_ASSERTION(value != -1, "can't find entity"); + NS_ASSERTION(value == gEntityArray[i].mUnicode, "bad unicode value"); + + entity.AssignWithConversion(nsHTMLEntities::UnicodeToEntity(value)); + NS_ASSERTION(entity.EqualsASCII(gEntityArray[i].mStr), "bad entity name"); + } + + // Make sure we don't find things that aren't there + value = nsHTMLEntities::EntityToUnicode(nsAutoCString("@")); + NS_ASSERTION(value == -1, "found @"); + value = nsHTMLEntities::EntityToUnicode(nsAutoCString("zzzzz")); + NS_ASSERTION(value == -1, "found zzzzz"); + nsHTMLEntities::ReleaseTable(); + } +}; +//nsTestEntityTable validateEntityTable; +#endif + diff --git a/components/htmlparser/src/nsHTMLEntities.h b/components/htmlparser/src/nsHTMLEntities.h new file mode 100644 index 000000000..f38856bfa --- /dev/null +++ b/components/htmlparser/src/nsHTMLEntities.h @@ -0,0 +1,35 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * 
License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+#ifndef nsHTMLEntities_h___
+#define nsHTMLEntities_h___
+
+#include "nsString.h"
+
+/**
+ * Static-only lookup between HTML entity names and Unicode code points.
+ * The lookup tables are reference counted: call AddRefTable() before any
+ * lookup and ReleaseTable() when done.
+ */
+class nsHTMLEntities {
+public:
+
+  static nsresult AddRefTable(void);
+  static void ReleaseTable(void);
+
+/**
+ * Translate an entity string into its unicode value. This call
+ * returns -1 if the entity cannot be mapped. The string passed in
+ * must NOT have the leading "&". It should not have the trailing ";"
+ * either, although the implementation strips a trailing ";" before
+ * looking the name up.
+ */
+  static int32_t EntityToUnicode(const nsAString& aEntity);
+  static int32_t EntityToUnicode(const nsCString& aEntity);
+
+/**
+ * Translate a unicode value into an entity string. This call
+ * returns null if the entity cannot be mapped.
+ * Note that the string returned DOES NOT have the leading "&" nor
+ * the trailing ";" in it.
+ */
+  static const char* UnicodeToEntity(int32_t aUnicode);
+};
+
+
+#endif /* nsHTMLEntities_h___ */
diff --git a/components/htmlparser/src/nsHTMLEntityList.h b/components/htmlparser/src/nsHTMLEntityList.h
new file mode 100644
index 000000000..fa05382bf
--- /dev/null
+++ b/components/htmlparser/src/nsHTMLEntityList.h
@@ -0,0 +1,303 @@
+/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+/******
+
+  This file contains the list of all HTML entities
+  See nsHTMLEntities.h for access to the enum values for entities
+
+  It is designed to be used as inline input to nsHTMLEntities.cpp *only*
+  through the magic of C preprocessing.
+ + All entries must be enclosed in the macro HTML_ENTITY which will have cruel + and unusual things done to it + + It is recommended (but not strictly necessary) to keep all entries + in alphabetical order + + The first argument to HTML_ENTITY is the string value of the entity + The second argument to HTML_ENTITY is the unicode value of the entity + + ******/ + +// ISO 8859-1 entities. +// See the HTML4.0 spec for this list in its DTD form +HTML_ENTITY(nbsp, 160) +HTML_ENTITY(iexcl, 161) +HTML_ENTITY(cent, 162) +HTML_ENTITY(pound, 163) +HTML_ENTITY(curren, 164) +HTML_ENTITY(yen, 165) +HTML_ENTITY(brvbar, 166) +HTML_ENTITY(sect, 167) +HTML_ENTITY(uml, 168) +HTML_ENTITY(copy, 169) +HTML_ENTITY(ordf, 170) +HTML_ENTITY(laquo, 171) +HTML_ENTITY(not, 172) +HTML_ENTITY(shy, 173) +HTML_ENTITY(reg, 174) +HTML_ENTITY(macr, 175) +HTML_ENTITY(deg, 176) +HTML_ENTITY(plusmn, 177) +HTML_ENTITY(sup2, 178) +HTML_ENTITY(sup3, 179) +HTML_ENTITY(acute, 180) +HTML_ENTITY(micro, 181) +HTML_ENTITY(para, 182) +HTML_ENTITY(middot, 183) +HTML_ENTITY(cedil, 184) +HTML_ENTITY(sup1, 185) +HTML_ENTITY(ordm, 186) +HTML_ENTITY(raquo, 187) +HTML_ENTITY(frac14, 188) +HTML_ENTITY(frac12, 189) +HTML_ENTITY(frac34, 190) +HTML_ENTITY(iquest, 191) +HTML_ENTITY(Agrave, 192) +HTML_ENTITY(Aacute, 193) +HTML_ENTITY(Acirc, 194) +HTML_ENTITY(Atilde, 195) +HTML_ENTITY(Auml, 196) +HTML_ENTITY(Aring, 197) +HTML_ENTITY(AElig, 198) +HTML_ENTITY(Ccedil, 199) +HTML_ENTITY(Egrave, 200) +HTML_ENTITY(Eacute, 201) +HTML_ENTITY(Ecirc, 202) +HTML_ENTITY(Euml, 203) +HTML_ENTITY(Igrave, 204) +HTML_ENTITY(Iacute, 205) +HTML_ENTITY(Icirc, 206) +HTML_ENTITY(Iuml, 207) +HTML_ENTITY(ETH, 208) +HTML_ENTITY(Ntilde, 209) +HTML_ENTITY(Ograve, 210) +HTML_ENTITY(Oacute, 211) +HTML_ENTITY(Ocirc, 212) +HTML_ENTITY(Otilde, 213) +HTML_ENTITY(Ouml, 214) +HTML_ENTITY(times, 215) +HTML_ENTITY(Oslash, 216) +HTML_ENTITY(Ugrave, 217) +HTML_ENTITY(Uacute, 218) +HTML_ENTITY(Ucirc, 219) +HTML_ENTITY(Uuml, 220) +HTML_ENTITY(Yacute, 221) 
+HTML_ENTITY(THORN, 222) +HTML_ENTITY(szlig, 223) +HTML_ENTITY(agrave, 224) +HTML_ENTITY(aacute, 225) +HTML_ENTITY(acirc, 226) +HTML_ENTITY(atilde, 227) +HTML_ENTITY(auml, 228) +HTML_ENTITY(aring, 229) +HTML_ENTITY(aelig, 230) +HTML_ENTITY(ccedil, 231) +HTML_ENTITY(egrave, 232) +HTML_ENTITY(eacute, 233) +HTML_ENTITY(ecirc, 234) +HTML_ENTITY(euml, 235) +HTML_ENTITY(igrave, 236) +HTML_ENTITY(iacute, 237) +HTML_ENTITY(icirc, 238) +HTML_ENTITY(iuml, 239) +HTML_ENTITY(eth, 240) +HTML_ENTITY(ntilde, 241) +HTML_ENTITY(ograve, 242) +HTML_ENTITY(oacute, 243) +HTML_ENTITY(ocirc, 244) +HTML_ENTITY(otilde, 245) +HTML_ENTITY(ouml, 246) +HTML_ENTITY(divide, 247) +HTML_ENTITY(oslash, 248) +HTML_ENTITY(ugrave, 249) +HTML_ENTITY(uacute, 250) +HTML_ENTITY(ucirc, 251) +HTML_ENTITY(uuml, 252) +HTML_ENTITY(yacute, 253) +HTML_ENTITY(thorn, 254) +HTML_ENTITY(yuml, 255) + +// Symbols, mathematical symbols and Greek letters +// See the HTML4.0 spec for this list in its DTD form +HTML_ENTITY(fnof, 402) +HTML_ENTITY(Alpha, 913) +HTML_ENTITY(Beta, 914) +HTML_ENTITY(Gamma, 915) +HTML_ENTITY(Delta, 916) +HTML_ENTITY(Epsilon, 917) +HTML_ENTITY(Zeta, 918) +HTML_ENTITY(Eta, 919) +HTML_ENTITY(Theta, 920) +HTML_ENTITY(Iota, 921) +HTML_ENTITY(Kappa, 922) +HTML_ENTITY(Lambda, 923) +HTML_ENTITY(Mu, 924) +HTML_ENTITY(Nu, 925) +HTML_ENTITY(Xi, 926) +HTML_ENTITY(Omicron, 927) +HTML_ENTITY(Pi, 928) +HTML_ENTITY(Rho, 929) +HTML_ENTITY(Sigma, 931) +HTML_ENTITY(Tau, 932) +HTML_ENTITY(Upsilon, 933) +HTML_ENTITY(Phi, 934) +HTML_ENTITY(Chi, 935) +HTML_ENTITY(Psi, 936) +HTML_ENTITY(Omega, 937) +HTML_ENTITY(alpha, 945) +HTML_ENTITY(beta, 946) +HTML_ENTITY(gamma, 947) +HTML_ENTITY(delta, 948) +HTML_ENTITY(epsilon, 949) +HTML_ENTITY(zeta, 950) +HTML_ENTITY(eta, 951) +HTML_ENTITY(theta, 952) +HTML_ENTITY(iota, 953) +HTML_ENTITY(kappa, 954) +HTML_ENTITY(lambda, 955) +HTML_ENTITY(mu, 956) +HTML_ENTITY(nu, 957) +HTML_ENTITY(xi, 958) +HTML_ENTITY(omicron, 959) +HTML_ENTITY(pi, 960) +HTML_ENTITY(rho, 961) 
+HTML_ENTITY(sigmaf, 962) +HTML_ENTITY(sigma, 963) +HTML_ENTITY(tau, 964) +HTML_ENTITY(upsilon, 965) +HTML_ENTITY(phi, 966) +HTML_ENTITY(chi, 967) +HTML_ENTITY(psi, 968) +HTML_ENTITY(omega, 969) +HTML_ENTITY(thetasym, 977) +HTML_ENTITY(upsih, 978) +HTML_ENTITY(piv, 982) +HTML_ENTITY(bull, 8226) +HTML_ENTITY(hellip, 8230) +HTML_ENTITY(prime, 8242) +HTML_ENTITY(Prime, 8243) +HTML_ENTITY(oline, 8254) +HTML_ENTITY(frasl, 8260) +HTML_ENTITY(weierp, 8472) +HTML_ENTITY(image, 8465) +HTML_ENTITY(real, 8476) +HTML_ENTITY(trade, 8482) +HTML_ENTITY(alefsym, 8501) +HTML_ENTITY(larr, 8592) +HTML_ENTITY(uarr, 8593) +HTML_ENTITY(rarr, 8594) +HTML_ENTITY(darr, 8595) +HTML_ENTITY(harr, 8596) +HTML_ENTITY(crarr, 8629) +HTML_ENTITY(lArr, 8656) +HTML_ENTITY(uArr, 8657) +HTML_ENTITY(rArr, 8658) +HTML_ENTITY(dArr, 8659) +HTML_ENTITY(hArr, 8660) +HTML_ENTITY(forall, 8704) +HTML_ENTITY(part, 8706) +HTML_ENTITY(exist, 8707) +HTML_ENTITY(empty, 8709) +HTML_ENTITY(nabla, 8711) +HTML_ENTITY(isin, 8712) +HTML_ENTITY(notin, 8713) +HTML_ENTITY(ni, 8715) +HTML_ENTITY(prod, 8719) +HTML_ENTITY(sum, 8721) +HTML_ENTITY(minus, 8722) +HTML_ENTITY(lowast, 8727) +HTML_ENTITY(radic, 8730) +HTML_ENTITY(prop, 8733) +HTML_ENTITY(infin, 8734) +HTML_ENTITY(ang, 8736) +HTML_ENTITY(and, 8743) +HTML_ENTITY(or, 8744) +HTML_ENTITY(cap, 8745) +HTML_ENTITY(cup, 8746) +HTML_ENTITY(int, 8747) +HTML_ENTITY(there4, 8756) +HTML_ENTITY(sim, 8764) +HTML_ENTITY(cong, 8773) +HTML_ENTITY(asymp, 8776) +HTML_ENTITY(ne, 8800) +HTML_ENTITY(equiv, 8801) +HTML_ENTITY(le, 8804) +HTML_ENTITY(ge, 8805) +HTML_ENTITY(sub, 8834) +HTML_ENTITY(sup, 8835) +HTML_ENTITY(nsub, 8836) +HTML_ENTITY(sube, 8838) +HTML_ENTITY(supe, 8839) +HTML_ENTITY(oplus, 8853) +HTML_ENTITY(otimes, 8855) +HTML_ENTITY(perp, 8869) +HTML_ENTITY(sdot, 8901) +HTML_ENTITY(lceil, 8968) +HTML_ENTITY(rceil, 8969) +HTML_ENTITY(lfloor, 8970) +HTML_ENTITY(rfloor, 8971) +// Bug 603716: expansions of ⟨ and ⟩ have been modified in HTML5. 
+// See http://www.w3.org/2003/entities/2007/htmlmathml-f.ent +HTML_ENTITY(lang, 0x27E8) +HTML_ENTITY(rang, 0x27E9) +HTML_ENTITY(loz, 9674) +HTML_ENTITY(spades, 9824) +HTML_ENTITY(clubs, 9827) +HTML_ENTITY(hearts, 9829) +HTML_ENTITY(diams, 9830) + +// Markup-significant and internationalization characters +// See the HTML4.0 spec for this list in its DTD form +HTML_ENTITY(quot, 34) +HTML_ENTITY(amp, 38) +HTML_ENTITY(lt, 60) +HTML_ENTITY(gt, 62) +HTML_ENTITY(OElig, 338) +HTML_ENTITY(oelig, 339) +HTML_ENTITY(Scaron, 352) +HTML_ENTITY(scaron, 353) +HTML_ENTITY(Yuml, 376) +HTML_ENTITY(circ, 710) +HTML_ENTITY(tilde, 732) +HTML_ENTITY(ensp, 8194) +HTML_ENTITY(emsp, 8195) +HTML_ENTITY(thinsp, 8201) +HTML_ENTITY(zwnj, 8204) +HTML_ENTITY(zwj, 8205) +HTML_ENTITY(lrm, 8206) +HTML_ENTITY(rlm, 8207) +HTML_ENTITY(ndash, 8211) +HTML_ENTITY(mdash, 8212) +HTML_ENTITY(lsquo, 8216) +HTML_ENTITY(rsquo, 8217) +HTML_ENTITY(sbquo, 8218) +HTML_ENTITY(ldquo, 8220) +HTML_ENTITY(rdquo, 8221) +HTML_ENTITY(bdquo, 8222) +HTML_ENTITY(dagger, 8224) +HTML_ENTITY(Dagger, 8225) +HTML_ENTITY(permil, 8240) +HTML_ENTITY(lsaquo, 8249) +HTML_ENTITY(rsaquo, 8250) +HTML_ENTITY(euro, 8364) + +// Navigator entity extensions +// This block of entities needs to be at the bottom of the list since it +// contains duplicate Unicode codepoints. The codepoint to entity name +// mapping (used by Composer) must ignore them, which occurs only +// because they are listed later. + +// apos is from XML +HTML_ENTITY(apos, 39) +// The capitalized versions are required to handle non-standard input. 
+HTML_ENTITY(AMP, 38) +HTML_ENTITY(COPY, 169) +HTML_ENTITY(GT, 62) +HTML_ENTITY(LT, 60) +HTML_ENTITY(QUOT, 34) +HTML_ENTITY(REG, 174) + diff --git a/components/htmlparser/src/nsHTMLTagList.h b/components/htmlparser/src/nsHTMLTagList.h new file mode 100644 index 000000000..4cb2a61e0 --- /dev/null +++ b/components/htmlparser/src/nsHTMLTagList.h @@ -0,0 +1,197 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +// IWYU pragma: private, include "nsHTMLTags.h" + +/****** + + This file contains the list of all HTML tags. + See nsHTMLTags.h for access to the enum values for tags. + + It is designed to be used as input to various places that will define the + HTML_TAG macro in useful ways through the magic of C preprocessing. + Additionally, it is consumed by the self-regeneration code in + ElementName.java from which nsHtml5ElementName.cpp/h is translated. + See parser/html/java/README.txt. + + If you edit this list, you need to re-run ElementName.java + self-regeneration and the HTML parser Java to C++ translation. + + All entries must be enclosed in the macro HTML_TAG which will have cruel + and unusual things done to it. + + It is recommended (but not strictly necessary) to keep all entries + in alphabetical order. + + The first argument to HTML_TAG is the tag name. The second argument is the + "creator" method of the form NS_New$TAGNAMEElement, that will be used by + nsHTMLContentSink.cpp to create a content object for a tag of that + type. Use NOTUSED, if the particular tag has a non-standard creator. + The third argument is the interface name specified for this element + in the HTML specification. It can be empty if the relevant interface name + is "HTMLElement". 
+ + The HTML_OTHER macro is for values in the nsHTMLTag enum that are + not strictly tags. + + Entries *must* use only lowercase characters. + + Don't forget to update /editor/libeditor/HTMLEditUtils.cpp as well. + + ** Break these invariants and bad things will happen. ** + + ******/ +#define HTML_HTMLELEMENT_TAG(_tag) HTML_TAG(_tag, , ) + +HTML_TAG(a, Anchor, Anchor) +HTML_HTMLELEMENT_TAG(abbr) +HTML_HTMLELEMENT_TAG(acronym) +HTML_HTMLELEMENT_TAG(address) +HTML_TAG(applet, SharedObject, Applet) +HTML_TAG(area, Area, Area) +HTML_HTMLELEMENT_TAG(article) +HTML_HTMLELEMENT_TAG(aside) +HTML_TAG(audio, Audio, Audio) +HTML_HTMLELEMENT_TAG(b) +HTML_TAG(base, Shared, Base) +HTML_HTMLELEMENT_TAG(basefont) +HTML_HTMLELEMENT_TAG(bdo) +HTML_TAG(bgsound, Unknown, Unknown) +HTML_HTMLELEMENT_TAG(big) +HTML_TAG(blockquote, Shared, Quote) +HTML_TAG(body, Body, Body) +HTML_TAG(br, BR, BR) +HTML_TAG(button, Button, Button) +HTML_TAG(canvas, Canvas, Canvas) +HTML_TAG(caption, TableCaption, TableCaption) +HTML_HTMLELEMENT_TAG(center) +HTML_HTMLELEMENT_TAG(cite) +HTML_HTMLELEMENT_TAG(code) +HTML_TAG(col, TableCol, TableCol) +HTML_TAG(colgroup, TableCol, TableCol) +HTML_TAG(data, Data, Data) +HTML_TAG(datalist, DataList, DataList) +HTML_HTMLELEMENT_TAG(dd) +HTML_TAG(del, Mod, Mod) +HTML_TAG(details, Details, Details) +HTML_HTMLELEMENT_TAG(dfn) +HTML_TAG(dialog, Dialog, Dialog) +HTML_TAG(dir, Shared, Directory) +HTML_TAG(div, Div, Div) +HTML_TAG(dl, SharedList, DList) +HTML_HTMLELEMENT_TAG(dt) +HTML_HTMLELEMENT_TAG(em) +HTML_TAG(embed, SharedObject, Embed) +HTML_TAG(fieldset, FieldSet, FieldSet) +HTML_HTMLELEMENT_TAG(figcaption) +HTML_HTMLELEMENT_TAG(figure) +HTML_TAG(font, Font, Font) +HTML_HTMLELEMENT_TAG(footer) +HTML_TAG(form, Form, Form) +HTML_TAG(frame, Frame, Frame) +HTML_TAG(frameset, FrameSet, FrameSet) +HTML_TAG(h1, Heading, Heading) +HTML_TAG(h2, Heading, Heading) +HTML_TAG(h3, Heading, Heading) +HTML_TAG(h4, Heading, Heading) +HTML_TAG(h5, Heading, Heading) +HTML_TAG(h6, 
Heading, Heading) +HTML_TAG(head, Shared, Head) +HTML_HTMLELEMENT_TAG(header) +HTML_HTMLELEMENT_TAG(hgroup) +HTML_TAG(hr, HR, HR) +HTML_TAG(html, Shared, Html) +HTML_HTMLELEMENT_TAG(i) +HTML_TAG(iframe, IFrame, IFrame) +HTML_HTMLELEMENT_TAG(image) +HTML_TAG(img, Image, Image) +HTML_TAG(input, Input, Input) +HTML_TAG(ins, Mod, Mod) +HTML_HTMLELEMENT_TAG(kbd) +HTML_TAG(keygen, Span, Span) +HTML_TAG(label, Label, Label) +HTML_TAG(legend, Legend, Legend) +HTML_TAG(li, LI, LI) +HTML_TAG(link, Link, Link) +HTML_TAG(listing, Pre, Pre) +HTML_HTMLELEMENT_TAG(main) +HTML_TAG(map, Map, Map) +HTML_HTMLELEMENT_TAG(mark) +HTML_TAG(menu, Menu, Menu) +HTML_TAG(menuitem, MenuItem, MenuItem) +HTML_TAG(meta, Meta, Meta) +HTML_TAG(meter, Meter, Meter) +HTML_TAG(multicol, Unknown, Unknown) +HTML_HTMLELEMENT_TAG(nav) +HTML_HTMLELEMENT_TAG(nobr) +HTML_HTMLELEMENT_TAG(noembed) +HTML_HTMLELEMENT_TAG(noframes) +HTML_HTMLELEMENT_TAG(noscript) +HTML_TAG(object, Object, Object) +HTML_TAG(ol, SharedList, OList) +HTML_TAG(optgroup, OptGroup, OptGroup) +HTML_TAG(option, Option, Option) +HTML_TAG(output, Output, Output) +HTML_TAG(p, Paragraph, Paragraph) +HTML_TAG(param, Shared, Param) +HTML_TAG(picture, Picture, Picture) +HTML_HTMLELEMENT_TAG(plaintext) +HTML_TAG(pre, Pre, Pre) +HTML_TAG(progress, Progress, Progress) +HTML_TAG(q, Shared, Quote) +HTML_HTMLELEMENT_TAG(rb) +HTML_HTMLELEMENT_TAG(rp) +HTML_HTMLELEMENT_TAG(rt) +HTML_HTMLELEMENT_TAG(rtc) +HTML_HTMLELEMENT_TAG(ruby) +HTML_HTMLELEMENT_TAG(s) +HTML_HTMLELEMENT_TAG(samp) +HTML_TAG(script, Script, Script) +HTML_HTMLELEMENT_TAG(section) +HTML_TAG(select, Select, Select) +HTML_HTMLELEMENT_TAG(small) +HTML_TAG(slot, Slot, Slot) +HTML_TAG(source, Source, Source) +HTML_TAG(span, Span, Span) +HTML_HTMLELEMENT_TAG(strike) +HTML_HTMLELEMENT_TAG(strong) +HTML_TAG(style, Style, Style) +HTML_HTMLELEMENT_TAG(sub) +HTML_TAG(summary, Summary, ) +HTML_HTMLELEMENT_TAG(sup) +HTML_TAG(table, Table, Table) +HTML_TAG(tbody, TableSection, TableSection) 
+HTML_TAG(td, TableCell, TableCell) +HTML_TAG(textarea, TextArea, TextArea) +HTML_TAG(tfoot, TableSection, TableSection) +HTML_TAG(th, TableCell, TableCell) +HTML_TAG(thead, TableSection, TableSection) +HTML_TAG(template, Template, Template) +HTML_TAG(time, Time, Time) +HTML_TAG(title, Title, Title) +HTML_TAG(tr, TableRow, TableRow) +HTML_TAG(track, Track, Track) +HTML_HTMLELEMENT_TAG(tt) +HTML_HTMLELEMENT_TAG(u) +HTML_TAG(ul, SharedList, UList) +HTML_HTMLELEMENT_TAG(var) +HTML_TAG(video, Video, Video) +HTML_HTMLELEMENT_TAG(wbr) +HTML_TAG(xmp, Pre, Pre) + + +/* These are not for tags. But they will be included in the nsHTMLTag + enum anyway */ + +HTML_OTHER(text) +HTML_OTHER(whitespace) +HTML_OTHER(newline) +HTML_OTHER(comment) +HTML_OTHER(entity) +HTML_OTHER(doctypeDecl) +HTML_OTHER(markupDecl) +HTML_OTHER(instruction) + +#undef HTML_HTMLELEMENT_TAG diff --git a/components/htmlparser/src/nsHTMLTags.cpp b/components/htmlparser/src/nsHTMLTags.cpp new file mode 100644 index 000000000..681c37489 --- /dev/null +++ b/components/htmlparser/src/nsHTMLTags.cpp @@ -0,0 +1,259 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsHTMLTags.h" +#include "nsCRT.h" +#include "nsReadableUtils.h" +#include "nsString.h" +#include "nsStaticAtom.h" +#include "nsUnicharUtils.h" +#include "mozilla/HashFunctions.h" +#include <algorithm> + +using namespace mozilla; + +// static array of unicode tag names +#define HTML_TAG(_tag, _classname, _interfacename) (u"" #_tag), +#define HTML_OTHER(_tag) +const char16_t* const nsHTMLTags::sTagUnicodeTable[] = { +#include "nsHTMLTagList.h" +}; +#undef HTML_TAG +#undef HTML_OTHER + +// static array of tag atoms +nsIAtom* nsHTMLTags::sTagAtomTable[eHTMLTag_userdefined - 1]; + +int32_t nsHTMLTags::gTableRefCount; +PLHashTable* nsHTMLTags::gTagTable; +PLHashTable* nsHTMLTags::gTagAtomTable; + + +// char16_t* -> id hash +static PLHashNumber +HTMLTagsHashCodeUCPtr(const void *key) +{ + return HashString(static_cast<const char16_t*>(key)); +} + +static int +HTMLTagsKeyCompareUCPtr(const void *key1, const void *key2) +{ + const char16_t *str1 = (const char16_t *)key1; + const char16_t *str2 = (const char16_t *)key2; + + return nsCRT::strcmp(str1, str2) == 0; +} + +// nsIAtom* -> id hash +static PLHashNumber +HTMLTagsHashCodeAtom(const void *key) +{ + return NS_PTR_TO_INT32(key) >> 2; +} + +#define NS_HTMLTAG_NAME_MAX_LENGTH 10 + +// static +void +nsHTMLTags::RegisterAtoms(void) +{ +#define HTML_TAG(_tag, _classname, _interfacename) NS_STATIC_ATOM_BUFFER(Atombuffer_##_tag, #_tag) +#define HTML_OTHER(_tag) +#include "nsHTMLTagList.h" +#undef HTML_TAG +#undef HTML_OTHER + +// static array of tag StaticAtom structs +#define HTML_TAG(_tag, _classname, _interfacename) NS_STATIC_ATOM(Atombuffer_##_tag, &nsHTMLTags::sTagAtomTable[eHTMLTag_##_tag - 1]), +#define HTML_OTHER(_tag) + static const nsStaticAtom sTagAtoms_info[] = { +#include "nsHTMLTagList.h" + }; +#undef HTML_TAG +#undef HTML_OTHER + + // Fill in our static atom pointers + NS_RegisterStaticAtoms(sTagAtoms_info); + + +#if defined(DEBUG) + { + // let's verify that all names in the the table are 
lowercase... + for (int32_t i = 0; i < NS_HTML_TAG_MAX; ++i) { + nsAutoString temp1((char16_t*)sTagAtoms_info[i].mStringBuffer->Data()); + nsAutoString temp2((char16_t*)sTagAtoms_info[i].mStringBuffer->Data()); + ToLowerCase(temp1); + NS_ASSERTION(temp1.Equals(temp2), "upper case char in table"); + } + + // let's verify that all names in the unicode strings above are + // correct. + for (int32_t i = 0; i < NS_HTML_TAG_MAX; ++i) { + nsAutoString temp1(sTagUnicodeTable[i]); + nsAutoString temp2((char16_t*)sTagAtoms_info[i].mStringBuffer->Data()); + NS_ASSERTION(temp1.Equals(temp2), "Bad unicode tag name!"); + } + + // let's verify that NS_HTMLTAG_NAME_MAX_LENGTH is correct + uint32_t maxTagNameLength = 0; + for (int32_t i = 0; i < NS_HTML_TAG_MAX; ++i) { + uint32_t len = NS_strlen(sTagUnicodeTable[i]); + maxTagNameLength = std::max(len, maxTagNameLength); + } + NS_ASSERTION(maxTagNameLength == NS_HTMLTAG_NAME_MAX_LENGTH, + "NS_HTMLTAG_NAME_MAX_LENGTH not set correctly!"); + } +#endif +} + +// static +nsresult +nsHTMLTags::AddRefTable(void) +{ + if (gTableRefCount++ == 0) { + NS_ASSERTION(!gTagTable && !gTagAtomTable, "pre existing hash!"); + + gTagTable = PL_NewHashTable(64, HTMLTagsHashCodeUCPtr, + HTMLTagsKeyCompareUCPtr, PL_CompareValues, + nullptr, nullptr); + NS_ENSURE_TRUE(gTagTable, NS_ERROR_OUT_OF_MEMORY); + + gTagAtomTable = PL_NewHashTable(64, HTMLTagsHashCodeAtom, + PL_CompareValues, PL_CompareValues, + nullptr, nullptr); + NS_ENSURE_TRUE(gTagAtomTable, NS_ERROR_OUT_OF_MEMORY); + + // Fill in gTagTable with the above static char16_t strings as + // keys and the value of the corresponding enum as the value in + // the table. 
+ + int32_t i; + for (i = 0; i < NS_HTML_TAG_MAX; ++i) { + PL_HashTableAdd(gTagTable, sTagUnicodeTable[i], + NS_INT32_TO_PTR(i + 1)); + + PL_HashTableAdd(gTagAtomTable, sTagAtomTable[i], + NS_INT32_TO_PTR(i + 1)); + } + } + + return NS_OK; +} + +// static +void +nsHTMLTags::ReleaseTable(void) +{ + if (0 == --gTableRefCount) { + if (gTagTable) { + // Nothing to delete/free in this table, just destroy the table. + + PL_HashTableDestroy(gTagTable); + PL_HashTableDestroy(gTagAtomTable); + gTagTable = nullptr; + gTagAtomTable = nullptr; + } + } +} + +// static +nsHTMLTag +nsHTMLTags::StringTagToId(const nsAString& aTagName) +{ + uint32_t length = aTagName.Length(); + + if (length > NS_HTMLTAG_NAME_MAX_LENGTH) { + return eHTMLTag_userdefined; + } + + char16_t buf[NS_HTMLTAG_NAME_MAX_LENGTH + 1]; + + nsAString::const_iterator iter; + uint32_t i = 0; + char16_t c; + + aTagName.BeginReading(iter); + + // Fast lowercasing-while-copying of ASCII characters into a + // char16_t buffer + + while (i < length) { + c = *iter; + + if (c <= 'Z' && c >= 'A') { + c |= 0x20; // Lowercase the ASCII character. + } + + buf[i] = c; // Copy ASCII character. 
+ + ++i; + ++iter; + } + + buf[i] = 0; + + return CaseSensitiveStringTagToId(buf); +} + +#ifdef DEBUG +void +nsHTMLTags::TestTagTable() +{ + const char16_t *tag; + nsHTMLTag id; + nsCOMPtr<nsIAtom> atom; + + nsHTMLTags::AddRefTable(); + // Make sure we can find everything we are supposed to + for (int i = 0; i < NS_HTML_TAG_MAX; ++i) { + tag = sTagUnicodeTable[i]; + id = StringTagToId(nsDependentString(tag)); + NS_ASSERTION(id != eHTMLTag_userdefined, "can't find tag id"); + const char16_t* check = GetStringValue(id); + NS_ASSERTION(0 == nsCRT::strcmp(check, tag), "can't map id back to tag"); + + nsAutoString uname(tag); + ToUpperCase(uname); + NS_ASSERTION(id == StringTagToId(uname), "wrong id"); + + NS_ASSERTION(id == CaseSensitiveStringTagToId(tag), "wrong id"); + + atom = NS_Atomize(tag); + NS_ASSERTION(id == CaseSensitiveAtomTagToId(atom), "wrong id"); + NS_ASSERTION(atom == GetAtom(id), "can't map id back to atom"); + } + + // Make sure we don't find things that aren't there + id = StringTagToId(NS_LITERAL_STRING("@")); + NS_ASSERTION(id == eHTMLTag_userdefined, "found @"); + id = StringTagToId(NS_LITERAL_STRING("zzzzz")); + NS_ASSERTION(id == eHTMLTag_userdefined, "found zzzzz"); + + atom = NS_Atomize("@"); + id = CaseSensitiveAtomTagToId(atom); + NS_ASSERTION(id == eHTMLTag_userdefined, "found @"); + atom = NS_Atomize("zzzzz"); + id = CaseSensitiveAtomTagToId(atom); + NS_ASSERTION(id == eHTMLTag_userdefined, "found zzzzz"); + + tag = GetStringValue((nsHTMLTag) 0); + NS_ASSERTION(!tag, "found enum 0"); + tag = GetStringValue((nsHTMLTag) -1); + NS_ASSERTION(!tag, "found enum -1"); + tag = GetStringValue((nsHTMLTag) (NS_HTML_TAG_MAX + 1)); + NS_ASSERTION(!tag, "found past max enum"); + + atom = GetAtom((nsHTMLTag) 0); + NS_ASSERTION(!atom, "found enum 0"); + atom = GetAtom((nsHTMLTag) -1); + NS_ASSERTION(!atom, "found enum -1"); + atom = GetAtom((nsHTMLTag) (NS_HTML_TAG_MAX + 1)); + NS_ASSERTION(!atom, "found past max enum"); + + ReleaseTable(); +} + +#endif 
// DEBUG diff --git a/components/htmlparser/src/nsHTMLTags.h b/components/htmlparser/src/nsHTMLTags.h new file mode 100644 index 000000000..b21df55f8 --- /dev/null +++ b/components/htmlparser/src/nsHTMLTags.h @@ -0,0 +1,100 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsHTMLTags_h___ +#define nsHTMLTags_h___ + +#include "nsIAtom.h" +#include "nsString.h" +#include "plhash.h" + +class nsIAtom; + +/* + Declare the enum list using the magic of preprocessing + enum values are "eHTMLTag_foo" (where foo is the tag) + + To change the list of tags, see nsHTMLTagList.h + + These enum values are used as the index of array in various places. + If we change the structure of the enum by adding entries to it or removing + entries from it _directly_, not via nsHTMLTagList.h, don't forget to update + dom/bindings/BindingUtils.cpp and dom/html/nsHTMLContentSink.cpp as well. 
+ */ +#define HTML_TAG(_tag, _classname, _interfacename) eHTMLTag_##_tag, +#define HTML_OTHER(_tag) eHTMLTag_##_tag, +enum nsHTMLTag { + /* this enum must be first and must be zero */ + eHTMLTag_unknown = 0, +#include "nsHTMLTagList.h" + + /* can't be moved into nsHTMLTagList since gcc3.4 doesn't like a + comma at the end of enum list*/ + eHTMLTag_userdefined +}; +#undef HTML_TAG +#undef HTML_OTHER + +// All tags before eHTMLTag_text are HTML tags +#define NS_HTML_TAG_MAX int32_t(eHTMLTag_text - 1) + +class nsHTMLTags { +public: + static void RegisterAtoms(void); + static nsresult AddRefTable(void); + static void ReleaseTable(void); + + // Functions for converting string or atom to id + static nsHTMLTag StringTagToId(const nsAString& aTagName); + static nsHTMLTag AtomTagToId(nsIAtom* aTagName) + { + return StringTagToId(nsDependentAtomString(aTagName)); + } + + static nsHTMLTag CaseSensitiveStringTagToId(const char16_t* aTagName) + { + NS_ASSERTION(gTagTable, "no lookup table, needs addref"); + NS_ASSERTION(aTagName, "null tagname!"); + + void* tag = PL_HashTableLookupConst(gTagTable, aTagName); + + return tag ? (nsHTMLTag)NS_PTR_TO_INT32(tag) : eHTMLTag_userdefined; + } + static nsHTMLTag CaseSensitiveAtomTagToId(nsIAtom* aTagName) + { + NS_ASSERTION(gTagAtomTable, "no lookup table, needs addref"); + NS_ASSERTION(aTagName, "null tagname!"); + + void* tag = PL_HashTableLookupConst(gTagAtomTable, aTagName); + + return tag ? (nsHTMLTag)NS_PTR_TO_INT32(tag) : eHTMLTag_userdefined; + } + + // Functions for converting an id to a string or atom + static const char16_t *GetStringValue(nsHTMLTag aEnum) + { + return aEnum <= eHTMLTag_unknown || aEnum > NS_HTML_TAG_MAX ? + nullptr : sTagUnicodeTable[aEnum - 1]; + } + static nsIAtom *GetAtom(nsHTMLTag aEnum) + { + return aEnum <= eHTMLTag_unknown || aEnum > NS_HTML_TAG_MAX ? 
+ nullptr : sTagAtomTable[aEnum - 1]; + } + +#ifdef DEBUG + static void TestTagTable(); +#endif + +private: + static nsIAtom* sTagAtomTable[eHTMLTag_userdefined - 1]; + static const char16_t* const sTagUnicodeTable[]; + + static int32_t gTableRefCount; + static PLHashTable* gTagTable; + static PLHashTable* gTagAtomTable; +}; + +#endif /* nsHTMLTags_h___ */ diff --git a/components/htmlparser/src/nsHTMLTokenizer.cpp b/components/htmlparser/src/nsHTMLTokenizer.cpp new file mode 100644 index 000000000..a40e11f0e --- /dev/null +++ b/components/htmlparser/src/nsHTMLTokenizer.cpp @@ -0,0 +1,59 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + + +/** + * @file nsHTMLTokenizer.cpp + * This is an implementation of the nsITokenizer interface. + * This file contains the implementation of a tokenizer to tokenize an HTML + * document. It attempts to do so, making tradeoffs between compatibility with + * older parsers and the SGML specification. Note that most of the real + * "tokenization" takes place in nsHTMLTokens.cpp. + */ + +#include "nsHTMLTokenizer.h" +#include "nsIParser.h" +#include "nsParserConstants.h" + +/************************************************************************ + And now for the main class -- nsHTMLTokenizer... + ************************************************************************/ + +/** + * Satisfy the nsISupports interface. + */ +NS_IMPL_ISUPPORTS(nsHTMLTokenizer, nsITokenizer) + +/** + * Default constructor + */ +nsHTMLTokenizer::nsHTMLTokenizer() +{ + // TODO Assert about:blank-ness. +} + +nsresult +nsHTMLTokenizer::WillTokenize(bool aIsFinalChunk) +{ + return NS_OK; +} + +/** + * This method is repeatedly called by the tokenizer. 
+ * Each time, we determine the kind of token we're about to + * read, and then we call the appropriate method to handle + * that token type. + * + * @param aScanner The source of our input. + * @param aFlushTokens An OUT parameter to tell the caller whether it should + * process our queued tokens up to now (e.g., when we + * reach a <script>). + * @return Success or error + */ +nsresult +nsHTMLTokenizer::ConsumeToken(nsScanner& aScanner, bool& aFlushTokens) +{ + return kEOF; +} diff --git a/components/htmlparser/src/nsHTMLTokenizer.h b/components/htmlparser/src/nsHTMLTokenizer.h new file mode 100644 index 000000000..0d2940c5e --- /dev/null +++ b/components/htmlparser/src/nsHTMLTokenizer.h @@ -0,0 +1,35 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + + +/** + * MODULE NOTES: + * @update gess 4/1/98 + * + */ + +#ifndef __NSHTMLTOKENIZER +#define __NSHTMLTOKENIZER + +#include "mozilla/Attributes.h" +#include "nsISupports.h" +#include "nsITokenizer.h" + +#ifdef _MSC_VER +#pragma warning( disable : 4275 ) +#endif + +class nsHTMLTokenizer final : public nsITokenizer { + ~nsHTMLTokenizer() {} + +public: + NS_DECL_ISUPPORTS + NS_DECL_NSITOKENIZER + nsHTMLTokenizer(); +}; + +#endif + + diff --git a/components/htmlparser/src/nsIContentSink.h b/components/htmlparser/src/nsIContentSink.h new file mode 100644 index 000000000..56c70a1b4 --- /dev/null +++ b/components/htmlparser/src/nsIContentSink.h @@ -0,0 +1,132 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ +#ifndef nsIContentSink_h___ +#define nsIContentSink_h___ + +/** + * MODULE NOTES: + * @update gess 4/1/98 + * + * This pure virtual interface is used as the "glue" that connects the parsing + * process to the content model construction process. + * + * The icontentsink interface is a very lightweight wrapper that represents the + * content-sink model building process. There is another one that you may care + * about more, which is the IHTMLContentSink interface. (See that file for details). + */ +#include "nsISupports.h" +#include "nsString.h" +#include "mozFlushType.h" +#include "nsIDTD.h" + +class nsParserBase; + +#define NS_ICONTENT_SINK_IID \ +{ 0xcf9a7cbb, 0xfcbc, 0x4e13, \ + { 0x8e, 0xf5, 0x18, 0xef, 0x2d, 0x3d, 0x58, 0x29 } } + +class nsIContentSink : public nsISupports { +public: + + NS_DECLARE_STATIC_IID_ACCESSOR(NS_ICONTENT_SINK_IID) + + /** + * This method is called by the parser when it is entered from + * the event loop. The content sink wants to know how long the + * parser has been active since we last processed events on the + * main event loop and this call calibrates that measurement. + */ + NS_IMETHOD WillParse(void)=0; + + /** + * This method gets called when the parser begins the process + * of building the content model via the content sink. + * + * Default implementation provided since the sink should have the option of + * doing nothing in response to this call. + * + * @update 5/7/98 gess + */ + NS_IMETHOD WillBuildModel(nsDTDMode aDTDMode) { + return NS_OK; + } + + /** + * This method gets called when the parser concludes the process + * of building the content model via the content sink. + * + * Default implementation provided since the sink should have the option of + * doing nothing in response to this call. 
+ * + * @update 5/7/98 gess + */ + NS_IMETHOD DidBuildModel(bool aTerminated) { + return NS_OK; + } + + /** + * This method gets called when the parser gets i/o blocked, + * and wants to notify the sink that it may be a while before + * more data is available. + * + * @update 5/7/98 gess + */ + NS_IMETHOD WillInterrupt(void)=0; + + /** + * This method gets called when the parser i/o gets unblocked, + * and we're about to start dumping content again to the sink. + * + * @update 5/7/98 gess + */ + NS_IMETHOD WillResume(void)=0; + + /** + * This method gets called by the parser so that the content + * sink can retain a reference to the parser. The expectation + * is that the content sink will drop the reference when it + * gets the DidBuildModel notification i.e. when parsing is done. + */ + NS_IMETHOD SetParser(nsParserBase* aParser)=0; + + /** + * Flush content so that the content model is in sync with the state + * of the sink. + * + * @param aType the type of flush to perform + */ + virtual void FlushPendingNotifications(mozFlushType aType)=0; + + /** + * Set the document character set. This should be passed on to the + * document itself. + */ + NS_IMETHOD SetDocumentCharset(nsACString& aCharset)=0; + + /** + * Returns the target object (often a document object) into which + * the content built by this content sink is being added, if any + * (IOW, may return null). + */ + virtual nsISupports *GetTarget()=0; + + /** + * Returns true if there's currently script executing that we need to hold + * parsing for. + */ + virtual bool IsScriptExecuting() + { + return false; + } + + /** + * Posts a runnable that continues parsing. 
+ */ + virtual void ContinueInterruptedParsingAsync() {} +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsIContentSink, NS_ICONTENT_SINK_IID) + +#endif /* nsIContentSink_h___ */ diff --git a/components/htmlparser/src/nsIDTD.h b/components/htmlparser/src/nsIDTD.h new file mode 100644 index 000000000..cbae4d507 --- /dev/null +++ b/components/htmlparser/src/nsIDTD.h @@ -0,0 +1,136 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsIDTD_h___ +#define nsIDTD_h___ + +/** + * MODULE NOTES: + * @update gess 7/20/98 + * + * This interface defines standard interface for DTD's. Note that this + * isn't HTML specific. DTD's have several functions within the parser + * system: + * 1) To coordinate the consumption of an input stream via the + * parser + * 2) To serve as proxy to represent the containment rules of the + * underlying document + * 3) To offer autodetection services to the parser (mainly for doc + * conversion) + * */ + +#include "nsISupports.h" +#include "nsString.h" +#include "nsITokenizer.h" + +#define NS_IDTD_IID \ +{ 0x3de05873, 0xefa7, 0x410d, \ + { 0xa4, 0x61, 0x80, 0x33, 0xaf, 0xd9, 0xe3, 0x26 } } + +enum eAutoDetectResult { + eUnknownDetect, + eValidDetect, + ePrimaryDetect, + eInvalidDetect +}; + +enum nsDTDMode { + eDTDMode_unknown = 0, + eDTDMode_quirks, //pre 4.0 versions + eDTDMode_almost_standards, + eDTDMode_full_standards, + eDTDMode_autodetect, + eDTDMode_fragment +}; + + +class nsIContentSink; +class CParserContext; + +class nsIDTD : public nsISupports +{ +public: + + NS_DECLARE_STATIC_IID_ACCESSOR(NS_IDTD_IID) + + NS_IMETHOD WillBuildModel(const CParserContext& aParserContext, + nsITokenizer* aTokenizer, + nsIContentSink* aSink) = 0; + + /** + * Called by the parser after the parsing process has concluded + 
* @update gess5/18/98 + * @param anErrorCode - contains error code resulting from parse process + * @return + */ + NS_IMETHOD DidBuildModel(nsresult anErrorCode) = 0; + + /** + * Called (possibly repeatedly) by the parser to parse tokens and construct + * the document model via the sink provided to WillBuildModel. + * + * @param aTokenizer - tokenizer providing the token stream to be parsed + * @param aSink - content sink that receives the model being built + */ + NS_IMETHOD BuildModel(nsITokenizer* aTokenizer, nsIContentSink* aSink) = 0; + + /** + * This method is called to determine whether or not a tag of one + * type can contain a tag of another type. + * + * @update gess 3/25/98 + * @param aParent -- int tag of parent container + * @param aChild -- int tag of child container + * @return true if parent can contain child + */ + NS_IMETHOD_(bool) CanContain(int32_t aParent,int32_t aChild) const = 0; + + /** + * This method gets called to determine whether a given + * tag is itself a container + * + * @update gess 3/25/98 + * @param aTag -- tag to test for containership + * @return true if given tag can contain other tags + */ + NS_IMETHOD_(bool) IsContainer(int32_t aTag) const = 0; + + /** + * Use this if you want to stop building the content model + * --------------[ Sets DTD to STOP mode ]---------------- + * It's recommended to use this method in accordance with + * the parser's terminate() method. + * + * @update harishd 07/22/99 + * @param + * @return + */ + NS_IMETHOD_(void) Terminate() = 0; + + NS_IMETHOD_(int32_t) GetType() = 0; + + /** + * Call this method after calling WillBuildModel to determine what mode the + * DTD actually is using, as it may differ from aParserContext.mDTDMode. 
+ */ + NS_IMETHOD_(nsDTDMode) GetMode() const = 0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsIDTD, NS_IDTD_IID) + +#define NS_DECL_NSIDTD \ + NS_IMETHOD WillBuildModel( const CParserContext& aParserContext, nsITokenizer* aTokenizer, nsIContentSink* aSink) override;\ + NS_IMETHOD DidBuildModel(nsresult anErrorCode) override;\ + NS_IMETHOD BuildModel(nsITokenizer* aTokenizer, nsIContentSink* aSink) override;\ + NS_IMETHOD_(bool) CanContain(int32_t aParent,int32_t aChild) const override;\ + NS_IMETHOD_(bool) IsContainer(int32_t aTag) const override;\ + NS_IMETHOD_(void) Terminate() override;\ + NS_IMETHOD_(int32_t) GetType() override;\ + NS_IMETHOD_(nsDTDMode) GetMode() const override; +#endif /* nsIDTD_h___ */ diff --git a/components/htmlparser/src/nsIFragmentContentSink.h b/components/htmlparser/src/nsIFragmentContentSink.h new file mode 100644 index 000000000..8d547ed66 --- /dev/null +++ b/components/htmlparser/src/nsIFragmentContentSink.h @@ -0,0 +1,77 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsIFragmentContentSink_h___ +#define nsIFragmentContentSink_h___ + +#include "nsISupports.h" + +class nsIDOMDocumentFragment; +class nsIDocument; + +#define NS_I_FRAGMENT_CONTENT_SINK_IID \ + { 0x1a8ce30b, 0x63fc, 0x441a, \ + { 0xa3, 0xaa, 0xf7, 0x16, 0xc0, 0xfe, 0x96, 0x69 } } + +/** + * The fragment sink allows a client to parse a fragment of sink, possibly + * surrounded in context. Also see nsIParser::ParseFragment(). + * Note: once you've parsed a fragment, the fragment sink must be re-set on + * the parser in order to parse another fragment. 
+ */ +class nsIFragmentContentSink : public nsISupports { +public: + NS_DECLARE_STATIC_IID_ACCESSOR(NS_I_FRAGMENT_CONTENT_SINK_IID) + /** + * This method is used to obtain the fragment created by + * a fragment content sink and to release resources held by the parser. + * + * The sink drops its reference to the fragment. + * + * @param aFragment out param that receives the fragment + */ + NS_IMETHOD FinishFragmentParsing(nsIDOMDocumentFragment** aFragment) = 0; + + /** + * This method is used to set the target document for this fragment + * sink. This document's nodeinfo manager will be used to create + * the content objects. This MUST be called before the sink is used. + * + * @param aDocument the document the new nodes will belong to + * (should not be null) + */ + NS_IMETHOD SetTargetDocument(nsIDocument* aDocument) = 0; + + /** + * This method is used to indicate to the sink that we're done building + * the context and should start paying attention to the incoming content. + */ + NS_IMETHOD WillBuildContent() = 0; + + /** + * This method is used to indicate to the sink that we're done building + * the real content. This is useful if you want to parse additional context + * (such as an end context). + */ + NS_IMETHOD DidBuildContent() = 0; + + /** + * This method is a total hack to help with parsing fragments. It is called to + * tell the fragment sink that a container from the context will be delivered + * after the call to WillBuildContent(). This is only relevant for HTML + * fragments that use nsHTMLTokenizer/CNavDTD. + */ + NS_IMETHOD IgnoreFirstContainer() = 0; + + /** + * Sets whether script elements are marked as unexecutable. 
+ */ + NS_IMETHOD SetPreventScriptExecution(bool aPreventScriptExecution) = 0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsIFragmentContentSink, + NS_I_FRAGMENT_CONTENT_SINK_IID) + +nsresult +NS_NewXMLFragmentContentSink(nsIFragmentContentSink** aInstancePtrResult); + +#endif diff --git a/components/htmlparser/src/nsIHTMLContentSink.h b/components/htmlparser/src/nsIHTMLContentSink.h new file mode 100644 index 000000000..bf08c4b5e --- /dev/null +++ b/components/htmlparser/src/nsIHTMLContentSink.h @@ -0,0 +1,89 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef nsIHTMLContentSink_h___ +#define nsIHTMLContentSink_h___ + +/** + * This interface is OBSOLETE and in the process of being REMOVED. + * Do NOT implement! + * + * This file declares the concrete HTMLContentSink class. + * This class is used during the parsing process as the + * primary interface between the parser and the content + * model. + * + * After the tokenizer completes, the parser iterates over + * the known token list. As the parser identifies valid + * elements, it calls the contentsink interface to notify + * the content model that a new node or child node is being + * created and added to the content model. + * + * The HTMLContentSink interface assumes 4 underlying + * containers: HTML, HEAD, BODY and FRAMESET. Before + * accessing any these, the parser will call the appropriate + * OpennsIHTMLContentSink method: OpenHTML,OpenHead,OpenBody,OpenFrameSet; + * likewise, the ClosensIHTMLContentSink version will be called when the + * parser is done with a given section. + * + * IMPORTANT: The parser may Open each container more than + * once! This is due to the irregular nature of HTML files. 
+ * For example, it is possible to encounter plain text at + * the start of an HTML document (that precedes the HTML tag). + * Such text is treated as if it were part of the body. + * In such cases, the parser will Open the body, pass the text- + * node in and then Close the body. The body will likely be + * re-Opened later when the actual <BODY> tag has been seen. + * + * Containers within the body are Opened and Closed + * using the OpenContainer(...) and CloseContainer(...) calls. + * It is assumed that the document or contentSink is + * maintaining its state to manage where new content should + * be added to the underlying document. + * + * NOTE: OpenHTML() and OpenBody() may get called multiple times + * in the same document. That's fine, and it doesn't mean + * that we have multiple bodies or HTML's. + * + * NOTE: I haven't figured out how sub-documents (non-frames) + * are going to be handled. Stay tuned. + */ +#include "nsIContentSink.h" +#include "nsHTMLTags.h" + +#define NS_IHTML_CONTENT_SINK_IID \ + {0xefc5af86, 0x5cfd, 0x4918, {0x9d, 0xd3, 0x5f, 0x7a, 0xb2, 0x88, 0xb2, 0x68}} + +/** + * This interface is OBSOLETE and in the process of being REMOVED. + * Do NOT implement! + */ +class nsIHTMLContentSink : public nsIContentSink +{ +public: + + NS_DECLARE_STATIC_IID_ACCESSOR(NS_IHTML_CONTENT_SINK_IID) + + enum ElementType { eHTML, eBody }; + + /** + * This method is used to open a generic container in the sink. + * + * @update 4/1/98 gess + */ + NS_IMETHOD OpenContainer(ElementType aNodeType) = 0; + + /** + * This method gets called by the parser when a close + * container tag has been consumed and needs to be closed. + * + * @param aTag - The tag to be closed. 
+ */ + NS_IMETHOD CloseContainer(ElementType aTag) = 0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsIHTMLContentSink, NS_IHTML_CONTENT_SINK_IID) + +#endif /* nsIHTMLContentSink_h___ */ + diff --git a/components/htmlparser/src/nsIParser.h b/components/htmlparser/src/nsIParser.h new file mode 100644 index 000000000..4bf0b3370 --- /dev/null +++ b/components/htmlparser/src/nsIParser.h @@ -0,0 +1,272 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef NS_IPARSER___ +#define NS_IPARSER___ + + + /** + * This GECKO-INTERNAL interface is on track to being REMOVED (or refactored + * to the point of being near-unrecognizable). + * + * Please DO NOT #include this file in comm-central code, in your XUL + * app or binary extensions. + * + * Please DO NOT #include this into new files even inside Gecko. It is more + * likely than not that #including this header is the wrong thing to do. 
+ */ + +#include "nsISupports.h" +#include "nsIStreamListener.h" +#include "nsIDTD.h" +#include "nsString.h" +#include "nsTArray.h" +#include "nsIAtom.h" +#include "nsParserBase.h" + +#define NS_IPARSER_IID \ +{ 0x2c4ad90a, 0x740e, 0x4212, \ + { 0xba, 0x3f, 0xfe, 0xac, 0xda, 0x4b, 0x92, 0x9e } } + +// {41421C60-310A-11d4-816F-000064657374} +#define NS_IDEBUG_DUMP_CONTENT_IID \ +{ 0x41421c60, 0x310a, 0x11d4, \ +{ 0x81, 0x6f, 0x0, 0x0, 0x64, 0x65, 0x73, 0x74 } } + +class nsIContentSink; +class nsIRequestObserver; +class nsString; +class nsIURI; +class nsIChannel; +class nsIContent; + +enum eParserCommands { + eViewNormal, + eViewSource, + eViewFragment, + eViewErrors +}; + +enum eParserDocType { + ePlainText = 0, + eXML, + eHTML_Quirks, + eHTML_Strict +}; + +enum eStreamState {eNone,eOnStart,eOnDataAvail,eOnStop}; + +/** + * This GECKO-INTERNAL interface is on track to being REMOVED (or refactored + * to the point of being near-unrecognizable). + * + * Please DO NOT #include this file in comm-central code, in your XUL + * app or binary extensions. + * + * Please DO NOT #include this into new files even inside Gecko. It is more + * likely than not that #including this header is the wrong thing to do. + */ +class nsIParser : public nsParserBase { + public: + + NS_DECLARE_STATIC_IID_ACCESSOR(NS_IPARSER_IID) + + /** + * Select given content sink into parser for parser output + * @update gess5/11/98 + * @param aSink is the new sink to be used by parser + * @return + */ + NS_IMETHOD_(void) SetContentSink(nsIContentSink* aSink)=0; + + + /** + * retrieve the sink set into the parser + * @update gess5/11/98 + * @return current sink + */ + NS_IMETHOD_(nsIContentSink*) GetContentSink(void)=0; + + /** + * Call this method once you've created a parser, and want to instruct it + * about the command which caused the parser to be constructed. For example, + * this allows us to select a DTD which can do, say, view-source. 
+ * + * @update gess 3/25/98 + * @param aCommand -- ptrs to string that contains command + * @return nada + */ + NS_IMETHOD_(void) GetCommand(nsCString& aCommand)=0; + NS_IMETHOD_(void) SetCommand(const char* aCommand)=0; + NS_IMETHOD_(void) SetCommand(eParserCommands aParserCommand)=0; + + /** + * Call this method once you've created a parser, and want to instruct it + * about what charset to load + * + * @update ftang 4/23/99 + * @param aCharset - the charset of the document + * @param aSource - the source of the charset + * @return nada + */ + NS_IMETHOD_(void) SetDocumentCharset(const nsACString& aCharset, int32_t aSource)=0; + NS_IMETHOD_(void) GetDocumentCharset(nsACString& oCharset, int32_t& oSource)=0; + + /** + * Get the channel associated with this parser + * @update harishd,gagan 07/17/01 + * @param aChannel out param that will contain the result + * @return NS_OK if successful + */ + NS_IMETHOD GetChannel(nsIChannel** aChannel) override = 0; + + /** + * Get the DTD associated with this parser + * @update vidur 9/29/99 + * @param aDTD out param that will contain the result + * @return NS_OK if successful, NS_ERROR_FAILURE for runtime error + */ + NS_IMETHOD GetDTD(nsIDTD** aDTD) = 0; + + /** + * Get the nsIStreamListener for this parser + */ + virtual nsIStreamListener* GetStreamListener() = 0; + + /************************************************************************** + * Parse methods always begin with an input source, and perform + * conversions until you wind up being emitted to the given contentsink + * (which may or may not be a proxy for the NGLayout content model). + ************************************************************************/ + + // Call this method to resume the parser from an unblocked state. + // This can happen, for example, if parsing was interrupted and then the + // consumer needed to restart the parser without waiting for more data. 
+ // This also happens after loading scripts, which unblock the parser in + // order to process the output of document.write() and then need to + // continue on with the page load on an enabled parser. + NS_IMETHOD ContinueInterruptedParsing() = 0; + + // Stops parsing temporarily. + NS_IMETHOD_(void) BlockParser() = 0; + + // Open up the parser for tokenization, building up content + // model..etc. However, this method does not resume parsing + // automatically. It's the callers' responsibility to restart + // the parsing engine. + NS_IMETHOD_(void) UnblockParser() = 0; + + /** + * Asynchronously continues parsing. + */ + NS_IMETHOD_(void) ContinueInterruptedParsingAsync() = 0; + + NS_IMETHOD_(bool) IsParserEnabled() override = 0; + NS_IMETHOD_(bool) IsComplete() = 0; + + NS_IMETHOD Parse(nsIURI* aURL, + nsIRequestObserver* aListener = nullptr, + void* aKey = 0, + nsDTDMode aMode = eDTDMode_autodetect) = 0; + + NS_IMETHOD Terminate(void) = 0; + + /** + * This method gets called when you want to parse a fragment of HTML or XML + * surrounded by the context |aTagStack|. It requires that the parser have + * been given a fragment content sink. + * + * @param aSourceBuffer The XML or HTML that hasn't been parsed yet. + * @param aTagStack The context of the source buffer. + * @return Success or failure. + */ + NS_IMETHOD ParseFragment(const nsAString& aSourceBuffer, + nsTArray<nsString>& aTagStack) = 0; + + /** + * This method gets called when the tokens have been consumed, and it's time + * to build the model via the content sink. + * @update gess5/11/98 + * @return error code -- 0 if model building went well . + */ + NS_IMETHOD BuildModel(void) = 0; + + /** + * Call this method to cancel any pending parsing events. + * Parsing events may be pending if all of the document's content + * has been passed to the parser but the parser has been interrupted + * because processing the tokens took too long. 
+ * + * @update kmcclusk 05/18/01 + * @return NS_OK if succeeded else ERROR. + */ + + NS_IMETHOD CancelParsingEvents() = 0; + + virtual void Reset() = 0; + + /** + * True if the insertion point (per HTML5) is defined. + */ + virtual bool IsInsertionPointDefined() = 0; + + /** + * Call immediately before starting to evaluate a parser-inserted script or + * in general when the spec says to define an insertion point. + */ + virtual void PushDefinedInsertionPoint() = 0; + + /** + * Call immediately after having evaluated a parser-inserted script or + * generally want to restore to the state before the last + * PushDefinedInsertionPoint call. + */ + virtual void PopDefinedInsertionPoint() = 0; + + /** + * Marks the HTML5 parser as not a script-created parser. + */ + virtual void MarkAsNotScriptCreated(const char* aCommand) = 0; + + /** + * True if this is a script-created HTML5 parser. + */ + virtual bool IsScriptCreated() = 0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsIParser, NS_IPARSER_IID) + +/* ===========================================================* + Some useful constants... 
+ * ===========================================================*/ + +#include "nsError.h" + +const nsresult kEOF = NS_ERROR_HTMLPARSER_EOF; +const nsresult kUnknownError = NS_ERROR_HTMLPARSER_UNKNOWN; +const nsresult kCantPropagate = NS_ERROR_HTMLPARSER_CANTPROPAGATE; +const nsresult kContextMismatch = NS_ERROR_HTMLPARSER_CONTEXTMISMATCH; +const nsresult kBadFilename = NS_ERROR_HTMLPARSER_BADFILENAME; +const nsresult kBadURL = NS_ERROR_HTMLPARSER_BADURL; +const nsresult kInvalidParserContext = NS_ERROR_HTMLPARSER_INVALIDPARSERCONTEXT; +const nsresult kBlocked = NS_ERROR_HTMLPARSER_BLOCK; +const nsresult kBadStringLiteral = NS_ERROR_HTMLPARSER_UNTERMINATEDSTRINGLITERAL; +const nsresult kHierarchyTooDeep = NS_ERROR_HTMLPARSER_HIERARCHYTOODEEP; +const nsresult kFakeEndTag = NS_ERROR_HTMLPARSER_FAKE_ENDTAG; +const nsresult kNotAComment = NS_ERROR_HTMLPARSER_INVALID_COMMENT; + +#define NS_IPARSER_FLAG_UNKNOWN_MODE 0x00000000 +#define NS_IPARSER_FLAG_QUIRKS_MODE 0x00000002 +#define NS_IPARSER_FLAG_STRICT_MODE 0x00000004 +#define NS_IPARSER_FLAG_AUTO_DETECT_MODE 0x00000010 +#define NS_IPARSER_FLAG_VIEW_NORMAL 0x00000020 +#define NS_IPARSER_FLAG_VIEW_SOURCE 0x00000040 +#define NS_IPARSER_FLAG_VIEW_ERRORS 0x00000080 +#define NS_IPARSER_FLAG_PLAIN_TEXT 0x00000100 +#define NS_IPARSER_FLAG_XML 0x00000200 +#define NS_IPARSER_FLAG_HTML 0x00000400 +#define NS_IPARSER_FLAG_SCRIPT_ENABLED 0x00000800 +#define NS_IPARSER_FLAG_FRAMES_ENABLED 0x00001000 + +#endif diff --git a/components/htmlparser/src/nsIParserService.h b/components/htmlparser/src/nsIParserService.h new file mode 100644 index 000000000..2906974e9 --- /dev/null +++ b/components/htmlparser/src/nsIParserService.h @@ -0,0 +1,98 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#ifndef nsIParserService_h__ +#define nsIParserService_h__ + +#include "nsISupports.h" +#include "nsString.h" +#include "nsHTMLTags.h" + +class nsIParser; + +#define NS_PARSERSERVICE_CONTRACTID "@mozilla.org/parser/parser-service;1" + +// {90a92e37-abd6-441b-9b39-4064d98e1ede} +#define NS_IPARSERSERVICE_IID \ +{ 0x90a92e37, 0xabd6, 0x441b, { 0x9b, 0x39, 0x40, 0x64, 0xd9, 0x8e, 0x1e, 0xde } } + +/** + * Abstract service interface declared here: lookups between HTML tag + * names/atoms and nsHTMLTag enum values, entity conversion, and per-tag + * queries. Every method is pure virtual and const. + */ +class nsIParserService : public nsISupports { + public: + NS_DECLARE_STATIC_IID_ACCESSOR(NS_IPARSERSERVICE_IID) + + /** + * Looks up the nsHTMLTag enum value corresponding to the tag in aAtom. The + * lookup happens case insensitively. + * + * @param aAtom The tag to look up. + * + * @return int32_t The nsHTMLTag enum value corresponding to the tag in aAtom + * or eHTMLTag_userdefined if the tag does not correspond to + * any of the nsHTMLTag enum values. + */ + virtual int32_t HTMLAtomTagToId(nsIAtom* aAtom) const = 0; + + /** + * Looks up the nsHTMLTag enum value corresponding to the tag in aAtom. + * + * @param aAtom The tag to look up. + * + * @return int32_t The nsHTMLTag enum value corresponding to the tag in aAtom + * or eHTMLTag_userdefined if the tag does not correspond to + * any of the nsHTMLTag enum values. + */ + virtual int32_t HTMLCaseSensitiveAtomTagToId(nsIAtom* aAtom) const = 0; + + /** + * Looks up the nsHTMLTag enum value corresponding to the tag in aTag. The + * lookup happens case insensitively. + * + * @param aTag The tag to look up. + * + * @return int32_t The nsHTMLTag enum value corresponding to the tag in aTag + * or eHTMLTag_userdefined if the tag does not correspond to + * any of the nsHTMLTag enum values. + */ + virtual int32_t HTMLStringTagToId(const nsAString& aTag) const = 0; + + /** + * Gets the tag corresponding to the nsHTMLTag enum value in aId. The + * returned tag will be in lowercase. + * + * @param aId The nsHTMLTag enum value to get the tag for.
+ * + * @return const char16_t* The tag corresponding to the nsHTMLTag enum + * value, or nullptr if the enum value doesn't + * correspond to a tag (eHTMLTag_unknown, + * eHTMLTag_userdefined, eHTMLTag_text, ...). + */ + virtual const char16_t *HTMLIdToStringTag(int32_t aId) const = 0; + + /** + * Gets the tag corresponding to the nsHTMLTag enum value in aId. The + * returned tag will be in lowercase. + * + * @param aId The nsHTMLTag enum value to get the tag for. + * + * @return nsIAtom* The tag corresponding to the nsHTMLTag enum value, or + * nullptr if the enum value doesn't correspond to a tag + * (eHTMLTag_unknown, eHTMLTag_userdefined, eHTMLTag_text, + * ...). + */ + virtual nsIAtom *HTMLIdToAtomTag(int32_t aId) const = 0; + + /** + * NOTE(review): undocumented in this header. Judging by name and signature + * only, this presumably resolves the entity named by aEntity and writes its + * code point through aUnicode -- confirm against the implementation, + * including what is written when the entity is unknown. + */ + NS_IMETHOD HTMLConvertEntityToUnicode(const nsAString& aEntity, + int32_t* aUnicode) const = 0; + + /** + * NOTE(review): undocumented in this header. Presumably the inverse of + * HTMLConvertEntityToUnicode, writing the entity for aUnicode into + * aEntity -- confirm against the implementation. + */ + NS_IMETHOD HTMLConvertUnicodeToEntity(int32_t aUnicode, + nsCString& aEntity) const = 0; + + /** + * NOTE(review): undocumented in this header. Both queries take an int32_t + * id (presumably an nsHTMLTag value, as in the lookups above -- confirm) + * and report their answer through the bool out-parameter. + */ + NS_IMETHOD IsContainer(int32_t aId, bool& aIsContainer) const = 0; + NS_IMETHOD IsBlock(int32_t aId, bool& aIsBlock) const = 0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsIParserService, NS_IPARSERSERVICE_IID) + +#endif // nsIParserService_h__ diff --git a/components/htmlparser/src/nsITokenizer.h b/components/htmlparser/src/nsITokenizer.h new file mode 100644 index 000000000..2ed09d410 --- /dev/null +++ b/components/htmlparser/src/nsITokenizer.h @@ -0,0 +1,44 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + + +/** + * MODULE NOTES: + * @update gess 4/1/98 + * + */ + +#ifndef __NSITOKENIZER__ +#define __NSITOKENIZER__ + +#include "nsISupports.h" + +class nsScanner; + +#define NS_ITOKENIZER_IID \ +{ 0Xae98a348, 0X5e91, 0X41a8, \ + { 0Xa5, 0Xb4, 0Xd2, 0X20, 0Xf3, 0X1f, 0Xc4, 0Xab } } + +/*************************************************************** + Notes: nsITokenizer is the abstract tokenizer interface. It + declares two pure-virtual methods; NS_DECL_NSITOKENIZER below + re-declares both as overrides for implementing classes. + ***************************************************************/ + + +class nsITokenizer : public nsISupports { +public: + NS_DECLARE_STATIC_IID_ACCESSOR(NS_ITOKENIZER_IID) + + /* NOTE(review): undocumented here. Presumably called before a tokenizing pass, with aIsFinalChunk telling the tokenizer whether more input can follow -- confirm against implementers. */ + NS_IMETHOD WillTokenize(bool aIsFinalChunk)=0; + /* NOTE(review): undocumented here. Presumably consumes input from aScanner; aFlushTokens is an in/out flag whose meaning should be confirmed against implementers. */ + NS_IMETHOD ConsumeToken(nsScanner& aScanner,bool& aFlushTokens)=0; +}; + +NS_DEFINE_STATIC_IID_ACCESSOR(nsITokenizer, NS_ITOKENIZER_IID) + +#define NS_DECL_NSITOKENIZER \ + NS_IMETHOD WillTokenize(bool aIsFinalChunk) override;\ + NS_IMETHOD ConsumeToken(nsScanner& aScanner,bool& aFlushTokens) override;\ + + +#endif diff --git a/components/htmlparser/src/nsParser.cpp b/components/htmlparser/src/nsParser.cpp new file mode 100644 index 000000000..791ccf772 --- /dev/null +++ b/components/htmlparser/src/nsParser.cpp @@ -0,0 +1,1599 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "nsIAtom.h" +#include "nsParser.h" +#include "nsString.h" +#include "nsCRT.h" +#include "nsScanner.h" +#include "plstr.h" +#include "nsIStringStream.h" +#include "nsIChannel.h" +#include "nsICachingChannel.h" +#include "nsIInputStream.h" +#include "CNavDTD.h" +#include "prenv.h" +#include "prlock.h" +#include "prcvar.h" +#include "nsParserCIID.h" +#include "nsReadableUtils.h" +#include "nsCOMPtr.h" +#include "nsExpatDriver.h" +#include "nsIServiceManager.h" +#include "nsICategoryManager.h" +#include "nsISupportsPrimitives.h" +#include "nsIFragmentContentSink.h" +#include "nsStreamUtils.h" +#include "nsHTMLTokenizer.h" +#include "nsDataHashtable.h" +#include "nsXPCOMCIDInternal.h" +#include "nsMimeTypes.h" +#include "mozilla/CondVar.h" +#include "mozilla/Mutex.h" +#include "nsParserConstants.h" +#include "nsCharsetSource.h" +#include "nsContentUtils.h" +#include "nsThreadUtils.h" +#include "nsIHTMLContentSink.h" + +#include "mozilla/dom/EncodingUtils.h" +#include "mozilla/dom/ScriptLoader.h" +#include "mozilla/BinarySearch.h" + +using namespace mozilla; +using mozilla::dom::EncodingUtils; + +#define NS_PARSER_FLAG_PARSER_ENABLED 0x00000002 +#define NS_PARSER_FLAG_OBSERVERS_ENABLED 0x00000004 +#define NS_PARSER_FLAG_PENDING_CONTINUE_EVENT 0x00000008 +#define NS_PARSER_FLAG_FLUSH_TOKENS 0x00000020 +#define NS_PARSER_FLAG_CAN_TOKENIZE 0x00000040 + +//-------------- Begin ParseContinue Event Definition ------------------------ +/* +The parser can be explicitly interrupted by passing a return value of +NS_ERROR_HTMLPARSER_INTERRUPTED from BuildModel on the DTD. This will cause +the parser to stop processing and allow the application to return to the event +loop. The data which was left at the time of interruption will be processed +the next time OnDataAvailable is called. 
If the parser has received its final +chunk of data then OnDataAvailable will no longer be called by the networking +module, so the parser will schedule a nsParserContinueEvent which will call +the parser to process the remaining data after returning to the event loop. +If the parser is interrupted while processing the remaining data it will +schedule another ParseContinueEvent. The processing of data followed by +scheduling of the continue events will proceed until either: + + 1) All of the remaining data can be processed without interrupting + 2) The parser has been cancelled. + + +This capability is currently used in CNavDTD and nsHTMLContentSink. The +nsHTMLContentSink is notified by CNavDTD when a chunk of tokens is going to be +processed and when each token is processed. The nsHTML content sink records +the time when the chunk has started processing and will return +NS_ERROR_HTMLPARSER_INTERRUPTED if the token processing time has exceeded a +threshold called max tokenizing processing time. This allows the content sink +to limit how much data is processed in a single chunk which in turn gates how +much time is spent away from the event loop. Processing smaller chunks of data +also reduces the time spent in subsequent reflows. + +This capability is most apparent when loading large documents. If the maximum +token processing time is set small enough the application will remain +responsive during document load. + +A side-effect of this capability is that document load is not complete when +the last chunk of data is passed to OnDataAvailable since the parser may have +been interrupted when the last chunk of data arrived. The document is complete +when all of the document has been tokenized and there aren't any pending +nsParserContinueEvents. This can cause problems if the application assumes +that it can monitor the load requests to determine when the document load has +been completed. This is what happens in Mozilla. 
The document is considered +completely loaded when all of the load requests have been satisfied. To delay +the document load until all of the parsing has been completed the +nsHTMLContentSink adds a dummy parser load request which is not removed until +the nsHTMLContentSink's DidBuildModel is called. The CNavDTD will not call +DidBuildModel until the final chunk of data has been passed to the parser +through the OnDataAvailable and there aren't any pending +nsParserContineEvents. + +Currently the parser is ignores requests to be interrupted during the +processing of script. This is because a document.write followed by JavaScript +calls to manipulate the DOM may fail if the parser was interrupted during the +document.write. + +For more details @see bugzilla bug 76722 +*/ + + +class nsParserContinueEvent : public Runnable +{ +public: + RefPtr<nsParser> mParser; + + explicit nsParserContinueEvent(nsParser* aParser) + : mParser(aParser) + {} + + NS_IMETHOD Run() override + { + mParser->HandleParserContinueEvent(this); + return NS_OK; + } +}; + +//-------------- End ParseContinue Event Definition ------------------------ + +/** + * default constructor + */ +nsParser::nsParser() +{ + Initialize(true); +} + +nsParser::~nsParser() +{ + Cleanup(); +} + +void +nsParser::Initialize(bool aConstructor) +{ + if (aConstructor) { + // Raw pointer + mParserContext = 0; + } + else { + // nsCOMPtrs + mObserver = nullptr; + mUnusedInput.Truncate(); + } + + mContinueEvent = nullptr; + mCharsetSource = kCharsetUninitialized; + mCharset.AssignLiteral("ISO-8859-1"); + mInternalState = NS_OK; + mStreamStatus = NS_OK; + mCommand = eViewNormal; + mFlags = NS_PARSER_FLAG_OBSERVERS_ENABLED | + NS_PARSER_FLAG_PARSER_ENABLED | + NS_PARSER_FLAG_CAN_TOKENIZE; + + mProcessingNetworkData = false; + mIsAboutBlank = false; +} + +void +nsParser::Cleanup() +{ +#ifdef DEBUG + if (mParserContext && mParserContext->mPrevContext) { + NS_WARNING("Extra parser contexts still on the parser stack"); + } +#endif 
+ + while (mParserContext) { + CParserContext *pc = mParserContext->mPrevContext; + delete mParserContext; + mParserContext = pc; + } + + // It should not be possible for this flag to be set when we are getting + // destroyed since this flag implies a pending nsParserContinueEvent, which + // has an owning reference to |this|. + NS_ASSERTION(!(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT), "bad"); +} + +NS_IMPL_CYCLE_COLLECTION_CLASS(nsParser) + +NS_IMPL_CYCLE_COLLECTION_UNLINK_BEGIN(nsParser) + NS_IMPL_CYCLE_COLLECTION_UNLINK(mDTD) + NS_IMPL_CYCLE_COLLECTION_UNLINK(mSink) + NS_IMPL_CYCLE_COLLECTION_UNLINK(mObserver) +NS_IMPL_CYCLE_COLLECTION_UNLINK_END + +NS_IMPL_CYCLE_COLLECTION_TRAVERSE_BEGIN(nsParser) + NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mDTD) + NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mSink) + NS_IMPL_CYCLE_COLLECTION_TRAVERSE(mObserver) + CParserContext *pc = tmp->mParserContext; + while (pc) { + cb.NoteXPCOMChild(pc->mTokenizer); + pc = pc->mPrevContext; + } +NS_IMPL_CYCLE_COLLECTION_TRAVERSE_END + +NS_IMPL_CYCLE_COLLECTING_ADDREF(nsParser) +NS_IMPL_CYCLE_COLLECTING_RELEASE(nsParser) +NS_INTERFACE_MAP_BEGIN_CYCLE_COLLECTION(nsParser) + NS_INTERFACE_MAP_ENTRY(nsIStreamListener) + NS_INTERFACE_MAP_ENTRY(nsIParser) + NS_INTERFACE_MAP_ENTRY(nsIRequestObserver) + NS_INTERFACE_MAP_ENTRY(nsISupportsWeakReference) + NS_INTERFACE_MAP_ENTRY_AMBIGUOUS(nsISupports, nsIParser) +NS_INTERFACE_MAP_END + +// The parser continue event is posted only if +// all of the data to parse has been passed to ::OnDataAvailable +// and the parser has been interrupted by the content sink +// because the processing of tokens took too long. + +nsresult +nsParser::PostContinueEvent() +{ + if (!(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT)) { + // If this flag isn't set, then there shouldn't be a live continue event! + NS_ASSERTION(!mContinueEvent, "bad"); + + // This creates a reference cycle between this and the event that is + // broken when the event fires. 
+ nsCOMPtr<nsIRunnable> event = new nsParserContinueEvent(this); + if (NS_FAILED(NS_DispatchToCurrentThread(event))) { + NS_WARNING("failed to dispatch parser continuation event"); + } else { + mFlags |= NS_PARSER_FLAG_PENDING_CONTINUE_EVENT; + mContinueEvent = event; + } + } + return NS_OK; +} + +NS_IMETHODIMP_(void) +nsParser::GetCommand(nsCString& aCommand) +{ + aCommand = mCommandStr; +} + +/** + * Call this method once you've created a parser, and want to instruct it + * about the command which caused the parser to be constructed. For example, + * this allows us to select a DTD which can do, say, view-source. + * + * @param aCommand the command string to set + */ +NS_IMETHODIMP_(void) +nsParser::SetCommand(const char* aCommand) +{ + mCommandStr.Assign(aCommand); + if (mCommandStr.EqualsLiteral("view-source")) { + mCommand = eViewSource; + } else if (mCommandStr.EqualsLiteral("view-fragment")) { + mCommand = eViewFragment; + } else { + mCommand = eViewNormal; + } +} + +/** + * Call this method once you've created a parser, and want to instruct it + * about the command which caused the parser to be constructed. For example, + * this allows us to select a DTD which can do, say, view-source. 
+ * + * @param aParserCommand the command to set + */ +NS_IMETHODIMP_(void) +nsParser::SetCommand(eParserCommands aParserCommand) +{ + mCommand = aParserCommand; +} + +/** + * Call this method once you've created a parser, and want to instruct it + * about what charset to load + * + * @param aCharset- the charset of a document + * @param aCharsetSource- the source of the charset + */ +NS_IMETHODIMP_(void) +nsParser::SetDocumentCharset(const nsACString& aCharset, int32_t aCharsetSource) +{ + mCharset = aCharset; + mCharsetSource = aCharsetSource; + if (mParserContext && mParserContext->mScanner) { + mParserContext->mScanner->SetDocumentCharset(aCharset, aCharsetSource); + } +} + +void +nsParser::SetSinkCharset(nsACString& aCharset) +{ + if (mSink) { + mSink->SetDocumentCharset(aCharset); + } +} + +/** + * This method gets called in order to set the content + * sink for this parser to dump nodes to. + * + * @param nsIContentSink interface for node receiver + */ +NS_IMETHODIMP_(void) +nsParser::SetContentSink(nsIContentSink* aSink) +{ + NS_PRECONDITION(aSink, "sink cannot be null!"); + mSink = aSink; + + if (mSink) { + mSink->SetParser(this); + nsCOMPtr<nsIHTMLContentSink> htmlSink = do_QueryInterface(mSink); + if (htmlSink) { + mIsAboutBlank = true; + } + } +} + +/** + * retrieve the sink set into the parser + * @return current sink + */ +NS_IMETHODIMP_(nsIContentSink*) +nsParser::GetContentSink() +{ + return mSink; +} + +static nsIDTD* +FindSuitableDTD(CParserContext& aParserContext) +{ + // We always find a DTD. + aParserContext.mAutoDetectStatus = ePrimaryDetect; + + // Quick check for view source. + MOZ_ASSERT(aParserContext.mParserCommand != eViewSource, + "The old parser is not supposed to be used for View Source " + "anymore."); + + // Now see if we're parsing HTML (which, as far as we're concerned, simply + // means "not XML"). + if (aParserContext.mDocType != eXML) { + return new CNavDTD(); + } + + // If we're here, then we'd better be parsing XML. 
+ NS_ASSERTION(aParserContext.mDocType == eXML, "What are you trying to send me, here?"); + return new nsExpatDriver(); +} + +NS_IMETHODIMP +nsParser::CancelParsingEvents() +{ + if (mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT) { + NS_ASSERTION(mContinueEvent, "mContinueEvent is null"); + // Revoke the pending continue parsing event + mContinueEvent = nullptr; + mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT; + } + return NS_OK; +} + +//////////////////////////////////////////////////////////////////////// + +/** + * Evalutes EXPR1 and EXPR2 exactly once each, in that order. Stores the value + * of EXPR2 in RV is EXPR2 fails, otherwise RV contains the result of EXPR1 + * (which could be success or failure). + * + * To understand the motivation for this construct, consider these example + * methods: + * + * nsresult nsSomething::DoThatThing(nsIWhatever* obj) { + * nsresult rv = NS_OK; + * ... + * return obj->DoThatThing(); + * NS_ENSURE_SUCCESS(rv, rv); + * ... + * return rv; + * } + * + * void nsCaller::MakeThingsHappen() { + * return mSomething->DoThatThing(mWhatever); + * } + * + * Suppose, for whatever reason*, we want to shift responsibility for calling + * mWhatever->DoThatThing() from nsSomething::DoThatThing up to + * nsCaller::MakeThingsHappen. We might rewrite the two methods as follows: + * + * nsresult nsSomething::DoThatThing() { + * nsresult rv = NS_OK; + * ... + * ... + * return rv; + * } + * + * void nsCaller::MakeThingsHappen() { + * nsresult rv; + * PREFER_LATTER_ERROR_CODE(mSomething->DoThatThing(), + * mWhatever->DoThatThing(), + * rv); + * return rv; + * } + * + * *Possible reasons include: nsCaller doesn't want to give mSomething access + * to mWhatever, nsCaller wants to guarantee that mWhatever->DoThatThing() will + * be called regardless of how nsSomething::DoThatThing behaves, &c. 
+ */ +#define PREFER_LATTER_ERROR_CODE(EXPR1, EXPR2, RV) { \ + nsresult RV##__temp = EXPR1; \ + RV = EXPR2; \ + if (NS_FAILED(RV)) { \ + RV = RV##__temp; \ + } \ +} + +/** + * This gets called just prior to the model actually + * being constructed. It's important to make this the + * last thing that happens right before parsing, so we + * can delay until the last moment the resolution of + * which DTD to use (unless of course we're assigned one). + */ +nsresult +nsParser::WillBuildModel(nsString& aFilename) +{ + if (!mParserContext) + return kInvalidParserContext; + + if (eUnknownDetect != mParserContext->mAutoDetectStatus) + return NS_OK; + + if (eDTDMode_unknown == mParserContext->mDTDMode || + eDTDMode_autodetect == mParserContext->mDTDMode) { + if (mIsAboutBlank) { + mParserContext->mDTDMode = eDTDMode_quirks; + mParserContext->mDocType = eHTML_Quirks; + } else { + mParserContext->mDTDMode = eDTDMode_full_standards; + mParserContext->mDocType = eXML; + } + } // else XML fragment with nested parser context + + NS_ASSERTION(!mDTD || !mParserContext->mPrevContext, + "Clobbering DTD for non-root parser context!"); + mDTD = FindSuitableDTD(*mParserContext); + NS_ENSURE_TRUE(mDTD, NS_ERROR_OUT_OF_MEMORY); + + nsITokenizer* tokenizer; + nsresult rv = mParserContext->GetTokenizer(mDTD, mSink, tokenizer); + NS_ENSURE_SUCCESS(rv, rv); + + rv = mDTD->WillBuildModel(*mParserContext, tokenizer, mSink); + nsresult sinkResult = mSink->WillBuildModel(mDTD->GetMode()); + // nsIDTD::WillBuildModel used to be responsible for calling + // nsIContentSink::WillBuildModel, but that obligation isn't expressible + // in the nsIDTD interface itself, so it's sounder and simpler to give that + // responsibility back to the parser. The former behavior of the DTD was to + // NS_ENSURE_SUCCESS the sink WillBuildModel call, so if the sink returns + // failure we should use sinkResult instead of rv, to preserve the old error + // handling behavior of the DTD: + return NS_FAILED(sinkResult) ? 
sinkResult : rv; +} + +/** + * This gets called when the parser is done with its input. + * Note that the parser may have been called recursively, so we + * have to check for a prev. context before closing out the DTD/sink. + */ +nsresult +nsParser::DidBuildModel(nsresult anErrorCode) +{ + nsresult result = anErrorCode; + + if (IsComplete()) { + if (mParserContext && !mParserContext->mPrevContext) { + // Let sink know if we're about to end load because we've been terminated. + // In that case we don't want it to run deferred scripts. + bool terminated = mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING; + if (mDTD && mSink) { + nsresult dtdResult = mDTD->DidBuildModel(anErrorCode), + sinkResult = mSink->DidBuildModel(terminated); + // nsIDTD::DidBuildModel used to be responsible for calling + // nsIContentSink::DidBuildModel, but that obligation isn't expressible + // in the nsIDTD interface itself, so it's sounder and simpler to give + // that responsibility back to the parser. The former behavior of the + // DTD was to NS_ENSURE_SUCCESS the sink DidBuildModel call, so if the + // sink returns failure we should use sinkResult instead of dtdResult, + // to preserve the old error handling behavior of the DTD: + result = NS_FAILED(sinkResult) ? sinkResult : dtdResult; + } + + //Ref. to bug 61462. + mParserContext->mRequest = nullptr; + } + } + + return result; +} + +/** + * This method adds a new parser context to the list, + * pushing the current one to the next position. + * + * @param ptr to new context + */ +void +nsParser::PushContext(CParserContext& aContext) +{ + NS_ASSERTION(aContext.mPrevContext == mParserContext, + "Trying to push a context whose previous context differs from " + "the current parser context."); + mParserContext = &aContext; +} + +/** + * This method pops the topmost context off the stack, + * returning it to the user. The next context (if any) + * becomes the current context. + * @update gess7/22/98 + * @return prev. 
context + */ +CParserContext* +nsParser::PopContext() +{ + CParserContext* oldContext = mParserContext; + if (oldContext) { + mParserContext = oldContext->mPrevContext; + if (mParserContext) { + // If the old context was blocked, propagate the blocked state + // back to the new one. Also, propagate the stream listener state + // but don't override onStop state to guarantee the call to DidBuildModel(). + if (mParserContext->mStreamListenerState != eOnStop) { + mParserContext->mStreamListenerState = oldContext->mStreamListenerState; + } + } + } + return oldContext; +} + +/** + * Call this when you want control whether or not the parser will parse + * and tokenize input (TRUE), or whether it just caches input to be + * parsed later (FALSE). + * + * @param aState determines whether we parse/tokenize or just cache. + * @return current state + */ +void +nsParser::SetUnusedInput(nsString& aBuffer) +{ + mUnusedInput = aBuffer; +} + +/** + * Call this when you want to *force* the parser to terminate the + * parsing process altogether. This is binary -- so once you terminate + * you can't resume without restarting altogether. + */ +NS_IMETHODIMP +nsParser::Terminate(void) +{ + // We should only call DidBuildModel once, so don't do anything if this is + // the second time that Terminate has been called. + if (mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING) { + return NS_OK; + } + + nsresult result = NS_OK; + // XXX - [ until we figure out a way to break parser-sink circularity ] + // Hack - Hold a reference until we are completely done... + nsCOMPtr<nsIParser> kungFuDeathGrip(this); + mInternalState = result = NS_ERROR_HTMLPARSER_STOPPARSING; + + // CancelParsingEvents must be called to avoid leaking the nsParser object + // @see bug 108049 + // If NS_PARSER_FLAG_PENDING_CONTINUE_EVENT is set then CancelParsingEvents + // will reset it so DidBuildModel will call DidBuildModel on the DTD. 
Note: + // The IsComplete() call inside of DidBuildModel looks at the pendingContinueEvents flag. + CancelParsingEvents(); + + // If we got interrupted in the middle of a document.write, then we might + // have more than one parser context on our parsercontext stack. This has + // the effect of making DidBuildModel a no-op, meaning that we never call + // our sink's DidBuildModel and break the reference cycle, causing a leak. + // Since we're getting terminated, we manually clean up our context stack. + while (mParserContext && mParserContext->mPrevContext) { + CParserContext *prev = mParserContext->mPrevContext; + delete mParserContext; + mParserContext = prev; + } + + if (mDTD) { + mDTD->Terminate(); + DidBuildModel(result); + } else if (mSink) { + // We have no parser context or no DTD yet (so we got terminated before we + // got any data). Manually break the reference cycle with the sink. + result = mSink->DidBuildModel(true); + NS_ENSURE_SUCCESS(result, result); + } + + return NS_OK; +} + +NS_IMETHODIMP +nsParser::ContinueInterruptedParsing() +{ + // If there are scripts executing, then the content sink is jumping the gun + // (probably due to a synchronous XMLHttpRequest) and will re-enable us + // later, see bug 460706. + if (!IsOkToProcessNetworkData()) { + return NS_OK; + } + + // If the stream has already finished, there's a good chance + // that we might start closing things down when the parser + // is reenabled. To make sure that we're not deleted across + // the reenabling process, hold a reference to ourselves. 
+ nsresult result=NS_OK; + nsCOMPtr<nsIParser> kungFuDeathGrip(this); + nsCOMPtr<nsIContentSink> sinkDeathGrip(mSink); + +#ifdef DEBUG + if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) { + NS_WARNING("Don't call ContinueInterruptedParsing on a blocked parser."); + } +#endif + + bool isFinalChunk = mParserContext && + mParserContext->mStreamListenerState == eOnStop; + + mProcessingNetworkData = true; + if (sinkDeathGrip) { + sinkDeathGrip->WillParse(); + } + result = ResumeParse(true, isFinalChunk); // Ref. bug 57999 + mProcessingNetworkData = false; + + if (result != NS_OK) { + result=mInternalState; + } + + return result; +} + +/** + * Stops parsing temporarily. That's it will prevent the + * parser from building up content model. + */ +NS_IMETHODIMP_(void) +nsParser::BlockParser() +{ + mFlags &= ~NS_PARSER_FLAG_PARSER_ENABLED; +} + +/** + * Open up the parser for tokenization, building up content + * model..etc. However, this method does not resume parsing + * automatically. It's the callers' responsibility to restart + * the parsing engine. + */ +NS_IMETHODIMP_(void) +nsParser::UnblockParser() +{ + if (!(mFlags & NS_PARSER_FLAG_PARSER_ENABLED)) { + mFlags |= NS_PARSER_FLAG_PARSER_ENABLED; + } else { + NS_WARNING("Trying to unblock an unblocked parser."); + } +} + +NS_IMETHODIMP_(void) +nsParser::ContinueInterruptedParsingAsync() +{ + mSink->ContinueInterruptedParsingAsync(); +} + +/** + * Call this to query whether the parser is enabled or not. + */ +NS_IMETHODIMP_(bool) +nsParser::IsParserEnabled() +{ + return (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) != 0; +} + +/** + * Call this to query whether the parser thinks it's done with parsing. + */ +NS_IMETHODIMP_(bool) +nsParser::IsComplete() +{ + return !(mFlags & NS_PARSER_FLAG_PENDING_CONTINUE_EVENT); +} + + +void nsParser::HandleParserContinueEvent(nsParserContinueEvent *ev) +{ + // Ignore any revoked continue events... 
+ if (mContinueEvent != ev) + return; + + mFlags &= ~NS_PARSER_FLAG_PENDING_CONTINUE_EVENT; + mContinueEvent = nullptr; + + NS_ASSERTION(IsOkToProcessNetworkData(), + "Interrupted in the middle of a script?"); + ContinueInterruptedParsing(); +} + +bool +nsParser::IsInsertionPointDefined() +{ + return false; +} + +void +nsParser::PushDefinedInsertionPoint() +{ +} + +void +nsParser::PopDefinedInsertionPoint() +{ +} + +void +nsParser::MarkAsNotScriptCreated(const char* aCommand) +{ +} + +bool +nsParser::IsScriptCreated() +{ + return false; +} + +/** + * This is the main controlling routine in the parsing process. + * Note that it may get called multiple times for the same scanner, + * since this is a pushed based system, and all the tokens may + * not have been consumed by the scanner during a given invocation + * of this method. + */ +NS_IMETHODIMP +nsParser::Parse(nsIURI* aURL, + nsIRequestObserver* aListener, + void* aKey, + nsDTDMode aMode) +{ + + NS_PRECONDITION(aURL, "Error: Null URL given"); + + nsresult result=kBadURL; + mObserver = aListener; + + if (aURL) { + nsAutoCString spec; + nsresult rv = aURL->GetSpec(spec); + if (rv != NS_OK) { + return rv; + } + NS_ConvertUTF8toUTF16 theName(spec); + + nsScanner* theScanner = new nsScanner(theName, false); + CParserContext* pc = new CParserContext(mParserContext, theScanner, aKey, + mCommand, aListener); + if (pc && theScanner) { + pc->mMultipart = true; + pc->mContextType = CParserContext::eCTURL; + pc->mDTDMode = aMode; + PushContext(*pc); + + result = NS_OK; + } else { + result = mInternalState = NS_ERROR_HTMLPARSER_BADCONTEXT; + } + } + return result; +} + +/** + * Used by XML fragment parsing below. + * + * @param aSourceBuffer contains a string-full of real content + */ +nsresult +nsParser::Parse(const nsAString& aSourceBuffer, + void* aKey, + bool aLastCall) +{ + nsresult result = NS_OK; + + // Don't bother if we're never going to parse this. 
+ if (mInternalState == NS_ERROR_HTMLPARSER_STOPPARSING) { + return result; + } + + if (!aLastCall && aSourceBuffer.IsEmpty()) { + // Nothing is being passed to the parser so return + // immediately. mUnusedInput will get processed when + // some data is actually passed in. + // But if this is the last call, make sure to finish up + // stuff correctly. + return result; + } + + // Maintain a reference to ourselves so we don't go away + // till we're completely done. + nsCOMPtr<nsIParser> kungFuDeathGrip(this); + + if (aLastCall || !aSourceBuffer.IsEmpty() || !mUnusedInput.IsEmpty()) { + // Note: The following code will always find the parser context associated + // with the given key, even if that context has been suspended (e.g., for + // another document.write call). This doesn't appear to be exactly what IE + // does in the case where this happens, but this makes more sense. + CParserContext* pc = mParserContext; + while (pc && pc->mKey != aKey) { + pc = pc->mPrevContext; + } + + if (!pc) { + // Only make a new context if we don't have one, OR if we do, but has a + // different context key. + nsScanner* theScanner = new nsScanner(mUnusedInput); + NS_ENSURE_TRUE(theScanner, NS_ERROR_OUT_OF_MEMORY); + + eAutoDetectResult theStatus = eUnknownDetect; + + if (mParserContext && + mParserContext->mMimeType.EqualsLiteral("application/xml")) { + // Ref. Bug 90379 + NS_ASSERTION(mDTD, "How come the DTD is null?"); + + if (mParserContext) { + theStatus = mParserContext->mAutoDetectStatus; + // Added this to fix bug 32022. 
+ } + } + + pc = new CParserContext(mParserContext, theScanner, aKey, mCommand, + 0, theStatus, aLastCall); + NS_ENSURE_TRUE(pc, NS_ERROR_OUT_OF_MEMORY); + + PushContext(*pc); + + pc->mMultipart = !aLastCall; // By default + if (pc->mPrevContext) { + pc->mMultipart |= pc->mPrevContext->mMultipart; + } + + // Start fix bug 40143 + if (pc->mMultipart) { + pc->mStreamListenerState = eOnDataAvail; + if (pc->mScanner) { + pc->mScanner->SetIncremental(true); + } + } else { + pc->mStreamListenerState = eOnStop; + if (pc->mScanner) { + pc->mScanner->SetIncremental(false); + } + } + // end fix for 40143 + + pc->mContextType=CParserContext::eCTString; + pc->SetMimeType(NS_LITERAL_CSTRING("application/xml")); + pc->mDTDMode = eDTDMode_full_standards; + + mUnusedInput.Truncate(); + + pc->mScanner->Append(aSourceBuffer); + // Do not interrupt document.write() - bug 95487 + result = ResumeParse(false, false, false); + } else { + pc->mScanner->Append(aSourceBuffer); + if (!pc->mPrevContext) { + // Set stream listener state to eOnStop, on the final context - Fix 68160, + // to guarantee DidBuildModel() call - Fix 36148 + if (aLastCall) { + pc->mStreamListenerState = eOnStop; + pc->mScanner->SetIncremental(false); + } + + if (pc == mParserContext) { + // If pc is not mParserContext, then this call to ResumeParse would + // do the wrong thing and try to continue parsing using + // mParserContext. We need to wait to actually resume parsing on pc. 
+ ResumeParse(false, false, false); + } + } + } + } + + return result; +} + +NS_IMETHODIMP +nsParser::ParseFragment(const nsAString& aSourceBuffer, + nsTArray<nsString>& aTagStack) +{ + nsresult result = NS_OK; + nsAutoString theContext; + uint32_t theCount = aTagStack.Length(); + uint32_t theIndex = 0; + + // Disable observers for fragments + mFlags &= ~NS_PARSER_FLAG_OBSERVERS_ENABLED; + + for (theIndex = 0; theIndex < theCount; theIndex++) { + theContext.Append('<'); + theContext.Append(aTagStack[theCount - theIndex - 1]); + theContext.Append('>'); + } + + if (theCount == 0) { + // Ensure that the buffer is not empty. Because none of the DTDs care + // about leading whitespace, this doesn't change the result. + theContext.Assign(' '); + } + + // First, parse the context to build up the DTD's tag stack. Note that we + // pass false for the aLastCall parameter. + result = Parse(theContext, + (void*)&theContext, + false); + if (NS_FAILED(result)) { + mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED; + return result; + } + + if (!mSink) { + // Parse must have failed in the XML case and so the sink was killed. + return NS_ERROR_HTMLPARSER_STOPPARSING; + } + + nsCOMPtr<nsIFragmentContentSink> fragSink = do_QueryInterface(mSink); + NS_ASSERTION(fragSink, "ParseFragment requires a fragment content sink"); + + fragSink->WillBuildContent(); + // Now, parse the actual content. Note that this is the last call + // for HTML content, but for XML, we will want to build and parse + // the end tags. However, if tagStack is empty, it's the last call + // for XML as well. + if (theCount == 0) { + result = Parse(aSourceBuffer, + &theContext, + true); + fragSink->DidBuildContent(); + } else { + // Add an end tag chunk, so expat will read the whole source buffer, + // and not worry about ']]' etc. 
+ result = Parse(aSourceBuffer + NS_LITERAL_STRING("</"), + &theContext, + false); + fragSink->DidBuildContent(); + + if (NS_SUCCEEDED(result)) { + nsAutoString endContext; + for (theIndex = 0; theIndex < theCount; theIndex++) { + // we already added an end tag chunk above + if (theIndex > 0) { + endContext.AppendLiteral("</"); + } + + nsString& thisTag = aTagStack[theIndex]; + // was there an xmlns=? + int32_t endOfTag = thisTag.FindChar(char16_t(' ')); + if (endOfTag == -1) { + endContext.Append(thisTag); + } else { + endContext.Append(Substring(thisTag,0,endOfTag)); + } + + endContext.Append('>'); + } + + result = Parse(endContext, + &theContext, + true); + } + } + + mFlags |= NS_PARSER_FLAG_OBSERVERS_ENABLED; + + return result; +} + +/** + * This routine is called to cause the parser to continue parsing its + * underlying stream. This call allows the parse process to happen in + * chunks, such as when the content is push based, and we need to parse in + * pieces. + * + * An interesting change in how the parser gets used has led us to add extra + * processing to this method. The case occurs when the parser is blocked in + * one context, and gets a parse(string) call in another context. In this + * case, the parserContexts are linked. No problem. + * + * The problem is that Parse(string) assumes that it can proceed unabated, + * but if the parser is already blocked that assumption is false. So we + * needed to add a mechanism here to allow the parser to continue to process + * (the pop and free) contexts until 1) it get's blocked again; 2) it runs + * out of contexts. + * + * + * @param allowItertion : set to true if non-script resumption is requested + * @param aIsFinalChunk : tells us when the last chunk of data is provided. + * @return error code -- 0 if ok, non-zero if error. 
*/
nsresult
nsParser::ResumeParse(bool allowIteration, bool aIsFinalChunk,
                      bool aCanInterrupt)
{
  // NOTE(review): aCanInterrupt is not referenced anywhere in this body.
  nsresult result = NS_OK;

  // Nothing is done (and NS_OK is returned) unless the parser is enabled and
  // has not already been told to stop.
  if ((mFlags & NS_PARSER_FLAG_PARSER_ENABLED) &&
      mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) {

    // NOTE(review): mParserContext and its mScanner are dereferenced here
    // without a null check, although the loop below tests
    // mParserContext->mScanner before using it -- confirm callers guarantee
    // both are set.
    result = WillBuildModel(mParserContext->mScanner->GetFilename());
    if (NS_FAILED(result)) {
      mFlags &= ~NS_PARSER_FLAG_CAN_TOKENIZE;
      return result;
    }

    if (mDTD) {
      mSink->WillResume();
      bool theIterationIsOk = true;

      while (result == NS_OK && theIterationIsOk) {
        if (!mUnusedInput.IsEmpty() && mParserContext->mScanner) {
          // -- Ref: Bug# 22485 --
          // Insert the unused input into the source buffer
          // as if it was read from the input stream.
          // Adding UngetReadable() per vidur!!
          mParserContext->mScanner->UngetReadable(mUnusedInput);
          mUnusedInput.Truncate(0);
        }

        // Only allow parsing to be interrupted in the subsequent call to
        // build model.
        nsresult theTokenizerResult = (mFlags & NS_PARSER_FLAG_CAN_TOKENIZE)
                                      ? Tokenize(aIsFinalChunk)
                                      : NS_OK;
        result = BuildModel();

        // An interruption on the final chunk is rescheduled, since no further
        // data will arrive to restart parsing.
        if (result == NS_ERROR_HTMLPARSER_INTERRUPTED && aIsFinalChunk) {
          PostContinueEvent();
        }

        theIterationIsOk = theTokenizerResult != kEOF &&
                           result != NS_ERROR_HTMLPARSER_INTERRUPTED;

        // Make sure not to stop parsing too early. Therefore, before shutting
        // down the parser, it's important to check whether the input buffer
        // has been scanned to completion (theTokenizerResult should be kEOF).
        // kEOF -> End of buffer.

        // If we're told to block the parser, we disable all further parsing
        // (and cache any data coming in) until the parser is re-enabled.
        if (NS_ERROR_HTMLPARSER_BLOCK == result) {
          mSink->WillInterrupt();
          if (mFlags & NS_PARSER_FLAG_PARSER_ENABLED) {
            // If we were blocked by a recursive invocation, don't re-block.
            BlockParser();
          }
          return NS_OK;
        }
        if (NS_ERROR_HTMLPARSER_STOPPARSING == result) {
          // Note: Parser Terminate() calls DidBuildModel.
          if (mInternalState != NS_ERROR_HTMLPARSER_STOPPARSING) {
            DidBuildModel(mStreamStatus);
            mInternalState = result;
          }

          return NS_OK;
        }
        // Reached the end of the current buffer (or were interrupted): decide
        // whether this context is finished and, if so, finish the model or
        // pop back to the previous context.
        if ((NS_OK == result && theTokenizerResult == kEOF) ||
             result == NS_ERROR_HTMLPARSER_INTERRUPTED) {
          bool theContextIsStringBased =
            CParserContext::eCTString == mParserContext->mContextType;

          if (mParserContext->mStreamListenerState == eOnStop ||
              !mParserContext->mMultipart || theContextIsStringBased) {
            if (!mParserContext->mPrevContext) {
              // Root context: only finish once the stream has stopped.
              if (mParserContext->mStreamListenerState == eOnStop) {
                DidBuildModel(mStreamStatus);
                return NS_OK;
              }
            } else {
              // Nested context: pop it, carry its unused data forward if it
              // asked for that, and free it.
              CParserContext* theContext = PopContext();
              if (theContext) {
                theIterationIsOk = allowIteration && theContextIsStringBased;
                if (theContext->mCopyUnused) {
                  if (!theContext->mScanner->CopyUnusedData(mUnusedInput)) {
                    mInternalState = NS_ERROR_OUT_OF_MEMORY;
                  }
                }

                delete theContext;
              }

              result = mInternalState;
              aIsFinalChunk = mParserContext &&
                              mParserContext->mStreamListenerState == eOnStop;
              // ...then intentionally fall through to mSink->WillInterrupt()...
            }
          }
        }

        if (theTokenizerResult == kEOF ||
            result == NS_ERROR_HTMLPARSER_INTERRUPTED) {
          // An interruption is not reported to the caller as an error.
          result = (result == NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
          mSink->WillInterrupt();
        }
      }
    } else {
      mInternalState = result = NS_ERROR_HTMLPARSER_UNRESOLVEDDTD;
    }
  }

  return (result == NS_ERROR_HTMLPARSER_INTERRUPTED) ? NS_OK : result;
}

/**
 *  This is where we loop over the tokens created in the
 *  tokenization phase, and try to make sense out of them.
*/
nsresult
nsParser::BuildModel()
{
  nsITokenizer* theTokenizer = nullptr;

  nsresult result = NS_OK;
  if (mParserContext) {
    result = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer);
  }

  if (NS_SUCCEEDED(result)) {
    // With no DTD there is nothing to build; result stays as it was.
    if (mDTD) {
      result = mDTD->BuildModel(theTokenizer, mSink);
    }
  } else {
    // Any GetTokenizer failure is reported as a bad tokenizer and recorded
    // in mInternalState.
    mInternalState = result = NS_ERROR_HTMLPARSER_BADTOKENIZER;
  }
  return result;
}

/*******************************************************************
  These methods are used to talk to the netlib system...
 *******************************************************************/

// Stream-listener start notification: forwards to mObserver (if any), records
// the request and listener state on the current parser context, clears mDTD,
// and copies the channel's content type (when available) into the context.
// Always returns NS_OK.
nsresult
nsParser::OnStartRequest(nsIRequest *request, nsISupports* aContext)
{
  NS_PRECONDITION(eNone == mParserContext->mStreamListenerState,
                  "Parser's nsIStreamListener API was not setup "
                  "correctly in constructor.");
  if (mObserver) {
    mObserver->OnStartRequest(request, aContext);
  }
  mParserContext->mStreamListenerState = eOnStart;
  mParserContext->mAutoDetectStatus = eUnknownDetect;
  mParserContext->mRequest = request;

  NS_ASSERTION(!mParserContext->mPrevContext,
               "Clobbering DTD for non-root parser context!");
  mDTD = nullptr;

  nsresult rv;
  nsAutoCString contentType;
  nsCOMPtr<nsIChannel> channel = do_QueryInterface(request);
  if (channel) {
    rv = channel->GetContentType(contentType);
    if (NS_SUCCEEDED(rv)) {
      mParserContext->SetMimeType(contentType);
    }
  }

  // A failure to read the content type is deliberately not propagated.
  rv = NS_OK;

  return rv;
}

// Scans the first aLen bytes at aBytes for an XML declaration ("<?xml ...")
// and, if a version pseudo-attribute followed by an encoding pseudo-attribute
// is found before "?>", copies the quoted encoding value into oCharset.
// A value that compares equal to "UTF-16" is not copied.
// Returns true when oCharset ends up non-empty.
static bool
ExtractCharsetFromXmlDeclaration(const unsigned char* aBytes, int32_t aLen,
                                 nsCString& oCharset)
{
  // This code is rather pointless to have. Might as well reuse expat as
  // seen in nsHtml5StreamParser. -- hsivonen
  oCharset.Truncate();
  if ((aLen >= 5) &&
      ('<' == aBytes[0]) &&
      ('?' == aBytes[1]) &&
      ('x' == aBytes[2]) &&
      ('m' == aBytes[3]) &&
      ('l' == aBytes[4])) {
    int32_t i;
    bool versionFound = false, encodingFound = false;
    for (i = 6; i < aLen && !encodingFound; ++i) {
      // end of XML declaration?
      if ((((char*) aBytes)[i] == '?') &&
          ((i + 1) < aLen) &&
          (((char*) aBytes)[i + 1] == '>')) {
        break;
      }
      // Version is required.
      if (!versionFound) {
        // Want to avoid string comparisons, hence looking for 'n'
        // and only if found check the string leading to it. Not
        // foolproof, but fast.
        // The shortest string allowed before this is  (strlen==13):
        // <?xml version
        if ((((char*) aBytes)[i] == 'n') &&
            (i >= 12) &&
            (0 == PL_strncmp("versio", (char*) (aBytes + i - 6), 6))) {
          // Fast forward through version
          char q = 0;
          for (++i; i < aLen; ++i) {
            char qi = ((char*) aBytes)[i];
            if (qi == '\'' || qi == '"') {
              if (q && q == qi) {
                //  ending quote
                versionFound = true;
                break;
              } else {
                // Starting quote
                q = qi;
              }
            }
          }
        }
      } else {
        // encoding must follow version
        // Want to avoid string comparisons, hence looking for 'g'
        // and only if found check the string leading to it. Not
        // foolproof, but fast.
        // The shortest allowed string before this (strlen==26):
        // <?xml version="1" encoding
        if ((((char*) aBytes)[i] == 'g') && (i >= 25) && (0 == PL_strncmp(
            "encodin", (char*) (aBytes + i - 7), 7))) {
          int32_t encStart = 0;
          char q = 0;
          for (++i; i < aLen; ++i) {
            char qi = ((char*) aBytes)[i];
            if (qi == '\'' || qi == '"') {
              if (q && q == qi) {
                int32_t count = i - encStart;
                // encoding value is invalid if it is UTF-16
                // NOTE(review): the comparison length is the value's own
                // length, so a value that is a proper prefix of "UTF-16"
                // (e.g. "UTF") is also skipped -- confirm that is acceptable.
                if (count > 0 && PL_strncasecmp("UTF-16",
                    (char*) (aBytes + encStart), count)) {
                  oCharset.Assign((char*) (aBytes + encStart), count);
                }
                encodingFound = true;
                break;
              } else {
                // Opening quote: the value starts on the next byte.
                encStart = i + 1;
                q = qi;
              }
            }
          }
        }
      } // if (!versionFound)
    } // for
  }
  return !oCharset.IsEmpty();
}

// Advances aStart by one and returns the character it then points at, or
// '\0' if the advance reached aEnd. aStart must not equal aEnd on entry.
inline char
GetNextChar(nsACString::const_iterator& aStart,
            nsACString::const_iterator& aEnd)
{
  NS_ASSERTION(aStart != aEnd, "end of buffer");
  return (++aStart != aEnd) ? *aStart : '\0';
}

// ReadSegments() callback that reports the whole segment as consumed without
// looking at it.
static nsresult
NoOpParserWriteFunc(nsIInputStream* in,
                void* closure,
                const char* fromRawSegment,
                uint32_t toOffset,
                uint32_t count,
                uint32_t *writeCount)
{
  *writeCount = count;
  return NS_OK;
}

// Closure handed to ParserWriteFunc through ReadSegments().
typedef struct {
  bool mNeedCharsetCheck;   // cleared by ParserWriteFunc once it has sniffed a segment
  nsParser* mParser;        // parser whose charset is queried and updated
  nsScanner* mScanner;      // scanner the raw segment is appended to
  nsIRequest* mRequest;     // not read by ParserWriteFunc
} ParserWriteStruct;

/*
 * This function is invoked as a result of a call to a stream's
 * ReadSegments() method. It is called for each contiguous buffer
 * of data in the underlying stream or pipe. Using ReadSegments
 * allows us to avoid copying data to read out of the stream.
 */
static nsresult
ParserWriteFunc(nsIInputStream* in,
                void* closure,
                const char* fromRawSegment,
                uint32_t toOffset,
                uint32_t count,
                uint32_t *writeCount)
{
  nsresult result;
  ParserWriteStruct* pws = static_cast<ParserWriteStruct*>(closure);
  const unsigned char* buf =
    reinterpret_cast<const unsigned char*> (fromRawSegment);
  uint32_t theNumRead = count;

  if (!pws) {
    return NS_ERROR_FAILURE;
  }

  // Charset sniffing runs only on the first segment seen through this
  // closure; the flag is cleared before anything else happens.
  if (pws->mNeedCharsetCheck) {
    pws->mNeedCharsetCheck = false;
    int32_t source;
    nsAutoCString preferred;
    nsAutoCString maybePrefer;
    pws->mParser->GetDocumentCharset(preferred, source);

    // This code was bogus when I found it. It expects the BOM or the XML
    // declaration to be entirely in the first network buffer. -- hsivonen
    if (nsContentUtils::CheckForBOM(buf, count, maybePrefer)) {
      // The decoder will swallow the BOM. The UTF-16 will re-sniff for
      // endianness. The value of preferred is now either "UTF-8" or "UTF-16".
preferred.Assign(maybePrefer);
      source = kCharsetFromByteOrderMark;
    } else if (source < kCharsetFromChannel) {
      // No BOM: fall back to an XML declaration, but only when the current
      // charset source ranks below kCharsetFromChannel.
      nsAutoCString declCharset;

      if (ExtractCharsetFromXmlDeclaration(buf, count, declCharset)) {
        if (EncodingUtils::FindEncodingForLabel(declCharset, maybePrefer)) {
          preferred.Assign(maybePrefer);
          source = kCharsetFromMetaTag;
        }
      }
    }

    pws->mParser->SetDocumentCharset(preferred, source);
    pws->mParser->SetSinkCharset(preferred);

  }

  // NOTE(review): pws->mScanner is used without a null check; the caller in
  // OnDataAvailable copies theContext->mScanner into it unchecked.
  result = pws->mScanner->Append(fromRawSegment, theNumRead);
  if (NS_SUCCEEDED(result)) {
    // *writeCount is only written on success.
    *writeCount = count;
  }

  return result;
}

// Stream-listener data notification: finds the parser context that owns
// |request|, appends the available bytes to that context's scanner via
// ReadSegments(ParserWriteFunc), and then resumes parsing if it is OK to
// process network data. Returns NS_ERROR_UNEXPECTED when no context matches
// the request.
nsresult
nsParser::OnDataAvailable(nsIRequest *request, nsISupports* aContext,
                          nsIInputStream *pIStream, uint64_t sourceOffset,
                          uint32_t aLength)
{
  NS_PRECONDITION((eOnStart == mParserContext->mStreamListenerState ||
                   eOnDataAvail == mParserContext->mStreamListenerState),
            "Error: OnStartRequest() must be called before OnDataAvailable()");
  NS_PRECONDITION(NS_InputStreamIsBuffered(pIStream),
                  "Must have a buffered input stream");

  nsresult rv = NS_OK;

  if (mIsAboutBlank) {
    MOZ_ASSERT(false, "Must not get OnDataAvailable for about:blank");
    // ... but if an extension tries to feed us data for about:blank in a
    // release build, silently ignore the data.
    uint32_t totalRead;
    rv = pIStream->ReadSegments(NoOpParserWriteFunc,
                                nullptr,
                                aLength,
                                &totalRead);
    return rv;
  }

  CParserContext *theContext = mParserContext;

  // Walk the context chain to the one created for this request.
  while (theContext && theContext->mRequest != request) {
    theContext = theContext->mPrevContext;
  }

  if (theContext) {
    theContext->mStreamListenerState = eOnDataAvail;

    if (eInvalidDetect == theContext->mAutoDetectStatus) {
      if (theContext->mScanner) {
        // Move the scanner position to the end of what it currently holds.
        nsScannerIterator iter;
        theContext->mScanner->EndReading(iter);
        theContext->mScanner->SetPosition(iter, true);
      }
    }

    uint32_t totalRead;
    ParserWriteStruct pws;
    // NOTE(review): the charset check is re-armed on every OnDataAvailable
    // call, not only the first one for a request.
    pws.mNeedCharsetCheck = true;
    pws.mParser = this;
    pws.mScanner = theContext->mScanner;
    pws.mRequest = request;

    rv = pIStream->ReadSegments(ParserWriteFunc, &pws, aLength, &totalRead);
    if (NS_FAILED(rv)) {
      return rv;
    }

    if (IsOkToProcessNetworkData()) {
      // Hold references to the parser and the sink for the duration of the
      // ResumeParse() call.
      nsCOMPtr<nsIParser> kungFuDeathGrip(this);
      nsCOMPtr<nsIContentSink> sinkDeathGrip(mSink);
      mProcessingNetworkData = true;
      if (sinkDeathGrip) {
        sinkDeathGrip->WillParse();
      }
      rv = ResumeParse();
      mProcessingNetworkData = false;
    }
  } else {
    rv = NS_ERROR_UNEXPECTED;
  }

  return rv;
}

/**
 *  This is called by the networking library once the last block of data
 *  has been collected from the net.
 */
nsresult
nsParser::OnStopRequest(nsIRequest *request, nsISupports* aContext,
                        nsresult status)
{
  nsresult rv = NS_OK;

  // Mark the context that owns this request as stopped and switch its
  // scanner out of incremental mode.
  CParserContext *pc = mParserContext;
  while (pc) {
    if (pc->mRequest == request) {
      pc->mStreamListenerState = eOnStop;
      // NOTE(review): pc->mScanner is not null-checked here, unlike in
      // OnDataAvailable.
      pc->mScanner->SetIncremental(false);
      break;
    }

    pc = pc->mPrevContext;
  }

  mStreamStatus = status;

  // rv is still NS_OK at this point, so NS_SUCCEEDED(rv) is always true.
  if (IsOkToProcessNetworkData() && NS_SUCCEEDED(rv)) {
    mProcessingNetworkData = true;
    if (mSink) {
      mSink->WillParse();
    }
    rv = ResumeParse(true, true);
    mProcessingNetworkData = false;
  }

  // If the parser isn't enabled, we don't finish parsing till
  // it is reenabled.
+ + + // XXX Should we wait to notify our observers as well if the + // parser isn't yet enabled? + if (mObserver) { + mObserver->OnStopRequest(request, aContext, status); + } + + return rv; +} + + +/******************************************************************* + Here come the tokenization methods... + *******************************************************************/ + + +/** + * Part of the code sandwich, this gets called right before + * the tokenization process begins. The main reason for + * this call is to allow the delegate to do initialization. + */ +bool +nsParser::WillTokenize(bool aIsFinalChunk) +{ + if (!mParserContext) { + return true; + } + + nsITokenizer* theTokenizer; + nsresult result = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer); + NS_ENSURE_SUCCESS(result, false); + return NS_SUCCEEDED(theTokenizer->WillTokenize(aIsFinalChunk)); +} + + +/** + * This is the primary control routine to consume tokens. + * It iteratively consumes tokens until an error occurs or + * you run out of data. + */ +nsresult nsParser::Tokenize(bool aIsFinalChunk) +{ + nsITokenizer* theTokenizer; + + nsresult result = NS_ERROR_NOT_AVAILABLE; + if (mParserContext) { + result = mParserContext->GetTokenizer(mDTD, mSink, theTokenizer); + } + + if (NS_SUCCEEDED(result)) { + bool flushTokens = false; + + bool killSink = false; + + WillTokenize(aIsFinalChunk); + while (NS_SUCCEEDED(result)) { + mParserContext->mScanner->Mark(); + result = theTokenizer->ConsumeToken(*mParserContext->mScanner, + flushTokens); + if (NS_FAILED(result)) { + mParserContext->mScanner->RewindToMark(); + if (kEOF == result){ + break; + } + if (NS_ERROR_HTMLPARSER_STOPPARSING == result) { + killSink = true; + result = Terminate(); + break; + } + } else if (flushTokens && (mFlags & NS_PARSER_FLAG_OBSERVERS_ENABLED)) { + // I added the extra test of NS_PARSER_FLAG_OBSERVERS_ENABLED to fix Bug# 23931. 
+ // Flush tokens on seeing </SCRIPT> -- Ref: Bug# 22485 -- + // Also remember to update the marked position. + mFlags |= NS_PARSER_FLAG_FLUSH_TOKENS; + mParserContext->mScanner->Mark(); + break; + } + } + + if (killSink) { + mSink = nullptr; + } + } else { + result = mInternalState = NS_ERROR_HTMLPARSER_BADTOKENIZER; + } + + return result; +} + +/** + * Get the channel associated with this parser + * + * @param aChannel out param that will contain the result + * @return NS_OK if successful + */ +NS_IMETHODIMP +nsParser::GetChannel(nsIChannel** aChannel) +{ + nsresult result = NS_ERROR_NOT_AVAILABLE; + if (mParserContext && mParserContext->mRequest) { + result = CallQueryInterface(mParserContext->mRequest, aChannel); + } + return result; +} + +/** + * Get the DTD associated with this parser + */ +NS_IMETHODIMP +nsParser::GetDTD(nsIDTD** aDTD) +{ + if (mParserContext) { + NS_IF_ADDREF(*aDTD = mDTD); + } + + return NS_OK; +} + +/** + * Get this as nsIStreamListener + */ +nsIStreamListener* +nsParser::GetStreamListener() +{ + return this; +} diff --git a/components/htmlparser/src/nsParser.h b/components/htmlparser/src/nsParser.h new file mode 100644 index 000000000..39bfe03b8 --- /dev/null +++ b/components/htmlparser/src/nsParser.h @@ -0,0 +1,398 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +/** + * MODULE NOTES: + * + * This class does two primary jobs: + * 1) It iterates the tokens provided during the + * tokenization process, identifing where elements + * begin and end (doing validation and normalization). + * 2) It controls and coordinates with an instance of + * the IContentSink interface, to coordinate the + * the production of the content model. 
*
 *  The basic operation of this class assumes that an HTML
 *  document is non-normalized. Therefore, we don't process
 *  the document in a normalized way. Don't bother to look
 *  for methods like: doHead() or doBody().
 *
 *  Instead, in order to be backward compatible, we must
 *  scan the set of tokens and perform this basic set of
 *  operations:
 *    1)  Determine the token type (easy, since the tokens know)
 *    2)  Determine the appropriate section of the HTML document
 *        each token belongs in (HTML,HEAD,BODY,FRAMESET).
 *    3)  Insert content into our document (via the sink) into
 *        the correct section.
 *    4)  In the case of tags that belong in the BODY, we must
 *        ensure that our underlying document state reflects
 *        the appropriate context for our tag.
 *
 *        For example, if we see a <TR>, we must ensure our
 *        document contains a table into which the row can
 *        be placed. This may result in "implicit containers"
 *        created to ensure a well-formed document.
 *
 */

#ifndef NS_PARSER__
#define NS_PARSER__

#include "nsIParser.h"
#include "nsDeque.h"
#include "nsIURL.h"
#include "CParserContext.h"
#include "nsParserCIID.h"
#include "nsITokenizer.h"
#include "nsHTMLTags.h"
#include "nsIContentSink.h"
#include "nsCOMArray.h"
#include "nsCycleCollectionParticipant.h"
#include "nsWeakReference.h"

class nsIDTD;
class nsIRunnable;

#ifdef _MSC_VER
#pragma warning( disable : 4275 )
#endif


class nsParser final : public nsIParser,
                       public nsIStreamListener,
                       public nsSupportsWeakReference
{
    /**
     * Destructor (private: declared before the first access specifier).
     * @update  gess5/11/98
     */
    virtual ~nsParser();

  public:
    /**
     * Called on module init
     */
    static nsresult Init();

    /**
     * Called on module shutdown
     */
    static void Shutdown();

    NS_DECL_CYCLE_COLLECTING_ISUPPORTS
    NS_DECL_CYCLE_COLLECTION_CLASS_AMBIGUOUS(nsParser, nsIParser)

    /**
     * default constructor
     * @update  gess5/11/98
     */
    nsParser();

    /**
     * Select given content sink into parser for parser output
     * @update  gess5/11/98
     * @param   aSink is the new sink to be used by parser
     * @return  nothing (the method is declared void)
     */
    NS_IMETHOD_(void) SetContentSink(nsIContentSink* aSink) override;

    /**
     * Retrieve the sink set into the parser
     * @update  gess5/11/98
     * @return  the parser's content sink pointer
     */
    NS_IMETHOD_(nsIContentSink*) GetContentSink(void) override;

    /**
     *  Call this method once you've created a parser, and want to instruct it
     *  about the command which caused the parser to be constructed. For example,
     *  this allows us to select a DTD which can do, say, view-source.
     *
     *  @update  gess 3/25/98
     *  @param   aCommand -- ptrs to string that contains command
     *  @return  nada
     */
    NS_IMETHOD_(void) GetCommand(nsCString& aCommand) override;
    NS_IMETHOD_(void) SetCommand(const char* aCommand) override;
    NS_IMETHOD_(void) SetCommand(eParserCommands aParserCommand) override;

    /**
     *  Call this method once you've created a parser, and want to instruct it
     *  about what charset to load
     *
     *  @update  ftang 4/23/99
     *  @param   aCharset- the charset of a document
     *  @param   aSource- the source of the charset
     *  @return  nada
     */
    NS_IMETHOD_(void) SetDocumentCharset(const nsACString& aCharset, int32_t aSource) override;

    /**
     *  Copies the current charset and its source into the out parameters.
     */
    NS_IMETHOD_(void) GetDocumentCharset(nsACString& aCharset, int32_t& aSource) override
    {
         aCharset = mCharset;
         aSource = mCharsetSource;
    }

    /**
     * Cause parser to parse input from given URL
     * @update  gess5/11/98
     * @param   aURL is a descriptor for source document
     * @param   aListener is a listener to forward notifications to
     * @return  an nsresult status code
     */
    NS_IMETHOD Parse(nsIURI* aURL,
                     nsIRequestObserver* aListener = nullptr,
                     void* aKey = 0,
                     nsDTDMode aMode = eDTDMode_autodetect) override;

    /**
     * Parse aSourceBuffer as a fragment within the element context given by
     * aTagStack: the tag stack is first parsed as opening tags (last entry
     * first), then the source buffer, then the corresponding end tags.
     * @param   aSourceBuffer the fragment markup
     * @param   aTagStack the enclosing tags
     * @return  an nsresult status code
     */
    NS_IMETHOD ParseFragment(const nsAString& aSourceBuffer,
                             nsTArray<nsString>& aTagStack) override;

    /**
     * This method gets called when the tokens have been consumed, and it's time
     * to build the model via the content sink.
     * @update  gess5/11/98
     * @return  an nsresult status code
     */
    NS_IMETHOD BuildModel(void) override;

    NS_IMETHOD        ContinueInterruptedParsing() override;
    NS_IMETHOD_(void) BlockParser() override;
    NS_IMETHOD_(void) UnblockParser() override;
    NS_IMETHOD_(void) ContinueInterruptedParsingAsync() override;
    NS_IMETHOD        Terminate(void) override;

    /**
     * Call this to query whether the parser is enabled or not.
     *
     *  @update  vidur 4/12/99
     *  @return  current state
     */
    NS_IMETHOD_(bool) IsParserEnabled() override;

    /**
     * Call this to query whether the parser thinks it's done with parsing.
     *
     *  @update  rickg 5/12/01
     *  @return  complete state
     */
    NS_IMETHOD_(bool) IsComplete() override;

    /**
     *  This rather arcane method (hack) is used as a signal between the
     *  DTD and the parser. It allows the DTD to tell the parser that content
     *  that comes through (parser::parser(string)) but not consumed should
     *  propagate into the next string based parse call.
     *
     *  @update  gess 9/1/98
     *  @param   aBuffer the unused string content to propagate
     *  @return  nothing (the method is declared void)
     */
    void SetUnusedInput(nsString& aBuffer);

    /**
     * This method gets called (automatically) during incremental parsing
     * @update  gess5/11/98
     * @param   allowIteration set to true if non-script resumption is requested
     * @param   aIsFinalChunk tells us when the last chunk of data is provided
     * @param   aCanInterrupt not referenced by the implementation
     * @return  an nsresult status code
     */
    virtual nsresult ResumeParse(bool allowIteration = true,
                                 bool aIsFinalChunk = false,
                                 bool aCanInterrupt = true);

    //*********************************************
    // These methods are callback methods used by
    // net lib to let us know about our inputstream.
    //*********************************************
    // nsIRequestObserver methods:
    NS_DECL_NSIREQUESTOBSERVER

    // nsIStreamListener methods:
    NS_DECL_NSISTREAMLISTENER

    void              PushContext(CParserContext& aContext);
    CParserContext*   PopContext();
    // Returns the current (top) parser context without removing it.
    CParserContext*   PeekContext() {return mParserContext;}

    /**
     * Get the channel associated with this parser
     * @update harishd,gagan 07/17/01
     * @param aChannel out param that will contain the result
     * @return NS_OK if successful
     */
    NS_IMETHOD GetChannel(nsIChannel** aChannel) override;

    /**
     * Get the DTD associated with this parser
     * @update vidur 9/29/99
     * @param aDTD out param that will contain the result
     * @return NS_OK
     */
    NS_IMETHOD GetDTD(nsIDTD** aDTD) override;

    /**
     * Get the nsIStreamListener for this parser
     */
    virtual nsIStreamListener* GetStreamListener() override;

    void SetSinkCharset(nsACString& aCharset);

    /**
     *  Removes continue parsing events
     *  @update  kmcclusk 5/18/98
     */

    NS_IMETHOD CancelParsingEvents() override;

    /**
     * Return true.
     */
    virtual bool IsInsertionPointDefined() override;

    /**
     * No-op.
     */
    virtual void PushDefinedInsertionPoint() override;

    /**
     * No-op.
     */
    virtual void PopDefinedInsertionPoint() override;

    /**
     * No-op.
     */
    virtual void MarkAsNotScriptCreated(const char* aCommand) override;

    /**
     * Always false.
     */
    virtual bool IsScriptCreated() override;

    /**
     *  Set the parser state to indicate whether parsing tokens can be interrupted
     *  @param aCanInterrupt true if parser can be interrupted, false if it can not be interrupted.
     *  @update  kmcclusk 5/18/98
     */
    void SetCanInterrupt(bool aCanInterrupt);

    /**
     * This is called when the final chunk has been
     * passed to the parser and the content sink has
     * interrupted token processing. It schedules
     * a ParserContinue PL_Event which will ask the parser
     * to HandleParserContinueEvent when it is handled.
     * @update  kmcclusk6/1/2001
     */
    nsresult PostContinueEvent();

    /**
     *  Fired when the continue parse event is triggered.
     *  @update  kmcclusk 5/18/98
     */
    void HandleParserContinueEvent(class nsParserContinueEvent *);

    // Tears the parser down and re-initializes it.
    virtual void Reset() override {
      Cleanup();
      Initialize();
    }

    // True when a sink is set and it reports that a script is executing.
    bool IsScriptExecuting() {
      return mSink && mSink->IsScriptExecuting();
    }

    // Network data may be processed only when no script is executing and
    // this parser is not already processing network data.
    bool IsOkToProcessNetworkData() {
      return !IsScriptExecuting() && !mProcessingNetworkData;
    }

 protected:

    void Initialize(bool aConstructor = false);
    void Cleanup();

    /**
     *
     * @update  gess5/18/98
     * @param   aFilename
     * @return  an nsresult status code
     */
    nsresult WillBuildModel(nsString& aFilename);

    /**
     *
     * @update  gess5/18/98
     * @param   anErrorCode
     * @return  an nsresult status code
     */
    nsresult DidBuildModel(nsresult anErrorCode);

private:

    /*******************************************
      These are the tokenization methods...
     *******************************************/

    /**
     *  Part of the code sandwich, this gets called right before
     *  the tokenization process begins. The main reason for
     *  this call is to allow the delegate to do initialization.
     *
     *  @update  gess 3/25/98
     *  @param   aIsFinalChunk forwarded to the tokenizer's WillTokenize()
     *  @return  TRUE if it's ok to proceed
     */
    bool WillTokenize(bool aIsFinalChunk = false);


    /**
     *  This is the primary control routine. It iteratively
     *  consumes tokens until an error occurs or you run out
     *  of data.
     *
     *  @update  gess 3/25/98
     *  @return  error code
     */
    nsresult Tokenize(bool aIsFinalChunk = false);

    /**
     * Pushes XML fragment parsing data to expat without an input stream.
     */
    nsresult Parse(const nsAString& aSourceBuffer,
                   void* aKey,
                   bool aLastCall);

protected:
    //*********************************************
    // And now, some data members...
    //*********************************************


    CParserContext*              mParserContext;
    nsCOMPtr<nsIDTD>             mDTD;
    nsCOMPtr<nsIRequestObserver> mObserver;
    nsCOMPtr<nsIContentSink>     mSink;
    nsIRunnable*                 mContinueEvent;  // weak ref

    eParserCommands     mCommand;
    nsresult            mInternalState;
    nsresult            mStreamStatus;
    int32_t             mCharsetSource;

    uint16_t            mFlags;

    nsString            mUnusedInput;
    nsCString           mCharset;
    nsCString           mCommandStr;

    bool                mProcessingNetworkData;
    bool                mIsAboutBlank;
};

#endif

diff --git a/components/htmlparser/src/nsParserBase.h b/components/htmlparser/src/nsParserBase.h
new file mode 100644
index 000000000..83b68c554
--- /dev/null
+++ b/components/htmlparser/src/nsParserBase.h
@@ -0,0 +1,20 @@
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef nsParserBase_h_
#define nsParserBase_h_

#include "nsIChannel.h"

// Minimal parser base: default implementations report the parser as enabled
// and hand back a null channel.
class nsParserBase : public nsISupports
{
  public:
    NS_IMETHOD_(bool) IsParserEnabled() { return true; }
    NS_IMETHOD GetChannel(nsIChannel** aChannel) {
      *aChannel = nullptr;
      return NS_OK;
    }
};

#endif // nsParserBase_h_
diff --git a/components/htmlparser/src/nsParserCIID.h b/components/htmlparser/src/nsParserCIID.h
new file mode 100644
index 000000000..4a2b7b1ad
--- /dev/null
+++ b/components/htmlparser/src/nsParserCIID.h
@@ -0,0 +1,39 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#ifndef nsParserCIID_h__
#define nsParserCIID_h__

#include "nsISupports.h"
#include "nsIFactory.h"
#include "nsIComponentManager.h"

// {2ce606b0-bee6-11d1-aad9-00805f8a3e14}
#define NS_PARSER_CID      \
{ 0x2ce606b0, 0xbee6,  0x11d1, { 0xaa, 0xd9, 0x0, 0x80, 0x5f, 0x8a, 0x3e, 0x14 } }

// XXX: This object should not be exposed outside of the parser.
//      Remove when CNavDTD subclasses do not need access
// {9039c670-2717-11d2-9246-00805f8a7ab6}
#define NS_PARSER_NODE_IID      \
  {0x9039c670, 0x2717, 0x11d2,    \
  {0x92, 0x46, 0x00, 0x80, 0x5f, 0x8a, 0x7a, 0xb6}}

// {a6cf9107-15b3-11d2-932e-00805f8add32}
#define NS_CNAVDTD_CID \
{ 0xa6cf9107, 0x15b3, 0x11d2, { 0x93, 0x2e, 0x0, 0x80, 0x5f, 0x8a, 0xdd, 0x32 } }

// {FFF4FBE9-528A-4b37-819D-FC18F3A401A7}
#define NS_EXPAT_DRIVER_CID \
{ 0xfff4fbe9, 0x528a, 0x4b37, { 0x81, 0x9d, 0xfc, 0x18, 0xf3, 0xa4, 0x1, 0xa7 } }

// {a6cf910f-15b3-11d2-932e-00805f8add32}
#define NS_HTMLCONTENTSINKSTREAM_CID  \
{ 0xa6cf910f, 0x15b3, 0x11d2, { 0x93, 0x2e, 0x0, 0x80, 0x5f, 0x8a, 0xdd, 0x32 } }

// {a6cf9112-15b3-11d2-932e-00805f8add32}
#define NS_PARSERSERVICE_CID \
{ 0xa6cf9112, 0x15b3, 0x11d2, { 0x93, 0x2e, 0x0, 0x80, 0x5f, 0x8a, 0xdd, 0x32 } }

#endif
diff --git a/components/htmlparser/src/nsParserConstants.h b/components/htmlparser/src/nsParserConstants.h
new file mode 100644
index 000000000..2f2373c7f
--- /dev/null
+++ b/components/htmlparser/src/nsParserConstants.h
@@ -0,0 +1,38 @@
/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#ifndef nsParserConstants_h_
#define nsParserConstants_h_
// Named UTF-16 character constants used by the parser.
// Note: kNewLine and kLF are both '\n'.
const char16_t  kNewLine          = '\n';
const char16_t  kCR               = '\r';
const char16_t  kLF               = '\n';
const char16_t  kTab              = '\t';
const char16_t  kSpace            = ' ';
const char16_t  kQuote            = '"';
const char16_t  kApostrophe       = '\'';
const char16_t  kLessThan         = '<';
const char16_t  kGreaterThan      = '>';
const char16_t  kAmpersand        = '&';
const char16_t  kForwardSlash     = '/';
const char16_t  kBackSlash        = '\\';
const char16_t  kEqual            = '=';
const char16_t  kMinus            = '-';
const char16_t  kPlus             = '+';
const char16_t  kExclamation      = '!';
const char16_t  kSemicolon        = ';';
const char16_t  kHashsign         = '#';
const char16_t  kAsterisk         = '*';
const char16_t  kUnderbar         = '_';
const char16_t  kComma            = ',';
const char16_t  kLeftParen        = '(';
const char16_t  kRightParen       = ')';
const char16_t  kLeftBrace        = '{';
const char16_t  kRightBrace       = '}';
const char16_t  kQuestionMark     = '?';
const char16_t  kLeftSquareBracket  = '[';
const char16_t  kRightSquareBracket = ']';
const char16_t kNullCh           = '\0';

#endif  // nsParserConstants_h_
diff --git a/components/htmlparser/src/nsParserModule.cpp b/components/htmlparser/src/nsParserModule.cpp
new file mode 100644
index 000000000..00c2d6c56
--- /dev/null
+++ b/components/htmlparser/src/nsParserModule.cpp
@@ -0,0 +1,107 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/

#include "nsIAtom.h"
#include "nsString.h"
#include "nspr.h"
#include "nsCOMPtr.h"
#include "mozilla/ModuleUtils.h"
#include "nsParserCIID.h"
#include "nsParser.h"
#include "CNavDTD.h"
#include "nsHTMLEntities.h"
#include "nsHTMLTokenizer.h"
//#include "nsTextTokenizer.h"
#include "nsElementTable.h"
#include "nsParserService.h"
#include "nsSAXAttributes.h"
#include "nsSAXLocator.h"
#include "nsSAXXMLReader.h"

#if defined(DEBUG)
#include "nsExpatDriver.h"
#endif

//----------------------------------------------------------------------

// The expat driver factory is only registered in DEBUG builds.
#if defined(DEBUG)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsExpatDriver)
#endif

NS_GENERIC_FACTORY_CONSTRUCTOR(nsParser)
NS_GENERIC_FACTORY_CONSTRUCTOR(CNavDTD)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsParserService)

NS_GENERIC_FACTORY_CONSTRUCTOR(nsSAXAttributes)
NS_GENERIC_FACTORY_CONSTRUCTOR(nsSAXXMLReader)

#if defined(DEBUG)
NS_DEFINE_NAMED_CID(NS_EXPAT_DRIVER_CID);
#endif
NS_DEFINE_NAMED_CID(NS_PARSER_CID);
NS_DEFINE_NAMED_CID(NS_CNAVDTD_CID);
NS_DEFINE_NAMED_CID(NS_PARSERSERVICE_CID);
NS_DEFINE_NAMED_CID(NS_SAXATTRIBUTES_CID);
NS_DEFINE_NAMED_CID(NS_SAXXMLREADER_CID);

// CID -> constructor table for the components this module provides.
static const mozilla::Module::CIDEntry kParserCIDs[] = {
#if defined(DEBUG)
  { &kNS_EXPAT_DRIVER_CID, false, nullptr, nsExpatDriverConstructor },
#endif
  { &kNS_PARSER_CID, false, nullptr, nsParserConstructor },
  { &kNS_CNAVDTD_CID, false, nullptr, CNavDTDConstructor },
  { &kNS_PARSERSERVICE_CID, false, nullptr, nsParserServiceConstructor },
  { &kNS_SAXATTRIBUTES_CID, false, nullptr, nsSAXAttributesConstructor },
  { &kNS_SAXXMLREADER_CID, false, nullptr, nsSAXXMLReaderConstructor },
  { nullptr }
};

// Contract ID -> CID table; only the parser service and the two SAX
// components are given contract IDs here.
static const mozilla::Module::ContractIDEntry kParserContracts[] = {
  { NS_PARSERSERVICE_CONTRACTID, &kNS_PARSERSERVICE_CID },
  { NS_SAXATTRIBUTES_CONTRACTID, &kNS_SAXATTRIBUTES_CID },
  { NS_SAXXMLREADER_CONTRACTID, &kNS_SAXXMLREADER_CID },
  { nullptr }
};

// Module load hook: takes a reference on the HTML tag and entity tables.
// If the entity table cannot be acquired, the tag table reference is released
// again before the failure is returned. DEBUG builds additionally run the
// element-table and tag-table self checks.
static nsresult
Initialize()
{
  nsresult rv = nsHTMLTags::AddRefTable();
  NS_ENSURE_SUCCESS(rv, rv);

  rv = nsHTMLEntities::AddRefTable();
  if (NS_FAILED(rv)) {
    nsHTMLTags::ReleaseTable();
    return rv;
  }
#ifdef DEBUG
  CheckElementTable();
#endif

#ifdef DEBUG
  nsHTMLTags::TestTagTable();
#endif

  return rv;
}

// Module unload hook: drops the table references taken in Initialize().
static void
Shutdown()
{
  nsHTMLTags::ReleaseTable();
  nsHTMLEntities::ReleaseTable();
}

static mozilla::Module kParserModule = {
  mozilla::Module::kVersion,
  kParserCIDs,
  kParserContracts,
  nullptr,
  nullptr,
  Initialize,
  Shutdown
};

NSMODULE_DEFN(nsParserModule) = &kParserModule;
diff --git a/components/htmlparser/src/nsParserMsgUtils.cpp b/components/htmlparser/src/nsParserMsgUtils.cpp
new file mode 100644
index 000000000..627f57a0e
--- /dev/null
+++ b/components/htmlparser/src/nsParserMsgUtils.cpp
@@ -0,0 +1,65 @@
/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */
/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/.
*/ + +#include "nsIServiceManager.h" +#include "nsIStringBundle.h" +#include "nsXPIDLString.h" +#include "nsParserMsgUtils.h" +#include "nsNetCID.h" +#include "mozilla/Services.h" + +static nsresult GetBundle(const char * aPropFileName, nsIStringBundle **aBundle) +{ + NS_ENSURE_ARG_POINTER(aPropFileName); + NS_ENSURE_ARG_POINTER(aBundle); + + // Create a bundle for the localization + + nsCOMPtr<nsIStringBundleService> stringService = + mozilla::services::GetStringBundleService(); + if (!stringService) + return NS_ERROR_FAILURE; + + return stringService->CreateBundle(aPropFileName, aBundle); +} + +nsresult +nsParserMsgUtils::GetLocalizedStringByName(const char * aPropFileName, const char* aKey, nsString& oVal) +{ + oVal.Truncate(); + + NS_ENSURE_ARG_POINTER(aKey); + + nsCOMPtr<nsIStringBundle> bundle; + nsresult rv = GetBundle(aPropFileName,getter_AddRefs(bundle)); + if (NS_SUCCEEDED(rv) && bundle) { + nsXPIDLString valUni; + nsAutoString key; key.AssignWithConversion(aKey); + rv = bundle->GetStringFromName(key.get(), getter_Copies(valUni)); + if (NS_SUCCEEDED(rv) && valUni) { + oVal.Assign(valUni); + } + } + + return rv; +} + +nsresult +nsParserMsgUtils::GetLocalizedStringByID(const char * aPropFileName, uint32_t aID, nsString& oVal) +{ + oVal.Truncate(); + + nsCOMPtr<nsIStringBundle> bundle; + nsresult rv = GetBundle(aPropFileName,getter_AddRefs(bundle)); + if (NS_SUCCEEDED(rv) && bundle) { + nsXPIDLString valUni; + rv = bundle->GetStringFromID(aID, getter_Copies(valUni)); + if (NS_SUCCEEDED(rv) && valUni) { + oVal.Assign(valUni); + } + } + + return rv; +} diff --git a/components/htmlparser/src/nsParserMsgUtils.h b/components/htmlparser/src/nsParserMsgUtils.h new file mode 100644 index 000000000..adf3fda8a --- /dev/null +++ b/components/htmlparser/src/nsParserMsgUtils.h @@ -0,0 +1,21 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. 
If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsParserMsgUtils_h +#define nsParserMsgUtils_h + +#include "nsString.h" + +#define XMLPARSER_PROPERTIES "chrome://global/locale/layout/xmlparser.properties" + +class nsParserMsgUtils { + nsParserMsgUtils(); // Currently this is not meant to be created, use the static methods + ~nsParserMsgUtils(); // If perf required, change this to cache values etc. +public: + static nsresult GetLocalizedStringByName(const char * aPropFileName, const char* aKey, nsString& aVal); + static nsresult GetLocalizedStringByID(const char * aPropFileName, uint32_t aID, nsString& aVal); +}; + +#endif diff --git a/components/htmlparser/src/nsParserService.cpp b/components/htmlparser/src/nsParserService.cpp new file mode 100644 index 000000000..5893f19a9 --- /dev/null +++ b/components/htmlparser/src/nsParserService.cpp @@ -0,0 +1,90 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include "nsError.h" +#include "nsIAtom.h" +#include "nsParserService.h" +#include "nsHTMLEntities.h" +#include "nsElementTable.h" +#include "nsICategoryManager.h" +#include "nsCategoryManagerUtils.h" + +nsParserService::nsParserService() +{ +} + +nsParserService::~nsParserService() +{ +} + +NS_IMPL_ISUPPORTS(nsParserService, nsIParserService) + +int32_t +nsParserService::HTMLAtomTagToId(nsIAtom* aAtom) const +{ + return nsHTMLTags::StringTagToId(nsDependentAtomString(aAtom)); +} + +int32_t +nsParserService::HTMLCaseSensitiveAtomTagToId(nsIAtom* aAtom) const +{ + return nsHTMLTags::CaseSensitiveAtomTagToId(aAtom); +} + +int32_t +nsParserService::HTMLStringTagToId(const nsAString& aTag) const +{ + return nsHTMLTags::StringTagToId(aTag); +} + +const char16_t* +nsParserService::HTMLIdToStringTag(int32_t aId) const +{ + return nsHTMLTags::GetStringValue((nsHTMLTag)aId); +} + +nsIAtom* +nsParserService::HTMLIdToAtomTag(int32_t aId) const +{ + return nsHTMLTags::GetAtom((nsHTMLTag)aId); +} + +NS_IMETHODIMP +nsParserService::HTMLConvertEntityToUnicode(const nsAString& aEntity, + int32_t* aUnicode) const +{ + *aUnicode = nsHTMLEntities::EntityToUnicode(aEntity); + + return NS_OK; +} + +NS_IMETHODIMP +nsParserService::HTMLConvertUnicodeToEntity(int32_t aUnicode, + nsCString& aEntity) const +{ + const char* str = nsHTMLEntities::UnicodeToEntity(aUnicode); + if (str) { + aEntity.Assign(str); + } + + return NS_OK; +} + +NS_IMETHODIMP +nsParserService::IsContainer(int32_t aId, bool& aIsContainer) const +{ + aIsContainer = nsHTMLElement::IsContainer((nsHTMLTag)aId); + + return NS_OK; +} + +NS_IMETHODIMP +nsParserService::IsBlock(int32_t aId, bool& aIsBlock) const +{ + aIsBlock = nsHTMLElement::IsBlock((nsHTMLTag)aId); + + return NS_OK; +} diff --git a/components/htmlparser/src/nsParserService.h b/components/htmlparser/src/nsParserService.h new file mode 100644 index 000000000..0ea7ec98c --- /dev/null +++ b/components/htmlparser/src/nsParserService.h @@ -0,0 +1,58 @@ +/* 
-*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ +#ifndef NS_PARSERSERVICE_H__ +#define NS_PARSERSERVICE_H__ + +#include "nsIParserService.h" + +extern "C" int MOZ_XMLIsLetter(const char* ptr); +extern "C" int MOZ_XMLIsNCNameChar(const char* ptr); +/** + * Decodes an entity into the UTF-16 encoding of a Unicode character. If a ';' + * is found between `ptr` and `end` it will try to decode the entity and set + * `*next` to point to the character after the ;. The resulting UTF-16 code + * units will be written in `*result`, so if the entity is a valid numeric + * entity there needs to be space for at least two char16_t at the location + * `result` points to. + * + * @param ptr pointer to the ampersand. + * @param end pointer to the position after the last character of the + * string. + * @param next [out] will be set to the character after the ';' or null if + * the decoding was unsuccessful. + * @param result the buffer to write the resulting UTF-16 character in. + * @return the number of char16_t written to `*result`. 
+ */ +extern "C" int MOZ_XMLTranslateEntity(const char* ptr, const char* end, + const char** next, char16_t* result); + +class nsParserService : public nsIParserService { + virtual ~nsParserService(); + +public: + nsParserService(); + + NS_DECL_ISUPPORTS + + int32_t HTMLAtomTagToId(nsIAtom* aAtom) const override; + + int32_t HTMLCaseSensitiveAtomTagToId(nsIAtom* aAtom) const override; + + int32_t HTMLStringTagToId(const nsAString& aTag) const override; + + const char16_t *HTMLIdToStringTag(int32_t aId) const override; + + nsIAtom *HTMLIdToAtomTag(int32_t aId) const override; + + NS_IMETHOD HTMLConvertEntityToUnicode(const nsAString& aEntity, + int32_t* aUnicode) const override; + NS_IMETHOD HTMLConvertUnicodeToEntity(int32_t aUnicode, + nsCString& aEntity) const override; + NS_IMETHOD IsContainer(int32_t aId, bool& aIsContainer) const override; + NS_IMETHOD IsBlock(int32_t aId, bool& aIsBlock) const override; +}; + +#endif diff --git a/components/htmlparser/src/nsScanner.cpp b/components/htmlparser/src/nsScanner.cpp new file mode 100644 index 000000000..0fa8e43c6 --- /dev/null +++ b/components/htmlparser/src/nsScanner.cpp @@ -0,0 +1,408 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +//#define __INCREMENTAL 1 + +#include "mozilla/Attributes.h" +#include "mozilla/DebugOnly.h" + +#include "nsScanner.h" +#include "nsDebug.h" +#include "nsReadableUtils.h" +#include "nsIInputStream.h" +#include "nsIFile.h" +#include "nsUTF8Utils.h" // for LossyConvertEncoding +#include "nsCRT.h" +#include "nsParser.h" +#include "nsCharsetSource.h" + +#include "mozilla/dom/EncodingUtils.h" + +using mozilla::dom::EncodingUtils; + +nsReadEndCondition::nsReadEndCondition(const char16_t* aTerminateChars) : + mChars(aTerminateChars), mFilter(char16_t(~0)) // All bits set +{ + // Build filter that will be used to filter out characters with + // bits that none of the terminal chars have. This works very well + // because terminal chars often have only the last 4-6 bits set and + // normal ascii letters have bit 7 set. Other letters have even higher + // bits set. + + // Calculate filter + const char16_t *current = aTerminateChars; + char16_t terminalChar = *current; + while (terminalChar) { + mFilter &= ~terminalChar; + ++current; + terminalChar = *current; + } +} + +/** + * Use this constructor if you want i/o to be based on + * a single string you hand in during construction. + * This short cut was added for Javascript. + * + * @update gess 5/12/98 + * @param aMode represents the parser mode (nav, other) + * @return + */ +nsScanner::nsScanner(const nsAString& anHTMLString) +{ + MOZ_COUNT_CTOR(nsScanner); + + mSlidingBuffer = nullptr; + if (AppendToBuffer(anHTMLString)) { + mSlidingBuffer->BeginReading(mCurrentPosition); + } else { + /* XXX see hack below, re: bug 182067 */ + memset(&mCurrentPosition, 0, sizeof(mCurrentPosition)); + mEndPosition = mCurrentPosition; + } + mMarkPosition = mCurrentPosition; + mIncremental = false; + mUnicodeDecoder = nullptr; + mCharsetSource = kCharsetUninitialized; +} + +/** + * Use this constructor if you want i/o to be based on strings + * the scanner receives. 
If you pass a null filename, you + * can still provide data to the scanner via append. + */ +nsScanner::nsScanner(nsString& aFilename, bool aCreateStream) + : mFilename(aFilename) +{ + MOZ_COUNT_CTOR(nsScanner); + NS_ASSERTION(!aCreateStream, "This is always true."); + + mSlidingBuffer = nullptr; + + // XXX This is a big hack. We need to initialize the iterators to something. + // What matters is that mCurrentPosition == mEndPosition, so that our methods + // believe that we are at EOF (see bug 182067). We null out mCurrentPosition + // so that we have some hope of catching null pointer dereferences associated + // with this hack. --darin + memset(&mCurrentPosition, 0, sizeof(mCurrentPosition)); + mMarkPosition = mCurrentPosition; + mEndPosition = mCurrentPosition; + + mIncremental = true; + + mUnicodeDecoder = nullptr; + mCharsetSource = kCharsetUninitialized; + // XML defaults to UTF-8 and about:blank is UTF-8, too. + SetDocumentCharset(NS_LITERAL_CSTRING("UTF-8"), kCharsetFromDocTypeDefault); +} + +nsresult nsScanner::SetDocumentCharset(const nsACString& aCharset , int32_t aSource) +{ + if (aSource < mCharsetSource) // priority is lower than the current one + return NS_OK; + + mCharsetSource = aSource; + + nsCString charsetName; + mozilla::DebugOnly<bool> valid = + EncodingUtils::FindEncodingForLabel(aCharset, charsetName); + MOZ_ASSERT(valid, "Should never call with a bogus aCharset."); + + if (!mCharset.IsEmpty() && charsetName.Equals(mCharset)) { + return NS_OK; // no difference, don't change it + } + + // different, need to change it + + mCharset.Assign(charsetName); + + mUnicodeDecoder = EncodingUtils::DecoderForEncoding(mCharset); + mUnicodeDecoder->SetInputErrorBehavior(nsIUnicodeDecoder::kOnError_Signal); + + return NS_OK; +} + + +/** + * default destructor + * + * @update gess 3/25/98 + * @param + * @return + */ +nsScanner::~nsScanner() { + + delete mSlidingBuffer; + + MOZ_COUNT_DTOR(nsScanner); +} + +/** + * Resets current offset position of input 
stream to marked position. + * This allows us to back up to this point if the need should arise, + * such as when tokenization gets interrupted. + * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST! + * + * @update gess 5/12/98 + * @param + * @return + */ +void nsScanner::RewindToMark(void){ + if (mSlidingBuffer) { + mCurrentPosition = mMarkPosition; + } +} + + +/** + * Records current offset position in input stream. This allows us + * to back up to this point if the need should arise, such as when + * tokenization gets interrupted. + * + * @update gess 7/29/98 + * @param + * @return + */ +int32_t nsScanner::Mark() { + int32_t distance = 0; + if (mSlidingBuffer) { + nsScannerIterator oldStart; + mSlidingBuffer->BeginReading(oldStart); + + distance = Distance(oldStart, mCurrentPosition); + + mSlidingBuffer->DiscardPrefix(mCurrentPosition); + mSlidingBuffer->BeginReading(mCurrentPosition); + mMarkPosition = mCurrentPosition; + } + + return distance; +} + +/** + * Insert data to our underlying input buffer as + * if it were read from an input stream. + * + * @update harishd 01/12/99 + * @return error code + */ +bool nsScanner::UngetReadable(const nsAString& aBuffer) { + if (!mSlidingBuffer) { + return false; + } + + mSlidingBuffer->UngetReadable(aBuffer,mCurrentPosition); + mSlidingBuffer->BeginReading(mCurrentPosition); // Insertion invalidated our iterators + mSlidingBuffer->EndReading(mEndPosition); + + return true; +} + +/** + * Append data to our underlying input buffer as + * if it were read from an input stream. 
+ * + * @update gess4/3/98 + * @return error code + */ +nsresult nsScanner::Append(const nsAString& aBuffer) { + if (!AppendToBuffer(aBuffer)) + return NS_ERROR_OUT_OF_MEMORY; + return NS_OK; +} + +/** + * + * + * @update gess 5/21/98 + * @param + * @return + */ +nsresult nsScanner::Append(const char* aBuffer, uint32_t aLen) +{ + nsresult res = NS_OK; + if (mUnicodeDecoder) { + int32_t unicharBufLen = 0; + + nsresult rv = mUnicodeDecoder->GetMaxLength(aBuffer, aLen, &unicharBufLen); + if (NS_WARN_IF(NS_FAILED(rv))) { + return rv; + } + + nsScannerString::Buffer* buffer = nsScannerString::AllocBuffer(unicharBufLen + 1); + NS_ENSURE_TRUE(buffer,NS_ERROR_OUT_OF_MEMORY); + char16_t *unichars = buffer->DataStart(); + + int32_t totalChars = 0; + int32_t unicharLength = unicharBufLen; + + do { + int32_t srcLength = aLen; + res = mUnicodeDecoder->Convert(aBuffer, &srcLength, unichars, &unicharLength); + + totalChars += unicharLength; + // Continuation of failure case + if(NS_FAILED(res)) { + // if we failed, we consume one byte, replace it with the replacement + // character and try the conversion again. + + // This is only needed because some decoders don't follow the + // nsIUnicodeDecoder contract: they return a failure when *aDestLength + // is 0 rather than the correct NS_OK_UDEC_MOREOUTPUT. See bug 244177 + if ((unichars + unicharLength) >= buffer->DataEnd()) { + NS_ERROR("Unexpected end of destination buffer"); + break; + } + + // Since about:blank is empty, this line runs only for XML. Use a + // character that's illegal in XML instead of U+FFFD in order to make + // expat flag the error. 
+ unichars[unicharLength++] = 0xFFFF; + + unichars = unichars + unicharLength; + unicharLength = unicharBufLen - (++totalChars); + + mUnicodeDecoder->Reset(); + + if(((uint32_t) (srcLength + 1)) > aLen) { + srcLength = aLen; + } + else { + ++srcLength; + } + + aBuffer += srcLength; + aLen -= srcLength; + } + } while (NS_FAILED(res) && (aLen > 0)); + + buffer->SetDataLength(totalChars); + // Don't propagate return code of unicode decoder + // since it doesn't reflect on our success or failure + // - Ref. bug 87110 + res = NS_OK; + if (!AppendToBuffer(buffer)) + res = NS_ERROR_OUT_OF_MEMORY; + } + else { + NS_WARNING("No decoder found."); + res = NS_ERROR_FAILURE; + } + + return res; +} + +/** + * retrieve next char from scanners internal input stream + * + * @update gess 3/25/98 + * @param + * @return error code reflecting read status + */ +nsresult nsScanner::GetChar(char16_t& aChar) { + if (!mSlidingBuffer || mCurrentPosition == mEndPosition) { + aChar = 0; + return kEOF; + } + + aChar = *mCurrentPosition++; + + return NS_OK; +} + + +void nsScanner::BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd) +{ + aSubstring.Rebind(*mSlidingBuffer, aStart, aEnd); +} + +void nsScanner::CurrentPosition(nsScannerIterator& aPosition) +{ + aPosition = mCurrentPosition; +} + +void nsScanner::EndReading(nsScannerIterator& aPosition) +{ + aPosition = mEndPosition; +} + +void nsScanner::SetPosition(nsScannerIterator& aPosition, bool aTerminate) +{ + if (mSlidingBuffer) { + mCurrentPosition = aPosition; + if (aTerminate && (mCurrentPosition == mEndPosition)) { + mMarkPosition = mCurrentPosition; + mSlidingBuffer->DiscardPrefix(mCurrentPosition); + } + } +} + +bool nsScanner::AppendToBuffer(nsScannerString::Buffer* aBuf) +{ + if (!mSlidingBuffer) { + mSlidingBuffer = new nsScannerString(aBuf); + if (!mSlidingBuffer) + return false; + mSlidingBuffer->BeginReading(mCurrentPosition); + mMarkPosition = mCurrentPosition; + 
mSlidingBuffer->EndReading(mEndPosition); + } + else { + mSlidingBuffer->AppendBuffer(aBuf); + if (mCurrentPosition == mEndPosition) { + mSlidingBuffer->BeginReading(mCurrentPosition); + } + mSlidingBuffer->EndReading(mEndPosition); + } + + return true; +} + +/** + * call this to copy bytes out of the scanner that have not yet been consumed + * by the tokenization process. + * + * @update gess 5/12/98 + * @param aCopyBuffer is where the scanner buffer will be copied to + * @return true if OK or false on OOM + */ +bool nsScanner::CopyUnusedData(nsString& aCopyBuffer) { + if (!mSlidingBuffer) { + aCopyBuffer.Truncate(); + return true; + } + + nsScannerIterator start, end; + start = mCurrentPosition; + end = mEndPosition; + + return CopyUnicodeTo(start, end, aCopyBuffer); +} + +/** + * Retrieve the name of the file that the scanner is reading from. + * In some cases, it's just a given name, because the scanner isn't + * really reading from a file. + * + * @update gess 5/12/98 + * @return + */ +nsString& nsScanner::GetFilename(void) { + return mFilename; +} + +/** + * Conduct self test. Actually, selftesting for this class + * occurs in the parser selftest. + * + * @update gess 3/25/98 + * @param + * @return + */ + +void nsScanner::SelfTest(void) { +#ifdef _DEBUG +#endif +} diff --git a/components/htmlparser/src/nsScanner.h b/components/htmlparser/src/nsScanner.h new file mode 100644 index 000000000..88edcf74e --- /dev/null +++ b/components/htmlparser/src/nsScanner.h @@ -0,0 +1,190 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + + +/** + * MODULE NOTES: + * @update gess 4/1/98 + * + * The scanner is a low-level service class that knows + * how to consume characters out of an (internal) stream. 
+ * This class also offers a series of utility methods + * that most tokenizers want, such as readUntil() + * and SkipWhitespace(). + */ + + +#ifndef SCANNER +#define SCANNER + +#include "nsCOMPtr.h" +#include "nsString.h" +#include "nsIParser.h" +#include "nsIUnicodeDecoder.h" +#include "nsScannerString.h" +#include "mozilla/CheckedInt.h" + +class nsReadEndCondition { +public: + const char16_t *mChars; + char16_t mFilter; + explicit nsReadEndCondition(const char16_t* aTerminateChars); +private: + nsReadEndCondition(const nsReadEndCondition& aOther); // No copying + void operator=(const nsReadEndCondition& aOther); // No assigning +}; + +class nsScanner { + public: + + /** + * Use this constructor for the XML fragment parsing case + */ + explicit nsScanner(const nsAString& anHTMLString); + + /** + * Use this constructor if you want i/o to be based on + * a file (therefore a stream) or just data you provide via Append(). + */ + nsScanner(nsString& aFilename, bool aCreateStream); + + ~nsScanner(); + + /** + * retrieve next char from internal input stream + * + * @update gess 3/25/98 + * @param ch is the char to accept new value + * @return error code reflecting read status + */ + nsresult GetChar(char16_t& ch); + + /** + * Records current offset position in input stream. This allows us + * to back up to this point if the need should arise, such as when + * tokenization gets interrupted. + * + * @update gess 5/12/98 + * @param + * @return + */ + int32_t Mark(void); + + /** + * Resets current offset position of input stream to marked position. + * This allows us to back up to this point if the need should arise, + * such as when tokenization gets interrupted. + * NOTE: IT IS REALLY BAD FORM TO CALL RELEASE WITHOUT CALLING MARK FIRST! 
+ * + * @update gess 5/12/98 + * @param + * @return + */ + void RewindToMark(void); + + + /** + * + * + * @update harishd 01/12/99 + * @param + * @return + */ + bool UngetReadable(const nsAString& aBuffer); + + /** + * + * + * @update gess 5/13/98 + * @param + * @return + */ + nsresult Append(const nsAString& aBuffer); + + /** + * + * + * @update gess 5/21/98 + * @param + * @return + */ + nsresult Append(const char* aBuffer, uint32_t aLen); + + /** + * Call this to copy bytes out of the scanner that have not yet been consumed + * by the tokenization process. + * + * @update gess 5/12/98 + * @param aCopyBuffer is where the scanner buffer will be copied to + * @return true if OK or false on OOM + */ + bool CopyUnusedData(nsString& aCopyBuffer); + + /** + * Retrieve the name of the file that the scanner is reading from. + * In some cases, it's just a given name, because the scanner isn't + * really reading from a file. + * + * @update gess 5/12/98 + * @return + */ + nsString& GetFilename(void); + + static void SelfTest(); + + /** + * Use this setter to change the scanner's unicode decoder + * + * @update ftang 3/02/99 + * @param aCharset a normalized (alias resolved) charset name + * @param aCharsetSource- where the charset info came from + * @return + */ + nsresult SetDocumentCharset(const nsACString& aCharset, int32_t aSource); + + void BindSubstring(nsScannerSubstring& aSubstring, const nsScannerIterator& aStart, const nsScannerIterator& aEnd); + void CurrentPosition(nsScannerIterator& aPosition); + void EndReading(nsScannerIterator& aPosition); + void SetPosition(nsScannerIterator& aPosition, + bool aTruncate = false); + + /** + * Internal method used to cause the internal buffer to + * be filled with data. 
+ * + * @update gess4/3/98 + */ + bool IsIncremental(void) {return mIncremental;} + void SetIncremental(bool anIncrValue) {mIncremental=anIncrValue;} + + protected: + + bool AppendToBuffer(nsScannerString::Buffer* aBuffer); + bool AppendToBuffer(const nsAString& aStr) + { + nsScannerString::Buffer* buf = nsScannerString::AllocBufferFromString(aStr); + if (!buf) + return false; + AppendToBuffer(buf); + return true; + } + + nsScannerString* mSlidingBuffer; + nsScannerIterator mCurrentPosition; // The position we will next read from in the scanner buffer + nsScannerIterator mMarkPosition; // The position last marked (we may rewind to here) + nsScannerIterator mEndPosition; // The current end of the scanner buffer + nsString mFilename; + bool mIncremental; + int32_t mCharsetSource; + nsCString mCharset; + nsCOMPtr<nsIUnicodeDecoder> mUnicodeDecoder; + + private: + nsScanner &operator =(const nsScanner &); // Not implemented. +}; + +#endif + + diff --git a/components/htmlparser/src/nsScannerString.cpp b/components/htmlparser/src/nsScannerString.cpp new file mode 100644 index 000000000..53ac117f1 --- /dev/null +++ b/components/htmlparser/src/nsScannerString.cpp @@ -0,0 +1,650 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. 
*/ + +#include <stdlib.h> +#include "nsScannerString.h" +#include "mozilla/CheckedInt.h" + + + /** + * nsScannerBufferList + */ + +#define MAX_CAPACITY ((UINT32_MAX / sizeof(char16_t)) - \ + (sizeof(Buffer) + sizeof(char16_t))) + +nsScannerBufferList::Buffer* +nsScannerBufferList::AllocBufferFromString( const nsAString& aString ) + { + uint32_t len = aString.Length(); + Buffer* buf = AllocBuffer(len); + + if (buf) + { + nsAString::const_iterator source; + aString.BeginReading(source); + nsCharTraits<char16_t>::copy(buf->DataStart(), source.get(), len); + } + return buf; + } + +nsScannerBufferList::Buffer* +nsScannerBufferList::AllocBuffer( uint32_t capacity ) + { + if (capacity > MAX_CAPACITY) + return nullptr; + + void* ptr = malloc(sizeof(Buffer) + (capacity + 1) * sizeof(char16_t)); + if (!ptr) + return nullptr; + + Buffer* buf = new (ptr) Buffer(); + + buf->mUsageCount = 0; + buf->mDataEnd = buf->DataStart() + capacity; + + // XXX null terminate. this shouldn't be required, but we do it because + // nsScanner erroneously thinks it can dereference DataEnd :-( + *buf->mDataEnd = char16_t(0); + return buf; + } + +void +nsScannerBufferList::ReleaseAll() + { + while (!mBuffers.isEmpty()) + { + Buffer* node = mBuffers.popFirst(); + //printf(">>> freeing buffer @%p\n", node); + free(node); + } + } + +void +nsScannerBufferList::SplitBuffer( const Position& pos ) + { + // splitting to the right keeps the work string and any extant token + // pointing to and holding a reference count on the same buffer. 
+ + Buffer* bufferToSplit = pos.mBuffer; + NS_ASSERTION(bufferToSplit, "null pointer"); + + uint32_t splitOffset = pos.mPosition - bufferToSplit->DataStart(); + NS_ASSERTION(pos.mPosition >= bufferToSplit->DataStart() && + splitOffset <= bufferToSplit->DataLength(), + "split offset is outside buffer"); + + uint32_t len = bufferToSplit->DataLength() - splitOffset; + Buffer* new_buffer = AllocBuffer(len); + if (new_buffer) + { + nsCharTraits<char16_t>::copy(new_buffer->DataStart(), + bufferToSplit->DataStart() + splitOffset, + len); + InsertAfter(new_buffer, bufferToSplit); + bufferToSplit->SetDataLength(splitOffset); + } + } + +void +nsScannerBufferList::DiscardUnreferencedPrefix( Buffer* aBuf ) + { + if (aBuf == Head()) + { + while (!mBuffers.isEmpty() && !Head()->IsInUse()) + { + Buffer* buffer = Head(); + buffer->remove(); + free(buffer); + } + } + } + +size_t +nsScannerBufferList::Position::Distance( const Position& aStart, const Position& aEnd ) + { + size_t result = 0; + if (aStart.mBuffer == aEnd.mBuffer) + { + result = aEnd.mPosition - aStart.mPosition; + } + else + { + result = aStart.mBuffer->DataEnd() - aStart.mPosition; + for (Buffer* b = aStart.mBuffer->Next(); b != aEnd.mBuffer; b = b->Next()) + result += b->DataLength(); + result += aEnd.mPosition - aEnd.mBuffer->DataStart(); + } + return result; + } + + +/** + * nsScannerSubstring + */ + +nsScannerSubstring::nsScannerSubstring() + : mStart(nullptr, nullptr) + , mEnd(nullptr, nullptr) + , mBufferList(nullptr) + , mLength(0) + , mIsDirty(true) + { + } + +nsScannerSubstring::nsScannerSubstring( const nsAString& s ) + : mBufferList(nullptr) + , mIsDirty(true) + { + Rebind(s); + } + +nsScannerSubstring::~nsScannerSubstring() + { + release_ownership_of_buffer_list(); + } + +int32_t +nsScannerSubstring::CountChar( char16_t c ) const + { + /* + re-write this to use a counting sink + */ + + size_type result = 0; + size_type lengthToExamine = Length(); + + nsScannerIterator iter; + for ( BeginReading(iter); ; 
) + { + int32_t lengthToExamineInThisFragment = iter.size_forward(); + const char16_t* fromBegin = iter.get(); + result += size_type(NS_COUNT(fromBegin, fromBegin+lengthToExamineInThisFragment, c)); + if ( !(lengthToExamine -= lengthToExamineInThisFragment) ) + return result; + iter.advance(lengthToExamineInThisFragment); + } + // never reached; quiets warnings + return 0; + } + +void +nsScannerSubstring::Rebind( const nsScannerSubstring& aString, + const nsScannerIterator& aStart, + const nsScannerIterator& aEnd ) + { + // allow for the case where &aString == this + + aString.acquire_ownership_of_buffer_list(); + release_ownership_of_buffer_list(); + + mStart = aStart; + mEnd = aEnd; + mBufferList = aString.mBufferList; + mLength = Distance(aStart, aEnd); + mIsDirty = true; + } + +void +nsScannerSubstring::Rebind( const nsAString& aString ) + { + release_ownership_of_buffer_list(); + + mBufferList = new nsScannerBufferList(AllocBufferFromString(aString)); + mIsDirty = true; + + init_range_from_buffer_list(); + acquire_ownership_of_buffer_list(); + } + +const nsSubstring& +nsScannerSubstring::AsString() const + { + if (mIsDirty) + { + nsScannerSubstring* mutable_this = const_cast<nsScannerSubstring*>(this); + + if (mStart.mBuffer == mEnd.mBuffer) { + // We only have a single fragment to deal with, so just return it + // as a substring. + mutable_this->mFlattenedRep.Rebind(mStart.mPosition, mEnd.mPosition); + } else { + // Otherwise, we need to copy the data into a flattened buffer. 
+ nsScannerIterator start, end; + CopyUnicodeTo(BeginReading(start), EndReading(end), mutable_this->mFlattenedRep); + } + + mutable_this->mIsDirty = false; + } + + return mFlattenedRep; + } + +nsScannerIterator& +nsScannerSubstring::BeginReading( nsScannerIterator& iter ) const + { + iter.mOwner = this; + + iter.mFragment.mBuffer = mStart.mBuffer; + iter.mFragment.mFragmentStart = mStart.mPosition; + if (mStart.mBuffer == mEnd.mBuffer) + iter.mFragment.mFragmentEnd = mEnd.mPosition; + else + iter.mFragment.mFragmentEnd = mStart.mBuffer->DataEnd(); + + iter.mPosition = mStart.mPosition; + iter.normalize_forward(); + return iter; + } + +nsScannerIterator& +nsScannerSubstring::EndReading( nsScannerIterator& iter ) const + { + iter.mOwner = this; + + iter.mFragment.mBuffer = mEnd.mBuffer; + iter.mFragment.mFragmentEnd = mEnd.mPosition; + if (mStart.mBuffer == mEnd.mBuffer) + iter.mFragment.mFragmentStart = mStart.mPosition; + else + iter.mFragment.mFragmentStart = mEnd.mBuffer->DataStart(); + + iter.mPosition = mEnd.mPosition; + // must not |normalize_backward| as that would likely invalidate tests like |while ( first != last )| + return iter; + } + +bool +nsScannerSubstring::GetNextFragment( nsScannerFragment& frag ) const + { + // check to see if we are at the end of the buffer list + if (frag.mBuffer == mEnd.mBuffer) + return false; + + frag.mBuffer = frag.mBuffer->getNext(); + + if (frag.mBuffer == mStart.mBuffer) + frag.mFragmentStart = mStart.mPosition; + else + frag.mFragmentStart = frag.mBuffer->DataStart(); + + if (frag.mBuffer == mEnd.mBuffer) + frag.mFragmentEnd = mEnd.mPosition; + else + frag.mFragmentEnd = frag.mBuffer->DataEnd(); + + return true; + } + +bool +nsScannerSubstring::GetPrevFragment( nsScannerFragment& frag ) const + { + // check to see if we are at the beginning of the buffer list + if (frag.mBuffer == mStart.mBuffer) + return false; + + frag.mBuffer = frag.mBuffer->getPrevious(); + + if (frag.mBuffer == mStart.mBuffer) + frag.mFragmentStart 
= mStart.mPosition; + else + frag.mFragmentStart = frag.mBuffer->DataStart(); + + if (frag.mBuffer == mEnd.mBuffer) + frag.mFragmentEnd = mEnd.mPosition; + else + frag.mFragmentEnd = frag.mBuffer->DataEnd(); + + return true; + } + + + /** + * nsScannerString + */ + +nsScannerString::nsScannerString( Buffer* aBuf ) + { + mBufferList = new nsScannerBufferList(aBuf); + + init_range_from_buffer_list(); + acquire_ownership_of_buffer_list(); + } + +void +nsScannerString::AppendBuffer( Buffer* aBuf ) + { + mBufferList->Append(aBuf); + mLength += aBuf->DataLength(); + + mEnd.mBuffer = aBuf; + mEnd.mPosition = aBuf->DataEnd(); + + mIsDirty = true; + } + +void +nsScannerString::DiscardPrefix( const nsScannerIterator& aIter ) + { + Position old_start(mStart); + mStart = aIter; + mLength -= Position::Distance(old_start, mStart); + + mStart.mBuffer->IncrementUsageCount(); + old_start.mBuffer->DecrementUsageCount(); + + mBufferList->DiscardUnreferencedPrefix(old_start.mBuffer); + + mIsDirty = true; + } + +void +nsScannerString::UngetReadable( const nsAString& aReadable, const nsScannerIterator& aInsertPoint ) + /* + * Warning: this routine manipulates the shared buffer list in an unexpected way. + * The original design did not really allow for insertions, but this call promises + * that if called for a point after the end of all extant token strings, that no token string + * or the work string will be invalidated. + * + * This routine is protected because it is the responsibility of the derived class to keep those promises. + */ + { + Position insertPos(aInsertPoint); + + mBufferList->SplitBuffer(insertPos); + // splitting to the right keeps the work string and any extant token pointing to and + // holding a reference count on the same buffer + + Buffer* new_buffer = AllocBufferFromString(aReadable); + // make a new buffer with all the data to insert... 
+ // BULLSHIT ALERT: we may have empty space to re-use in the split buffer, measure the cost + // of this and decide if we should do the work to fill it + + Buffer* buffer_to_split = insertPos.mBuffer; + mBufferList->InsertAfter(new_buffer, buffer_to_split); + mLength += aReadable.Length(); + + mEnd.mBuffer = mBufferList->Tail(); + mEnd.mPosition = mEnd.mBuffer->DataEnd(); + + mIsDirty = true; + } + + /** + * nsScannerSharedSubstring + */ + +void +nsScannerSharedSubstring::Rebind(const nsScannerIterator &aStart, + const nsScannerIterator &aEnd) +{ + // If the start and end positions are inside the same buffer, we must + // acquire ownership of the buffer. If not, we can optimize by not holding + // onto it. + + Buffer *buffer = const_cast<Buffer*>(aStart.buffer()); + bool sameBuffer = buffer == aEnd.buffer(); + + nsScannerBufferList *bufferList; + + if (sameBuffer) { + bufferList = aStart.mOwner->mBufferList; + bufferList->AddRef(); + buffer->IncrementUsageCount(); + } + + if (mBufferList) + ReleaseBuffer(); + + if (sameBuffer) { + mBuffer = buffer; + mBufferList = bufferList; + mString.Rebind(aStart.mPosition, aEnd.mPosition); + } else { + mBuffer = nullptr; + mBufferList = nullptr; + CopyUnicodeTo(aStart, aEnd, mString); + } +} + +void +nsScannerSharedSubstring::ReleaseBuffer() +{ + NS_ASSERTION(mBufferList, "Should only be called with non-null mBufferList"); + mBuffer->DecrementUsageCount(); + mBufferList->DiscardUnreferencedPrefix(mBuffer); + mBufferList->Release(); +} + +void +nsScannerSharedSubstring::MakeMutable() +{ + nsString temp(mString); // this will force a copy of the data + mString.Assign(temp); // mString will now share the just-allocated buffer + + ReleaseBuffer(); + + mBuffer = nullptr; + mBufferList = nullptr; +} + + /** + * utils -- based on code from nsReadableUtils.cpp + */ + +// private helper function +static inline +nsAString::iterator& +copy_multifragment_string( nsScannerIterator& first, const nsScannerIterator& last, nsAString::iterator& 
result ) + { + typedef nsCharSourceTraits<nsScannerIterator> source_traits; + typedef nsCharSinkTraits<nsAString::iterator> sink_traits; + + while ( first != last ) + { + uint32_t distance = source_traits::readable_distance(first, last); + sink_traits::write(result, source_traits::read(first), distance); + NS_ASSERTION(distance > 0, "|copy_multifragment_string| will never terminate"); + source_traits::advance(first, distance); + } + + return result; + } + +bool +CopyUnicodeTo( const nsScannerIterator& aSrcStart, + const nsScannerIterator& aSrcEnd, + nsAString& aDest ) + { + nsAString::iterator writer; + + mozilla::CheckedInt<nsAString::size_type> distance(Distance(aSrcStart, aSrcEnd)); + if (!distance.isValid()) { + return false; // overflow detected + } + + if (!aDest.SetLength(distance.value(), mozilla::fallible)) { + aDest.Truncate(); + return false; // out of memory + } + aDest.BeginWriting(writer); + nsScannerIterator fromBegin(aSrcStart); + + copy_multifragment_string(fromBegin, aSrcEnd, writer); + return true; + } + +bool +AppendUnicodeTo( const nsScannerIterator& aSrcStart, + const nsScannerIterator& aSrcEnd, + nsScannerSharedSubstring& aDest ) + { + // Check whether we can just create a dependent string. + if (aDest.str().IsEmpty()) { + // We can just make |aDest| point to the buffer. + // This will take care of copying if the buffer spans fragments. + aDest.Rebind(aSrcStart, aSrcEnd); + return true; + } + // The dest string is not empty, so it can't be a dependent substring. 
+ return AppendUnicodeTo(aSrcStart, aSrcEnd, aDest.writable()); + } + +bool +AppendUnicodeTo( const nsScannerIterator& aSrcStart, + const nsScannerIterator& aSrcEnd, + nsAString& aDest ) + { + nsAString::iterator writer; + const nsAString::size_type oldLength = aDest.Length(); + mozilla::CheckedInt<nsAString::size_type> newLen(Distance(aSrcStart, aSrcEnd)); + newLen += oldLength; + if (!newLen.isValid()) { + return false; // overflow detected + } + + if (!aDest.SetLength(newLen.value(), mozilla::fallible)) + return false; // out of memory + aDest.BeginWriting(writer).advance(oldLength); + nsScannerIterator fromBegin(aSrcStart); + + copy_multifragment_string(fromBegin, aSrcEnd, writer); + return true; + } + +bool +FindCharInReadable( char16_t aChar, + nsScannerIterator& aSearchStart, + const nsScannerIterator& aSearchEnd ) + { + while ( aSearchStart != aSearchEnd ) + { + int32_t fragmentLength; + if ( SameFragment(aSearchStart, aSearchEnd) ) + fragmentLength = aSearchEnd.get() - aSearchStart.get(); + else + fragmentLength = aSearchStart.size_forward(); + + const char16_t* charFoundAt = nsCharTraits<char16_t>::find(aSearchStart.get(), fragmentLength, aChar); + if ( charFoundAt ) { + aSearchStart.advance( charFoundAt - aSearchStart.get() ); + return true; + } + + aSearchStart.advance(fragmentLength); + } + + return false; + } + +bool +FindInReadable( const nsAString& aPattern, + nsScannerIterator& aSearchStart, + nsScannerIterator& aSearchEnd, + const nsStringComparator& compare ) + { + bool found_it = false; + + // only bother searching at all if we're given a non-empty range to search + if ( aSearchStart != aSearchEnd ) + { + nsAString::const_iterator aPatternStart, aPatternEnd; + aPattern.BeginReading(aPatternStart); + aPattern.EndReading(aPatternEnd); + + // outer loop keeps searching till we find it or run out of string to search + while ( !found_it ) + { + // fast inner loop (that's what it's called, not what it is) looks for a potential match + while ( 
aSearchStart != aSearchEnd && + compare(aPatternStart.get(), aSearchStart.get(), 1, 1) ) + ++aSearchStart; + + // if we broke out of the `fast' loop because we're out of string ... we're done: no match + if ( aSearchStart == aSearchEnd ) + break; + + // otherwise, we're at a potential match, let's see if we really hit one + nsAString::const_iterator testPattern(aPatternStart); + nsScannerIterator testSearch(aSearchStart); + + // slow inner loop verifies the potential match (found by the `fast' loop) at the current position + for(;;) + { + // we already compared the first character in the outer loop, + // so we'll advance before the next comparison + ++testPattern; + ++testSearch; + + // if we verified all the way to the end of the pattern, then we found it! + if ( testPattern == aPatternEnd ) + { + found_it = true; + aSearchEnd = testSearch; // return the exact found range through the parameters + break; + } + + // if we got to end of the string we're searching before we hit the end of the + // pattern, we'll never find what we're looking for + if ( testSearch == aSearchEnd ) + { + aSearchStart = aSearchEnd; + break; + } + + // else if we mismatched ... it's time to advance to the next search position + // and get back into the `fast' loop + if ( compare(testPattern.get(), testSearch.get(), 1, 1) ) + { + ++aSearchStart; + break; + } + } + } + } + + return found_it; + } + + /** + * This implementation is simple, but does too much work. + * It searches the entire string from left to right, and returns the last match found, if any. + * This implementation will be replaced when I get |reverse_iterator|s working. 
+ */ +bool +RFindInReadable( const nsAString& aPattern, + nsScannerIterator& aSearchStart, + nsScannerIterator& aSearchEnd, + const nsStringComparator& aComparator ) + { + bool found_it = false; + + nsScannerIterator savedSearchEnd(aSearchEnd); + nsScannerIterator searchStart(aSearchStart), searchEnd(aSearchEnd); + + while ( searchStart != searchEnd ) + { + if ( FindInReadable(aPattern, searchStart, searchEnd, aComparator) ) + { + found_it = true; + + // this is the best match so far, so remember it + aSearchStart = searchStart; + aSearchEnd = searchEnd; + + // ...and get ready to search some more + // (it's tempting to set |searchStart=searchEnd| ... but that misses overlapping patterns) + ++searchStart; + searchEnd = savedSearchEnd; + } + } + + // if we never found it, return an empty range + if ( !found_it ) + aSearchStart = aSearchEnd; + + return found_it; + } diff --git a/components/htmlparser/src/nsScannerString.h b/components/htmlparser/src/nsScannerString.h new file mode 100644 index 000000000..247c04c04 --- /dev/null +++ b/components/htmlparser/src/nsScannerString.h @@ -0,0 +1,604 @@ +/* -*- Mode: C++; tab-width: 2; indent-tabs-mode: nil; c-basic-offset: 2 -*- */ +/* vim:set ts=2 sw=2 sts=2 et cindent: */ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +#ifndef nsScannerString_h___ +#define nsScannerString_h___ + +#include "nsString.h" +#include "nsUnicharUtils.h" // for nsCaseInsensitiveStringComparator +#include "mozilla/LinkedList.h" +#include <algorithm> + + + /** + * NOTE: nsScannerString (and the other classes defined in this file) are + * not related to nsAString or any of the other xpcom/string classes. + * + * nsScannerString is based on the nsSlidingString implementation that used + * to live in xpcom/string. 
Now that nsAString is limited to representing + * only single fragment strings, nsSlidingString can no longer be used. + * + * An advantage to this design is that it does not employ any virtual + * functions. + * + * This file uses SCC-style indenting in deference to the nsSlidingString + * code from which this code is derived ;-) + */ + +class nsScannerIterator; +class nsScannerSubstring; +class nsScannerString; + + + /** + * nsScannerBufferList + * + * This class maintains a list of heap-allocated Buffer objects. The buffers + * are maintained in a circular linked list. Each buffer has a usage count + * that is decremented by the owning nsScannerSubstring. + * + * The buffer list itself is reference counted. This allows the buffer list + * to be shared by multiple nsScannerSubstring objects. The reference + * counting is not threadsafe, which is not at all a requirement. + * + * When a nsScannerSubstring releases its reference to a buffer list, it + * decrements the usage count of the first buffer in the buffer list that it + * was referencing. It informs the buffer list that it can discard buffers + * starting at that prefix. The buffer list will do so if the usage count of + * that buffer is 0 and if it is the first buffer in the list. It will + * continue to prune buffers starting from the front of the buffer list until + * it finds a buffer that has a usage count that is non-zero. + */ +class nsScannerBufferList + { + public: + + /** + * Buffer objects are directly followed by a data segment. The start + * of the data segment is determined by increment the |this| pointer + * by 1 unit. 
+ */ + class Buffer : public mozilla::LinkedListElement<Buffer> + { + public: + + void IncrementUsageCount() { ++mUsageCount; } + void DecrementUsageCount() { --mUsageCount; } + + bool IsInUse() const { return mUsageCount != 0; } + + const char16_t* DataStart() const { return (const char16_t*) (this+1); } + char16_t* DataStart() { return ( char16_t*) (this+1); } + + const char16_t* DataEnd() const { return mDataEnd; } + char16_t* DataEnd() { return mDataEnd; } + + const Buffer* Next() const { return getNext(); } + Buffer* Next() { return getNext(); } + + const Buffer* Prev() const { return getPrevious(); } + Buffer* Prev() { return getPrevious(); } + + uint32_t DataLength() const { return mDataEnd - DataStart(); } + void SetDataLength(uint32_t len) { mDataEnd = DataStart() + len; } + + private: + + friend class nsScannerBufferList; + + int32_t mUsageCount; + char16_t* mDataEnd; + }; + + /** + * Position objects serve as lightweight pointers into a buffer list. + * The mPosition member must be contained with mBuffer->DataStart() + * and mBuffer->DataEnd(). 
+ */ + class Position + { + public: + + Position() {} + + Position( Buffer* buffer, char16_t* position ) + : mBuffer(buffer) + , mPosition(position) + {} + + inline + explicit Position( const nsScannerIterator& aIter ); + + inline + Position& operator=( const nsScannerIterator& aIter ); + + static size_t Distance( const Position& p1, const Position& p2 ); + + Buffer* mBuffer; + char16_t* mPosition; + }; + + static Buffer* AllocBufferFromString( const nsAString& ); + static Buffer* AllocBuffer( uint32_t capacity ); // capacity = number of chars + + explicit nsScannerBufferList( Buffer* buf ) + : mRefCnt(0) + { + mBuffers.insertBack(buf); + } + + void AddRef() { ++mRefCnt; } + void Release() { if (--mRefCnt == 0) delete this; } + + void Append( Buffer* buf ) { mBuffers.insertBack(buf); } + void InsertAfter( Buffer* buf, Buffer* prev ) { prev->setNext(buf); } + void SplitBuffer( const Position& ); + void DiscardUnreferencedPrefix( Buffer* ); + + Buffer* Head() { return mBuffers.getFirst(); } + const Buffer* Head() const { return mBuffers.getFirst(); } + + Buffer* Tail() { return mBuffers.getLast(); } + const Buffer* Tail() const { return mBuffers.getLast(); } + + private: + + friend class nsScannerSubstring; + + ~nsScannerBufferList() { ReleaseAll(); } + void ReleaseAll(); + + int32_t mRefCnt; + mozilla::LinkedList<Buffer> mBuffers; + }; + + + /** + * nsScannerFragment represents a "slice" of a Buffer object. + */ +struct nsScannerFragment + { + typedef nsScannerBufferList::Buffer Buffer; + + const Buffer* mBuffer; + const char16_t* mFragmentStart; + const char16_t* mFragmentEnd; + }; + + + /** + * nsScannerSubstring is the base class for nsScannerString. It provides + * access to iterators and methods to bind the substring to another + * substring or nsAString instance. + * + * This class owns the buffer list. 
/**
 * nsScannerSubstring is the base class for nsScannerString.  It provides
 * access to iterators and methods to bind the substring to another
 * substring or nsAString instance.
 *
 * This class owns the buffer list.
 */
class nsScannerSubstring
  {
    public:
      typedef nsScannerBufferList::Buffer      Buffer;
      typedef nsScannerBufferList::Position    Position;
      typedef uint32_t                         size_type;

      nsScannerSubstring();
      explicit nsScannerSubstring( const nsAString& s );

      ~nsScannerSubstring();

        // Set |iter| to the first character / one past the last character of
        // this substring and return it.
      nsScannerIterator& BeginReading( nsScannerIterator& iter ) const;
      nsScannerIterator& EndReading( nsScannerIterator& iter ) const;

        // Number of characters, as tracked in mLength.
      size_type Length() const { return mLength; }

      int32_t CountChar( char16_t ) const;

      void Rebind( const nsScannerSubstring&, const nsScannerIterator&, const nsScannerIterator& );
      void Rebind( const nsAString& );

        // Returns the flattened representation kept in mFlattenedRep.
      const nsSubstring& AsString() const;

        // Step |frag| to the following / preceding buffer, clipped to this
        // substring's range.  Return false, leaving |frag| unchanged, when
        // |frag| is already in the last / first buffer of the range.
      bool GetNextFragment( nsScannerFragment& ) const;
      bool GetPrevFragment( nsScannerFragment& ) const;

        // Convenience forwarders to the nsScannerBufferList allocators.
      static inline Buffer* AllocBufferFromString( const nsAString& aStr ) { return nsScannerBufferList::AllocBufferFromString(aStr); }
      static inline Buffer* AllocBuffer( size_type aCapacity )             { return nsScannerBufferList::AllocBuffer(aCapacity); }

    protected:

        // Take a reference on the buffer list and a usage count on the
        // buffer that mStart points into.  Does not check mBufferList for
        // null.
      void acquire_ownership_of_buffer_list() const
        {
          mBufferList->AddRef();
          mStart.mBuffer->IncrementUsageCount();
        }

        // Undo acquire_ownership_of_buffer_list(); does nothing when there
        // is no buffer list.  The buffer list gets the chance to discard
        // unreferenced leading buffers before the reference is released.
      void release_ownership_of_buffer_list()
        {
          if (mBufferList)
            {
              mStart.mBuffer->DecrementUsageCount();
              mBufferList->DiscardUnreferencedPrefix(mStart.mBuffer);
              mBufferList->Release();
            }
        }

        // Make this substring span the whole buffer list, from the start of
        // the head buffer's data to the end of the tail buffer's data, and
        // recompute mLength.
      void init_range_from_buffer_list()
        {
          mStart.mBuffer = mBufferList->Head();
          mStart.mPosition = mStart.mBuffer->DataStart();

          mEnd.mBuffer = mBufferList->Tail();
          mEnd.mPosition = mEnd.mBuffer->DataEnd();

          mLength = Position::Distance(mStart, mEnd);
        }

      Position             mStart;       // first character of the substring
      Position             mEnd;         // one past the last character
      nsScannerBufferList *mBufferList;  // may be null (see release_ownership_of_buffer_list)
      size_type            mLength;

      // these fields are used to implement AsString
      nsDependentSubstring mFlattenedRep;
      bool                 mIsDirty;     // set by the mutating methods; cleared once AsString has refreshed mFlattenedRep

      friend class nsScannerSharedSubstring;
  };
/**
 * nsScannerSharedSubstring implements copy-on-write semantics for
 * nsScannerSubstring.  When you call .writable(), it will copy the data
 * and return a mutable string object.  This class also manages releasing
 * the reference to the scanner buffer when it is no longer needed.
 *
 * NOTE(review): the destructor releases the references held in
 * mBuffer/mBufferList, but no copy constructor or copy assignment is
 * declared, so a compiler-generated copy would duplicate those pointers
 * without taking additional references -- confirm instances are never
 * copied.
 */

class nsScannerSharedSubstring
  {
    public:
        // Starts out holding no scanner buffer.
      nsScannerSharedSubstring()
        : mBuffer(nullptr), mBufferList(nullptr) { }

        // Drops the buffer references, if any are held.
      ~nsScannerSharedSubstring()
        {
          if (mBufferList)
            ReleaseBuffer();
        }

        // Acquire a copy-on-write reference to the given substring.
        // When both iterators are in the same buffer the string shares that
        // buffer; otherwise the characters are copied.
      void Rebind(const nsScannerIterator& aStart,
                              const nsScannerIterator& aEnd);

       // Get a mutable reference to this string.
       // If the string currently shares a scanner buffer, the data is copied
       // and the buffer released first.
      nsSubstring& writable()
        {
          if (mBufferList)
            MakeMutable();

          return mString;
        }

        // Get a const reference to this string
      const nsSubstring& str() const { return mString; }

    private:
      typedef nsScannerBufferList::Buffer Buffer;

        // Precondition: mBufferList is non-null.
      void ReleaseBuffer();
        // Copies the shared data into mString, then releases the buffer.
      void MakeMutable();

      nsDependentSubstring  mString;
      Buffer               *mBuffer;      // shared buffer, or null when mString holds its own copy
      nsScannerBufferList  *mBufferList;  // list owning mBuffer; non-null exactly while a buffer is shared
  };
+ */ +class nsScannerIterator + { + public: + typedef nsScannerIterator self_type; + typedef ptrdiff_t difference_type; + typedef char16_t value_type; + typedef const char16_t* pointer; + typedef const char16_t& reference; + typedef nsScannerSubstring::Buffer Buffer; + + protected: + + nsScannerFragment mFragment; + const char16_t* mPosition; + const nsScannerSubstring* mOwner; + + friend class nsScannerSubstring; + friend class nsScannerSharedSubstring; + + public: + // nsScannerIterator(); // auto-generate default constructor is OK + // nsScannerIterator( const nsScannerIterator& ); // auto-generated copy-constructor OK + // nsScannerIterator& operator=( const nsScannerIterator& ); // auto-generated copy-assignment operator OK + + inline void normalize_forward(); + inline void normalize_backward(); + + pointer get() const + { + return mPosition; + } + + char16_t operator*() const + { + return *get(); + } + + const nsScannerFragment& fragment() const + { + return mFragment; + } + + const Buffer* buffer() const + { + return mFragment.mBuffer; + } + + self_type& operator++() + { + ++mPosition; + normalize_forward(); + return *this; + } + + self_type operator++( int ) + { + self_type result(*this); + ++mPosition; + normalize_forward(); + return result; + } + + self_type& operator--() + { + normalize_backward(); + --mPosition; + return *this; + } + + self_type operator--( int ) + { + self_type result(*this); + normalize_backward(); + --mPosition; + return result; + } + + difference_type size_forward() const + { + return mFragment.mFragmentEnd - mPosition; + } + + difference_type size_backward() const + { + return mPosition - mFragment.mFragmentStart; + } + + self_type& advance( difference_type n ) + { + while ( n > 0 ) + { + difference_type one_hop = std::min(n, size_forward()); + + NS_ASSERTION(one_hop>0, "Infinite loop: can't advance a reading iterator beyond the end of a string"); + // perhaps I should |break| if |!one_hop|? 
+ + mPosition += one_hop; + normalize_forward(); + n -= one_hop; + } + + while ( n < 0 ) + { + normalize_backward(); + difference_type one_hop = std::max(n, -size_backward()); + + NS_ASSERTION(one_hop<0, "Infinite loop: can't advance (backward) a reading iterator beyond the end of a string"); + // perhaps I should |break| if |!one_hop|? + + mPosition += one_hop; + n -= one_hop; + } + + return *this; + } + }; + + +inline +bool +SameFragment( const nsScannerIterator& a, const nsScannerIterator& b ) + { + return a.fragment().mFragmentStart == b.fragment().mFragmentStart; + } + + + /** + * this class is needed in order to make use of the methods in nsAlgorithm.h + */ +template <> +struct nsCharSourceTraits<nsScannerIterator> + { + typedef nsScannerIterator::difference_type difference_type; + + static + uint32_t + readable_distance( const nsScannerIterator& first, const nsScannerIterator& last ) + { + return uint32_t(SameFragment(first, last) ? last.get() - first.get() : first.size_forward()); + } + + static + const nsScannerIterator::value_type* + read( const nsScannerIterator& iter ) + { + return iter.get(); + } + + static + void + advance( nsScannerIterator& s, difference_type n ) + { + s.advance(n); + } + }; + + + /** + * inline methods follow + */ + +inline +void +nsScannerIterator::normalize_forward() + { + while (mPosition == mFragment.mFragmentEnd && mOwner->GetNextFragment(mFragment)) + mPosition = mFragment.mFragmentStart; + } + +inline +void +nsScannerIterator::normalize_backward() + { + while (mPosition == mFragment.mFragmentStart && mOwner->GetPrevFragment(mFragment)) + mPosition = mFragment.mFragmentEnd; + } + +inline +bool +operator==( const nsScannerIterator& lhs, const nsScannerIterator& rhs ) + { + return lhs.get() == rhs.get(); + } + +inline +bool +operator!=( const nsScannerIterator& lhs, const nsScannerIterator& rhs ) + { + return lhs.get() != rhs.get(); + } + + +inline +nsScannerBufferList::Position::Position(const nsScannerIterator& aIter) + : 
mBuffer(const_cast<Buffer*>(aIter.buffer())) + , mPosition(const_cast<char16_t*>(aIter.get())) + {} + +inline +nsScannerBufferList::Position& +nsScannerBufferList::Position::operator=(const nsScannerIterator& aIter) + { + mBuffer = const_cast<Buffer*>(aIter.buffer()); + mPosition = const_cast<char16_t*>(aIter.get()); + return *this; + } + + + /** + * scanner string utils + * + * These methods mimic the API provided by nsReadableUtils in xpcom/string. + * Here we provide only the methods that the htmlparser module needs. + */ + +inline +size_t +Distance( const nsScannerIterator& aStart, const nsScannerIterator& aEnd ) + { + typedef nsScannerBufferList::Position Position; + return Position::Distance(Position(aStart), Position(aEnd)); + } + +bool +CopyUnicodeTo( const nsScannerIterator& aSrcStart, + const nsScannerIterator& aSrcEnd, + nsAString& aDest ); + +inline +bool +CopyUnicodeTo( const nsScannerSubstring& aSrc, nsAString& aDest ) + { + nsScannerIterator begin, end; + return CopyUnicodeTo(aSrc.BeginReading(begin), aSrc.EndReading(end), aDest); + } + +bool +AppendUnicodeTo( const nsScannerIterator& aSrcStart, + const nsScannerIterator& aSrcEnd, + nsAString& aDest ); + +inline +bool +AppendUnicodeTo( const nsScannerSubstring& aSrc, nsAString& aDest ) + { + nsScannerIterator begin, end; + return AppendUnicodeTo(aSrc.BeginReading(begin), aSrc.EndReading(end), aDest); + } + +bool +AppendUnicodeTo( const nsScannerIterator& aSrcStart, + const nsScannerIterator& aSrcEnd, + nsScannerSharedSubstring& aDest ); + +bool +FindCharInReadable( char16_t aChar, + nsScannerIterator& aStart, + const nsScannerIterator& aEnd ); + +bool +FindInReadable( const nsAString& aPattern, + nsScannerIterator& aStart, + nsScannerIterator& aEnd, + const nsStringComparator& = nsDefaultStringComparator() ); + +bool +RFindInReadable( const nsAString& aPattern, + nsScannerIterator& aStart, + nsScannerIterator& aEnd, + const nsStringComparator& = nsDefaultStringComparator() ); + +inline +bool 
/**
 * Token type identifiers.  The numeric values are contiguous, starting at 0
 * for eToken_unknown and ending with eToken_last.
 */
enum eHTMLTokenTypes {
  eToken_unknown      = 0,
  eToken_start        = 1,
  eToken_end          = 2,
  eToken_comment      = 3,
  eToken_entity       = 4,
  eToken_whitespace   = 5,
  eToken_newline      = 6,
  eToken_text         = 7,
  eToken_attribute    = 8,
  eToken_instruction  = 9,
  eToken_cdatasection = 10,
  eToken_doctypeDecl  = 11,
  eToken_markupDecl   = 12,
  eToken_last         = 13 //make sure this stays the last token...
};