diff options
Diffstat (limited to 'modules/PageMetadata.jsm')
-rw-r--r-- | modules/PageMetadata.jsm | 297 |
1 files changed, 297 insertions, 0 deletions
diff --git a/modules/PageMetadata.jsm b/modules/PageMetadata.jsm new file mode 100644 index 000000000..820ec38ea --- /dev/null +++ b/modules/PageMetadata.jsm @@ -0,0 +1,297 @@ +/* This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ + +"use strict"; + +this.EXPORTED_SYMBOLS = ["PageMetadata"]; + +const {classes: Cc, interfaces: Ci, utils: Cu, results: Cr} = Components; + +Cu.import("resource://gre/modules/Services.jsm"); +Cu.import("resource://gre/modules/XPCOMUtils.jsm"); +Cu.import("resource://gre/modules/microformat-shiv.js"); + +XPCOMUtils.defineLazyServiceGetter(this, "UnescapeService", + "@mozilla.org/feed-unescapehtml;1", + "nsIScriptableUnescapeHTML"); + + +/** + * Maximum number of images to discover in the document, when no preview images + * are explicitly specified by the metadata. + * @type {Number} + */ +const DISCOVER_IMAGES_MAX = 5; + + +/** + * Extract metadata and microformats from a HTML document. + * @type {Object} + */ +this.PageMetadata = { + /** + * Get all metadata from an HTML document. This includes: + * - URL + * - title + * - Metadata specified in <meta> tags, including OpenGraph data + * - Links specified in <link> tags (short, canonical, preview images, alternative) + * - Content that can be found in the page content that we consider useful metadata + * - Microformats + * + * @param {Document} document - Document to extract data from. + * @param {Element} [target] - Optional element to restrict microformats lookup to. + * @returns {Object} Object containing the various metadata, normalized to + * merge some common alternative names for metadata. + */ + getData(document, target = null) { + let result = { + url: this._validateURL(document, document.documentURI), + title: document.title, + previews: [], + }; + + // if pushState was used to change the url, most likely all meta data is + // invalid. This is the case with several major sites that rely on + // pushState. In that case, we'll only return uri and title. If document is + // via XHR or something, there is no view or history. + if (document.defaultView) { + let docshell = document.defaultView.QueryInterface(Ci.nsIInterfaceRequestor) + .getInterface(Ci.nsIWebNavigation) + .QueryInterface(Ci.nsIDocShell); + let shentry = {}; + if (docshell.getCurrentSHEntry(shentry) && + shentry.value && shentry.value.URIWasModified) { + return result; + } + } + + this._getMetaData(document, result); + this._getLinkData(document, result); + this._getPageData(document, result); + result.microformats = this.getMicroformats(document, target); + + return result; + }, + + getMicroformats(document, target = null) { + if (target) { + return Microformats.getParent(target, {node: document}); + } + return Microformats.get({node: document}); + }, + + /** + * Get metadata as defined in <meta> tags. + * This adds properties to an existing result object. + * + * @param {Document} document - Document to extract data from. + * @param {Object} result - Existing result object to add properties to. + */ + _getMetaData(document, result) { + // Query for standardized meta data. + let elements = document.querySelectorAll("head > meta[property], head > meta[name]"); + if (elements.length < 1) { + return; + } + + for (let element of elements) { + let value = element.getAttribute("content") + if (!value) { + continue; + } + value = UnescapeService.unescape(value.trim()); + + let key = element.getAttribute("property") || element.getAttribute("name"); + if (!key) { + continue; + } + + // There are a wide array of possible meta tags, expressing articles, + // products, etc. so all meta tags are passed through but we touch up the + // most common attributes. + result[key] = value; + + switch (key) { + case "title": + case "og:title": { + // Only set the title if one hasn't already been obtained (e.g. from the + // document title element). + if (!result.title) { + result.title = value; + } + break; + } + + case "description": + case "og:description": { + result.description = value; + break; + } + + case "og:site_name": { + result.siteName = value; + break; + } + + case "medium": + case "og:type": { + result.medium = value; + break; + } + + case "og:video": { + let url = this._validateURL(document, value); + if (url) { + result.source = url; + } + break; + } + + case "og:url": { + let url = this._validateURL(document, value); + if (url) { + result.url = url; + } + break; + } + + case "og:image": { + let url = this._validateURL(document, value); + if (url) { + result.previews.push(url); + } + break; + } + } + } + }, + + /** + * Get metadata as defined in <link> tags. + * This adds properties to an existing result object. + * + * @param {Document} document - Document to extract data from. + * @param {Object} result - Existing result object to add properties to. + */ + _getLinkData: function(document, result) { + let elements = document.querySelectorAll("head > link[rel], head > link[id]"); + + for (let element of elements) { + let url = element.getAttribute("href"); + if (!url) { + continue; + } + url = this._validateURL(document, UnescapeService.unescape(url.trim())); + + let key = element.getAttribute("rel") || element.getAttribute("id"); + if (!key) { + continue; + } + + switch (key) { + case "shorturl": + case "shortlink": { + result.shortUrl = url; + break; + } + + case "canonicalurl": + case "canonical": { + result.url = url; + break; + } + + case "image_src": { + result.previews.push(url); + break; + } + + case "alternate": { + // Expressly for oembed support but we're liberal here and will let + // other alternate links through. oembed defines an href, supplied by + // the site, where you can fetch additional meta data about a page. + // We'll let the client fetch the oembed data themselves, but they + // need the data from this link. + if (!result.alternate) { + result.alternate = []; + } + + result.alternate.push({ + type: element.getAttribute("type"), + href: element.getAttribute("href"), + title: element.getAttribute("title") + }); + } + } + } + }, + + /** + * Scrape thought the page content for additional content that may be used to + * suppliment explicitly defined metadata. This includes: + * - First few images, when no preview image metadata is explicitly defined. + * + * This adds properties to an existing result object. + * + * @param {Document} document - Document to extract data from. + * @param {Object} result - Existing result object to add properties to. + */ + _getPageData(document, result) { + if (result.previews.length < 1) { + result.previews = this._getImageUrls(document); + } + }, + + /** + * Find the first few images in a document, for use as preview images. + * Will return upto DISCOVER_IMAGES_MAX number of images. + * + * @note This is not very clever. It does not (yet) check if any of the + * images may be appropriate as a preview image. + * + * @param {Document} document - Document to extract data from. + * @return {[string]} Array of URLs. + */ + _getImageUrls(document) { + let result = []; + let elements = document.querySelectorAll("img"); + + for (let element of elements) { + let src = element.getAttribute("src"); + if (src) { + result.push(this._validateURL(document, UnescapeService.unescape(src))); + + // We don't want a billion images. + // TODO: Move this magic number to a const. + if (result.length > DISCOVER_IMAGES_MAX) { + break; + } + } + } + + return result; + }, + + /** + * Validate a URL. This involves resolving the URL if it's relative to the + * document location, ensuring it's using an expected scheme, and stripping + * the userPass portion of the URL. + * + * @param {Document} document - Document to use as the root location for a relative URL. + * @param {string} url - URL to validate. + * @return {string} Result URL. + */ + _validateURL(document, url) { + let docURI = Services.io.newURI(document.documentURI, null, null); + let uri = Services.io.newURI(docURI.resolve(url), null, null); + + if (["http", "https"].indexOf(uri.scheme) < 0) { + return null; + } + + uri.userPass = ""; + + return uri.spec; + }, +}; |