summaryrefslogtreecommitdiff
path: root/modules/PageMetadata.jsm
diff options
context:
space:
mode:
Diffstat (limited to 'modules/PageMetadata.jsm')
-rw-r--r-- modules/PageMetadata.jsm 297
1 files changed, 297 insertions, 0 deletions
diff --git a/modules/PageMetadata.jsm b/modules/PageMetadata.jsm
new file mode 100644
index 000000000..820ec38ea
--- /dev/null
+++ b/modules/PageMetadata.jsm
@@ -0,0 +1,297 @@
+/* This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+"use strict";
+
+this.EXPORTED_SYMBOLS = ["PageMetadata"];
+
+const {classes: Cc, interfaces: Ci, utils: Cu, results: Cr} = Components;
+
+Cu.import("resource://gre/modules/Services.jsm");
+Cu.import("resource://gre/modules/XPCOMUtils.jsm");
+Cu.import("resource://gre/modules/microformat-shiv.js");
+
+XPCOMUtils.defineLazyServiceGetter(this, "UnescapeService",
+ "@mozilla.org/feed-unescapehtml;1",
+ "nsIScriptableUnescapeHTML");
+
+
+/**
+ * Maximum number of images to discover in the document, when no preview images
+ * are explicitly specified by the metadata. Consulted by
+ * PageMetadata._getImageUrls() while it scans the document's <img> elements.
+ * @type {Number}
+ */
+const DISCOVER_IMAGES_MAX = 5;
+
+
+/**
+ * Extract metadata and microformats from an HTML document.
+ * @type {Object}
+ */
+this.PageMetadata = {
+ /**
+ * Get all metadata from an HTML document. This includes:
+ * - URL
+ * - title
+ * - Metadata specified in <meta> tags, including OpenGraph data
+ * - Links specified in <link> tags (short, canonical, preview images, alternative)
+ * - Content that can be found in the page content that we consider useful metadata
+ * - Microformats
+ *
+ * @param {Document} document - Document to extract data from.
+ * @param {Element} [target] - Optional element to restrict microformats lookup to.
+ * @returns {Object} Object containing the various metadata, normalized to
+ * merge some common alternative names for metadata.
+ */
+ getData(document, target = null) {
+ let result = {
+ url: this._validateURL(document, document.documentURI),
+ title: document.title,
+ previews: [],
+ };
+
+ // if pushState was used to change the url, most likely all meta data is
+ // invalid. This is the case with several major sites that rely on
+ // pushState. In that case, we'll only return uri and title. If document is
+ // via XHR or something, there is no view or history.
+ if (document.defaultView) {
+ let docshell = document.defaultView.QueryInterface(Ci.nsIInterfaceRequestor)
+ .getInterface(Ci.nsIWebNavigation)
+ .QueryInterface(Ci.nsIDocShell);
+ let shentry = {};
+ if (docshell.getCurrentSHEntry(shentry) &&
+ shentry.value && shentry.value.URIWasModified) {
+ return result;
+ }
+ }
+
+ this._getMetaData(document, result);
+ this._getLinkData(document, result);
+ this._getPageData(document, result);
+ result.microformats = this.getMicroformats(document, target);
+
+ return result;
+ },
+
+ getMicroformats(document, target = null) {
+ if (target) {
+ return Microformats.getParent(target, {node: document});
+ }
+ return Microformats.get({node: document});
+ },
+
+ /**
+ * Get metadata as defined in <meta> tags.
+ * This adds properties to an existing result object.
+ *
+ * @param {Document} document - Document to extract data from.
+ * @param {Object} result - Existing result object to add properties to.
+ */
+ _getMetaData(document, result) {
+ // Query for standardized meta data.
+ let elements = document.querySelectorAll("head > meta[property], head > meta[name]");
+ if (elements.length < 1) {
+ return;
+ }
+
+ for (let element of elements) {
+ let value = element.getAttribute("content")
+ if (!value) {
+ continue;
+ }
+ value = UnescapeService.unescape(value.trim());
+
+ let key = element.getAttribute("property") || element.getAttribute("name");
+ if (!key) {
+ continue;
+ }
+
+ // There are a wide array of possible meta tags, expressing articles,
+ // products, etc. so all meta tags are passed through but we touch up the
+ // most common attributes.
+ result[key] = value;
+
+ switch (key) {
+ case "title":
+ case "og:title": {
+ // Only set the title if one hasn't already been obtained (e.g. from the
+ // document title element).
+ if (!result.title) {
+ result.title = value;
+ }
+ break;
+ }
+
+ case "description":
+ case "og:description": {
+ result.description = value;
+ break;
+ }
+
+ case "og:site_name": {
+ result.siteName = value;
+ break;
+ }
+
+ case "medium":
+ case "og:type": {
+ result.medium = value;
+ break;
+ }
+
+ case "og:video": {
+ let url = this._validateURL(document, value);
+ if (url) {
+ result.source = url;
+ }
+ break;
+ }
+
+ case "og:url": {
+ let url = this._validateURL(document, value);
+ if (url) {
+ result.url = url;
+ }
+ break;
+ }
+
+ case "og:image": {
+ let url = this._validateURL(document, value);
+ if (url) {
+ result.previews.push(url);
+ }
+ break;
+ }
+ }
+ }
+ },
+
+ /**
+ * Get metadata as defined in <link> tags.
+ * This adds properties to an existing result object.
+ *
+ * @param {Document} document - Document to extract data from.
+ * @param {Object} result - Existing result object to add properties to.
+ */
+ _getLinkData: function(document, result) {
+ let elements = document.querySelectorAll("head > link[rel], head > link[id]");
+
+ for (let element of elements) {
+ let url = element.getAttribute("href");
+ if (!url) {
+ continue;
+ }
+ url = this._validateURL(document, UnescapeService.unescape(url.trim()));
+
+ let key = element.getAttribute("rel") || element.getAttribute("id");
+ if (!key) {
+ continue;
+ }
+
+ switch (key) {
+ case "shorturl":
+ case "shortlink": {
+ result.shortUrl = url;
+ break;
+ }
+
+ case "canonicalurl":
+ case "canonical": {
+ result.url = url;
+ break;
+ }
+
+ case "image_src": {
+ result.previews.push(url);
+ break;
+ }
+
+ case "alternate": {
+ // Expressly for oembed support but we're liberal here and will let
+ // other alternate links through. oembed defines an href, supplied by
+ // the site, where you can fetch additional meta data about a page.
+ // We'll let the client fetch the oembed data themselves, but they
+ // need the data from this link.
+ if (!result.alternate) {
+ result.alternate = [];
+ }
+
+ result.alternate.push({
+ type: element.getAttribute("type"),
+ href: element.getAttribute("href"),
+ title: element.getAttribute("title")
+ });
+ }
+ }
+ }
+ },
+
+ /**
+ * Scrape thought the page content for additional content that may be used to
+ * suppliment explicitly defined metadata. This includes:
+ * - First few images, when no preview image metadata is explicitly defined.
+ *
+ * This adds properties to an existing result object.
+ *
+ * @param {Document} document - Document to extract data from.
+ * @param {Object} result - Existing result object to add properties to.
+ */
+ _getPageData(document, result) {
+ if (result.previews.length < 1) {
+ result.previews = this._getImageUrls(document);
+ }
+ },
+
+ /**
+ * Find the first few images in a document, for use as preview images.
+ * Will return upto DISCOVER_IMAGES_MAX number of images.
+ *
+ * @note This is not very clever. It does not (yet) check if any of the
+ * images may be appropriate as a preview image.
+ *
+ * @param {Document} document - Document to extract data from.
+ * @return {[string]} Array of URLs.
+ */
+ _getImageUrls(document) {
+ let result = [];
+ let elements = document.querySelectorAll("img");
+
+ for (let element of elements) {
+ let src = element.getAttribute("src");
+ if (src) {
+ result.push(this._validateURL(document, UnescapeService.unescape(src)));
+
+ // We don't want a billion images.
+ // TODO: Move this magic number to a const.
+ if (result.length > DISCOVER_IMAGES_MAX) {
+ break;
+ }
+ }
+ }
+
+ return result;
+ },
+
+ /**
+ * Validate a URL. This involves resolving the URL if it's relative to the
+ * document location, ensuring it's using an expected scheme, and stripping
+ * the userPass portion of the URL.
+ *
+ * @param {Document} document - Document to use as the root location for a relative URL.
+ * @param {string} url - URL to validate.
+ * @return {string} Result URL.
+ */
+ _validateURL(document, url) {
+ let docURI = Services.io.newURI(document.documentURI, null, null);
+ let uri = Services.io.newURI(docURI.resolve(url), null, null);
+
+ if (["http", "https"].indexOf(uri.scheme) < 0) {
+ return null;
+ }
+
+ uri.userPass = "";
+
+ return uri.spec;
+ },
+};