summaryrefslogtreecommitdiff
path: root/toolkit/components/microformats/test/lib/text.js
blob: fe94dae0a3f44be3dcfb5e04c4e7ae0473f82972 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
/*
	text
	Extracts a text string from DOM nodes. It was created to extract text in a whitespace-normalized form.
	It works like a non-CSS-aware version of IE's innerText function. DO NOT replace this module
	with functions such as textContent, as that will reduce the quality of the data provided to the API user.

	Copyright (C) 2010 - 2015 Glenn Jones. All Rights Reserved.
	MIT License: https://raw.github.com/glennjones/microformat-shiv/master/license.txt
	Dependencies  utilities.js, domutils.js
*/


var Modules = (function (modules) {


	modules.text = {

		// default output format: 'normalised', 'whitespace' or 'whitespacetrimmed'
		// NOTE: parse() and formatText() overwrite this value whenever they are given a
		// truthy textFormat argument, so the last format passed stays in effect for later calls
		textFormat: 'whitespacetrimmed',

		// block level tags, walkTreeForText appends a space after the text of each of these
		// so that words from adjacent blocks do not run together
		blockLevelTags: ['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'hr', 'pre', 'table',
			'address', 'article', 'aside', 'blockquote', 'caption', 'col', 'colgroup', 'dd', 'div',
			'dt', 'dir', 'fieldset', 'figcaption', 'figure', 'footer', 'form',  'header', 'hgroup',
			'li', 'map', 'menu', 'nav', 'optgroup', 'option', 'section', 'tbody', 'textarea',
			'tfoot', 'th', 'thead', 'tr', 'td', 'ul', 'ol', 'dl', 'details'],

		// tags to exclude - the text inside these elements is never returned by walkTreeForText
		// ('noframe' and 'frames' are kept for backward compatibility, 'noframes' is the real element name)
		excludeTags: ['noframe', 'noframes', 'noscript', 'template', 'script', 'style', 'frames', 'frameset'],


		/**
		 * parses the text from the DOM Node
		 *
		 * With the 'normalised' format the tree is walked node by node and the result is
		 * whitespace-normalised. With any other format the node's text content is passed to formatText.
		 *
		 * @param  {DOM Document} doc
		 * @param  {DOM Node} node
		 * @param  {String} textFormat (optional - falls back to this.textFormat)
		 * @return {String}
		 */
		parse: function(doc, node, textFormat){
			var out;

			// a supplied format replaces the stored default (see note on textFormat above)
			this.textFormat = (textFormat)? textFormat : this.textFormat;

			if(this.textFormat === 'normalised'){
				out = this.walkTreeForText( node );
				// walkTreeForText returns undefined when no text was found
				return (out !== undefined)? this.normalise( doc, out ) : '';
			}

			return this.formatText( doc, modules.domUtils.textContent(node), this.textFormat );
		},


		/**
		 * parses the text from a html string
		 *
		 * The string is handed to domUtils.createNodeWithText to build a 'div' node, which is then
		 * parsed like any other node.
		 *
		 * @param  {DOM Document} doc
		 * @param  {String} text
		 * @param  {String} textFormat
		 * @return {String}
		 */
		parseText: function( doc, text, textFormat ){
			var node = modules.domUtils.createNodeWithText( 'div', text );
			return this.parse( doc, node, textFormat );
		},


		/**
		 * parses the text from a html string - only for whitespace or whitespacetrimmed formats
		 *
		 * Strips anything that looks like a tag, trims the whitespace when the format is
		 * 'whitespacetrimmed' and decodes HTML entities. Returns '' for empty or missing text.
		 *
		 * @param  {DOM Document} doc
		 * @param  {String} text
		 * @param  {String} textFormat (optional - falls back to this.textFormat)
		 * @return {String}
		 */
		formatText: function( doc, text, textFormat ){
			var tagRegex = /(<([^>]+)>)/ig,
				out;

			// a supplied format replaces the stored default (see note on textFormat above)
			this.textFormat = (textFormat)? textFormat : this.textFormat;

			if(!text){
				return '';
			}

			out = text.replace( tagRegex, '' );
			if(this.textFormat === 'whitespacetrimmed') {
				out = modules.utils.trimWhitespace( out );
			}

			return modules.domUtils.decodeEntities( doc, out );
		},


		/**
		 * normalises whitespace in given text
		 *
		 * @param  {DOM Document} doc
		 * @param  {String} text
		 * @return {String}
		 */
		normalise: function( doc, text ){
			text = text.replace( /&nbsp;/g, ' ') ;    // exchanges html entity for space into space char
			text = modules.utils.collapseWhiteSpace( text );     // removes linefeeds, tabs and additional spaces
			text = modules.domUtils.decodeEntities( doc, text );  // decode HTML entities
			// correct dash decoding - a regex with the g flag is needed because a string
			// pattern would only replace the first en dash (U+2013)
			text = text.replace( /\u2013/g, '-' );
			return modules.utils.trim( text );
		},


		/**
		 * walks DOM tree parsing the text from DOM Nodes
		 *
		 * Returns '' for a node whose tag is in excludeTags and undefined when the node is
		 * missing or no text was collected at all.
		 *
		 * @param  {DOM Node} node
		 * @return {String|undefined}
		 */
		walkTreeForText: function( node ) {
			var out = '',
				tagName,
				text,
				j;

			// nothing to walk - report "no text" rather than throwing on a missing node
			if(!node){
				return undefined;
			}

			tagName = (node.tagName)? node.tagName.toLowerCase() : null;

			// excluded elements contribute no text and their children are not visited
			if(tagName && this.excludeTags.indexOf( tagName ) > -1){
				return out;
			}

			// if node is a text node (nodeType 3) get its text
			if(node.nodeType && node.nodeType === 3){
				out += modules.domUtils.getElementText( node );
			}

			// get the text of the child nodes
			if(node.childNodes && node.childNodes.length > 0){
				for (j = 0; j < node.childNodes.length; j++) {
					text = this.walkTreeForText( node.childNodes[j] );
					if(text !== undefined){
						out += text;
					}
				}
			}

			// if it's a block level tag add an additional space at the end
			if(tagName && this.blockLevelTags.indexOf( tagName ) !== -1){
				out += ' ';
			}

			return (out === '')? undefined : out ;
		}

	};

	return modules;

} (Modules || {}));