beautify-html.js
27.9 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
/*
Style HTML
---------------
Written by Nochum Sossonko, (nsossonko@hotmail.com)
Based on code initially developed by: Einar Lielmanis, <elfz@laacz.lv>
http://jsbeautifier.org
You are free to use this in any way you want, in case you find this useful or working for you.
Usage:
style_html(html_source);
Recent changes:
2013-04-08, bsy-web (bsy.github@gmail.com)
- added: (array) Parser.Utils.inline_token - to hold list of HTML inline elements for identification
- added: (string) new TK_TAG_INLINE token - to handle formatting of inline tags specifically
- added: (bool) Parser.found_leading_whitespace - to help with conditional inline tag formatting
- added: (bool) Parser.found_trailing_whitespace - to help with conditional inline tag formatting
- added: Comments throughout the file to help follow flow of data and understand algorithm
- fixed: Space left when trimming whitespace after "<" inside tag (e.g. "< tag>"). Now outputs "<tag>"
- changed: get_token(), get_string, get_content(), and get_tag() only return array[string, string] (previously array[string, string] or just string)
- changed: unfinished content (unclosed tags or scripts) now get formatted even when EOF is reached
- todo: add option to print inline tags on same line as content (inline tags currently get newline if whitespace allows)
- todo: add indentation formatting for nested inline tags (e.g. <em>italic <strong>and bold</strong><em>)
- todo: create test driver script with input test cases to track how changes to this file affect final output (regression testing)
*/
/**
 * Re-indent an HTML string.
 *
 * The source is consumed as an alternating stream of "content" and "tag"
 * tokens. Block tags are printed on their own line and increase the indent
 * level for what follows; inline tags (see Utils.inline_token) are captured
 * together with everything up to their closing tag and are only moved to a
 * new line when the surrounding whitespace in the source allows it.
 *
 * @param {string} html_source       Markup to format; a falsy value is treated as ''.
 * @param {number} [indent_size]     Number of indent characters per level (falsy -> 2).
 * @param {string} [indent_character] Character repeated to build one indent unit (falsy -> ' ').
 * @param {number} [max_char]        Line length after which content/attributes are wrapped (falsy -> 70).
 * @param {string} [brace_style]     Forwarded to js_beautify for <script> bodies (falsy -> 'collapse').
 * @returns {string} The formatted markup.
 */
function style_html(html_source, indent_size, indent_character,
    max_char, brace_style) {
    var multi_parser;

    /* Function: Parser()
       Initializes the parsing state and attaches the tokenizer methods that
       the driver loop at the bottom of style_html calls on the instance. */
    function Parser() {
        this.pos = 0; // index of the next unread character of this.input
        this.found_leading_whitespace = false;
        this.found_trailing_whitespace = false;
        this.token = '';
        this.current_mode = 'CONTENT'; // which tokenizer runs next: 'TAG' or 'CONTENT'
        // Open-tag bookkeeping. For a tag "div" the keys are: "divcount" (how
        // many are open), "div<N>" (indent level when the Nth was opened) and
        // "div<N>parent" (id of the enclosing open tag). "parent" is the id of
        // the innermost open tag.
        this.tags = {
            parent: 'parent1',
            parentcount: 1,
            parent1: ''
        };
        this.tag_type = '';
        this.token_text = this.last_token = this.last_tag_token = this.last_text = this.token_type =
            '';

        this.Utils = { // Utilities made available to the various functions
            // Characters considered white-space
            whitespace: "\n\r\t ".split(''),
            /*
              HTML elements treated as inline: their whole body is captured
              verbatim and line breaks around them depend on source whitespace.
              Some elements are both inline and single (e.g. br, input, img);
              get_tag() tests single_token first, so those are handled as single.
            */
            inline_token: 'a,abbr,acronym,b,bdo,big,br,button,cite,code,dfn,em,i,img,input,kbd,label,map,object,q,samp,script,select,small,span,strong,sub,sup,textarea,tt'.split(','),
            // HTML elements without a closing tag pair
            single_token: '!doctype,?xml,area,base,basefont,br,embed,hr,img,input,isindex,link,meta,param,wbr'.split(','),
            // Elements which get an extra blank line before them (stylistic preference)
            extra_liners: 'body,head,/html'.split(','),
            // Returns true when `what` is strictly equal to an element of `arr`
            in_array: function (what, arr) {
                for (var i = 0; i < arr.length; i++) {
                    if (what === arr[i]) {
                        return true;
                    }
                }
                return false;
            }
        }; /* End of this.Utils */

        /* Function: get_content()
           Reads text up to (not including) the next '<'. Runs of whitespace are
           collapsed to a single space, or to a newline + indent once
           line_char_count has reached max_char. Leading/trailing whitespace is
           dropped but recorded in found_leading_whitespace /
           found_trailing_whitespace for the inline-tag logic in the driver.
           Returns [text, 'TK_CONTENT'] or, when input is exhausted, [text, 'TK_EOF']. */
        this.get_content = function () {
            var input_char = '';
            var content = [];
            var space = false; // a collapsed whitespace run is waiting to be emitted

            while (this.input.charAt(this.pos) !== '<') {
                if (this.pos >= this.input.length) { // EOF: hand back whatever was read
                    return [content.join(''), 'TK_EOF'];
                }

                input_char = this.input.charAt(this.pos);
                this.pos++;
                this.line_char_count++;

                if (this.Utils.in_array(input_char, this.Utils.whitespace)) {
                    if (content.length) {
                        space = true; // whitespace after some text: may become ' ' or a line break
                    } else {
                        this.found_leading_whitespace = true;
                    }
                    this.line_char_count--; // whitespace is not counted until it is actually emitted
                    this.found_trailing_whitespace = true; // last character seen so far is whitespace
                    continue;
                }

                if (space) {
                    if (this.line_char_count >= this.max_char) {
                        // Line is full: break here and indent to the current depth
                        content.push('\n');
                        for (var i = 0; i < this.indent_level; i++) {
                            content.push(this.indent_string);
                        }
                        this.line_char_count = 0;
                    } else {
                        content.push(' ');
                        this.line_char_count++;
                    }
                    space = false;
                }

                // Any non-whitespace character means the text does not (yet) end in
                // whitespace. This must also happen when only *leading* whitespace
                // preceded it, otherwise " y" would be reported as having trailing
                // whitespace and a following adjacent inline tag would be split off.
                this.found_trailing_whitespace = false;
                content.push(input_char);
            }
            return [content.join(''), 'TK_CONTENT'];
        }; /* End of Function: get_content() */

        /* Function: get_script()
           Reads everything up to the next "</script>" (case-insensitive) or EOF
           and runs it through js_beautify when that global is available.
           Returns [text, 'TK_CONTENT'] or [text, 'TK_EOF']. */
        this.get_script = function () {
            var closing_tag = new RegExp('</script>', 'ig');
            closing_tag.lastIndex = this.pos; // start searching at the current position
            var found = closing_tag.exec(this.input);
            var end_script = found ? found.index : this.input.length; // absolute end of script

            var script_source = this.input.substring(this.pos, end_script);
            this.pos = end_script;
            var reached_eof = (this.pos >= this.input.length);

            var script_text = '';
            if (script_source.length) {
                if (typeof js_beautify === 'function') {
                    // Partial scripts (EOF reached) are formatted as well
                    script_text = js_beautify(script_source, {
                        indent_size: this.indent_size,
                        indent_char: this.indent_character,
                        indent_level: this.indent_level,
                        brace_style: this.brace_style
                    });
                } else {
                    // js_beautify is not loaded: keep the script text unchanged
                    // rather than failing the whole HTML pass with a ReferenceError.
                    script_text = script_source;
                }
            }
            return [script_text, reached_eof ? 'TK_EOF' : 'TK_CONTENT'];
        }; /* End of Function: get_script() */

        /* Function: record_tag(tag)
           Registers a newly opened tag in this.tags, remembering the current
           indent level and the previously innermost tag as its parent. */
        this.record_tag = function (tag) {
            var count_key = tag + 'count';
            if (this.tags[count_key]) {
                this.tags[count_key]++;
            } else {
                this.tags[count_key] = 1;
            }
            var tag_id = tag + this.tags[count_key]; // e.g. 'div1'
            this.tags[tag_id] = this.indent_level;
            this.tags[tag_id + 'parent'] = this.tags.parent;
            this.tags.parent = tag_id;
        }; /* End of Function: record_tag(tag) */

        /* Function: retrieve_tag(tag)
           Handles a closing tag: if the most recently opened `tag` is found on
           the parent chain, the indent level is restored to the one recorded at
           its opening and its parent becomes the innermost tag again. The record
           of that opening tag is removed either way. Unknown tags are ignored. */
        this.retrieve_tag = function (tag) {
            var count_key = tag + 'count';
            if (!this.tags[count_key]) { // no such opener on record
                return;
            }
            var tag_id = tag + this.tags[count_key];

            // Climb from the innermost open tag until we hit tag_id or '' (the root)
            var temp_parent = this.tags.parent;
            while (temp_parent) {
                if (tag_id === temp_parent) {
                    break;
                }
                temp_parent = this.tags[temp_parent + 'parent'];
            }
            if (temp_parent) {
                this.indent_level = this.tags[tag_id];
                this.tags.parent = this.tags[temp_parent + 'parent'];
            }

            delete this.tags[tag_id + 'parent'];
            delete this.tags[tag_id];
            if (this.tags[count_key] === 1) {
                delete this.tags[count_key];
            } else {
                this.tags[count_key]--;
            }
        }; /* End of Function: retrieve_tag(tag) */

        /* Function: get_tag()
           Reads one tag starting at '<' through the first '>' outside of quotes,
           normalising whitespace between attributes, then classifies it and
           sets this.tag_type. For inline tags and comments/CDATA the remainder
           up to the matching terminator is appended verbatim.
           Returns [text, 'TK_TAG_' + tag_type], or [partial_text, 'TK_EOF'] when
           the input ends before '>'. */
        this.get_tag = function () {
            var input_char = '';
            var content = [];
            var space = false;
            var comment;

            do {
                if (this.pos >= this.input.length) {
                    // Unterminated tag: return what we have so it is not lost
                    return [content.join(''), 'TK_EOF'];
                }

                input_char = this.input.charAt(this.pos);
                this.pos++;
                this.line_char_count++;

                if (this.Utils.in_array(input_char, this.Utils.whitespace)) {
                    space = true;
                    this.line_char_count--;
                    continue; // whitespace is re-inserted below only where wanted
                }

                if (input_char === "'" || input_char === '"') {
                    // content[1] is the character right after '<'; a '!' there means a
                    // comment/doctype, in which quotes get no special treatment.
                    if (content[1] !== '!') {
                        input_char += this.get_unformatted(input_char); // swallow through the closing quote
                        space = true; // a space goes after the quoted value (suppressed below after '=' / before '>')
                    }
                }

                // No space right after '<' (e.g. "< a" -> "<a") and none before '='
                if (content[content.length - 1] === '<' || input_char === '=') {
                    space = false;
                }

                // Emit the pending whitespace unless it follows '=' or precedes '>'
                if (space && content.length &&
                    content[content.length - 1] !== '=' &&
                    input_char !== '>') {
                    if (this.line_char_count >= this.max_char) {
                        // Tag is too long for the line: break between attributes
                        this.print_newline(false, content);
                        this.line_char_count = 0;
                    } else {
                        content.push(' ');
                        this.line_char_count++;
                    }
                    space = false;
                }
                content.push(input_char);
            } while (input_char !== '>');

            // A complete "<...>" was captured
            var tag_complete = content.join('');
            // The tag name ends at the first whitespace (a space, or the newline
            // inserted above when the tag was wrapped) or at '>'.
            var tag_index = tag_complete.search(/[\s>]/);
            var tag_check = tag_complete.substring(1, tag_index).toLowerCase();

            if (tag_complete.charAt(tag_complete.length - 2) === '/' ||
                this.Utils.in_array(tag_check, this.Utils.single_token)) {
                // Self-closed ("/>") or a known void element
                this.tag_type = 'SINGLE';
            } else if (tag_check === 'script') {
                this.record_tag(tag_check);
                this.tag_type = 'SCRIPT'; // body is read by get_script()
            } else if (tag_check === 'style') {
                this.record_tag(tag_check);
                this.tag_type = 'STYLE'; // body is read by get_content()
            } else if (this.Utils.in_array(tag_check, this.Utils.inline_token)) {
                // Inline element: take everything through its closing tag as-is
                var inline_element = this.get_unformatted('</' + tag_check + '>',
                    tag_complete);
                content.push(inline_element);
                this.tag_type = 'INLINE';
            } else if (tag_check.charAt(0) === '!') { // comment, conditional comment, CDATA, ...
                if (tag_check.indexOf('[if') !== -1) { // <!--[if ... conditional comment
                    if (tag_complete.indexOf('!IE') !== -1) { // this type needs a closing -->
                        comment = this.get_unformatted('-->', tag_complete);
                        content.push(comment);
                    }
                    this.tag_type = 'START';
                } else if (tag_check.indexOf('[endif') !== -1) { // <![endif] end of conditional comment
                    this.tag_type = 'END';
                    this.unindent();
                } else if (tag_check.indexOf('[cdata[') !== -1) { // <![CDATA[ ... ]]>
                    comment = this.get_unformatted(']]>', tag_complete);
                    content.push(comment);
                    this.tag_type = 'SINGLE';
                } else { // ordinary comment (or anything else starting with '!')
                    comment = this.get_unformatted('-->', tag_complete);
                    content.push(comment);
                    this.tag_type = 'SINGLE';
                }
            } else {
                if (tag_check.charAt(0) === '/') { // closing tag, e.g. </p>
                    this.retrieve_tag(tag_check.substring(1));
                    this.tag_type = 'END';
                } else { // any other tag is a generic opening tag
                    this.record_tag(tag_check);
                    this.tag_type = 'START';
                }
                if (this.Utils.in_array(tag_check, this.Utils.extra_liners)) {
                    // Extra newline now; the driver adds the regular one, giving a blank line
                    this.print_newline(true, this.output);
                }
            }
            return [content.join(''), 'TK_TAG_' + this.tag_type];
        }; /* End of Function: get_tag() */

        /* Function: get_unformatted(delimiter, orig_tag)
           Reads input verbatim until `delimiter` has been consumed (matched
           case-insensitively, so "</B>" closes "<b>") or EOF. Line breaks are
           rewritten as '\n' + current indent and the source indentation that
           followed them is dropped. Returns '' immediately when `orig_tag`
           already contains the delimiter. */
        this.get_unformatted = function (delimiter, orig_tag) {
            var delimiter_lc = delimiter.toLowerCase();
            if (orig_tag && orig_tag.toLowerCase().indexOf(delimiter_lc) !== -1) {
                return '';
            }
            var input_char = '';
            var content = '';
            var space = true; // false right after a line break: following whitespace is skipped
            var scan_from = 0;

            do {
                if (this.pos >= this.input.length) {
                    return content;
                }
                // The delimiter was absent before this iteration, so a match can
                // only overlap the characters appended now; remember where to look.
                scan_from = Math.max(0, content.length - delimiter_lc.length + 1);

                input_char = this.input.charAt(this.pos);
                this.pos++;

                if (this.Utils.in_array(input_char, this.Utils.whitespace)) {
                    if (!space) {
                        // Indentation from the source after a line break: drop it.
                        // It was never added to line_char_count, so nothing to undo.
                        continue;
                    }
                    if (input_char === '\n' || input_char === '\r') {
                        content += '\n';
                        for (var i = 0; i < this.indent_level; i++) {
                            content += this.indent_string;
                        }
                        space = false;
                        this.line_char_count = 0;
                        continue;
                    }
                }
                content += input_char;
                this.line_char_count++;
                space = true;
            } while (content.substring(scan_from).toLowerCase().indexOf(delimiter_lc) === -1);
            return content;
        }; /* End of Function: get_unformatted(delimiter, orig_tag) */

        /* Function: get_token()
           Dispatches to the tokenizer for the current state. Script bodies take
           priority: the token right after a <script> tag is read by get_script(). */
        this.get_token = function () {
            if (this.last_token === 'TK_TAG_SCRIPT') {
                return this.get_script();
            }
            if (this.current_mode === 'CONTENT') {
                return this.get_content();
            }
            if (this.current_mode === 'TAG') {
                return this.get_tag();
            }
        }; /* End of Function: get_token() */

        /* Function: printer(js_source, indent_character, indent_size, max_char, brace_style)
           Stores the input and formatting options (applying defaults for falsy
           values), resets the output state and attaches the output helpers. */
        this.printer = function (js_source, indent_character,
            indent_size, max_char, brace_style) {
            this.input = js_source || '';
            this.output = [];
            this.indent_character = indent_character || ' ';
            this.indent_string = '';
            this.indent_size = indent_size || 2;
            this.brace_style = brace_style || 'collapse';
            this.indent_level = 0;
            this.max_char = max_char || 70; // maximum amount of characters per line
            this.line_char_count = 0; // characters emitted on the current line

            // Cache one indent unit: indent_character repeated indent_size times
            for (var i = 0; i < this.indent_size; i++) {
                this.indent_string += this.indent_character;
            }

            /* Function: print_newline(ignore, arr)
               Appends '\n' plus the current indentation to `arr` and resets
               line_char_count. Does nothing else when `arr` is empty. Unless
               `ignore` is true, trailing single-character whitespace entries are
               removed from `arr` first (so no blank lines pile up). */
            this.print_newline = function (ignore, arr) {
                this.line_char_count = 0;
                if (!arr || !arr.length) {
                    return;
                }
                if (!ignore) {
                    while (this.Utils.in_array(arr[arr.length - 1],
                            this.Utils.whitespace)) {
                        arr.pop();
                    }
                }
                arr.push('\n');
                for (var i = 0; i < this.indent_level; i++) {
                    arr.push(this.indent_string);
                }
            }; /* End of Function: print_newline(ignore, arr) */

            /* Function: print_token(text) - appends text to the output buffer */
            this.print_token = function (text) {
                this.output.push(text);
            }; /* End of Function: print_token(text) */

            this.indent = function () {
                this.indent_level++;
            }; /* End of Function: indent() */

            this.unindent = function () {
                if (this.indent_level > 0) {
                    this.indent_level--;
                }
            }; /* End of Function: unindent() */
        }; /* End of Function: printer(js_source, indent_character, indent_size, max_char, brace_style) */

        return this;
    } /* End of Function: Parser() */

    /*_____________________--------------------_____________________*/

    // Whether the previous token's source text ended in whitespace; consulted
    // when deciding if an inline tag may be moved to its own line.
    var last_token_had_trailing_whitespace = false;
    multi_parser = new Parser();
    /*
      html_source:      Original source to format.
      indent_character: Character to use as a single indent (e.g. space, tab, etc).
      indent_size:      Number of indent characters per indent level.
      max_char:         Max length of a line of output before wrapping.
      brace_style:      Passed through to js_beautify for script content.
      (Per the original author's note, style_html is called from CodeMagic.js.)
    */
    multi_parser.printer(html_source, indent_character, indent_size,
        max_char, brace_style); // initialize starting values

    /*
      Move through the source token by token until EOF (end of file) is reached.
    */
    do {
        var t = multi_parser.get_token();
        multi_parser.token_text = t[0];
        multi_parser.token_type = t[1];

        switch (multi_parser.token_type) {
        case 'TK_TAG_START':
        case 'TK_TAG_SCRIPT':
        case 'TK_TAG_STYLE':
            multi_parser.print_newline(false, multi_parser.output);
            multi_parser.print_token(multi_parser.token_text);
            multi_parser.indent(); // children are one level deeper
            multi_parser.current_mode = 'CONTENT';
            multi_parser.last_tag_token = multi_parser.token_type;
            break;
        case 'TK_TAG_END':
            multi_parser.print_newline(true, multi_parser.output);
            multi_parser.print_token(multi_parser.token_text);
            multi_parser.current_mode = 'CONTENT';
            multi_parser.last_tag_token = multi_parser.token_type;
            break;
        case 'TK_TAG_SINGLE':
            multi_parser.print_newline(false, multi_parser.output);
            multi_parser.print_token(multi_parser.token_text);
            multi_parser.current_mode = 'CONTENT';
            multi_parser.last_tag_token = multi_parser.token_type;
            break;
        case 'TK_TAG_INLINE':
            // TODO: apply indentation formatting to nested inline tags
            // (e.g. <em>italic <strong>and bold</strong></em>); the body is
            // currently emitted exactly as get_unformatted() captured it.
            //
            // Put the inline tag on a new line only when that cannot glue or
            // split visible text: either the preceding text ended in whitespace,
            // or there was no text at all and the preceding tag was not inline.
            if (
                (last_token_had_trailing_whitespace && (multi_parser.last_token === "TK_CONTENT")) ||
                (!multi_parser.last_text && (multi_parser.last_tag_token !== "TK_TAG_INLINE"))
            ) {
                multi_parser.print_newline(false, multi_parser.output);
            }
            multi_parser.print_token(multi_parser.token_text);
            multi_parser.current_mode = 'CONTENT';
            multi_parser.last_tag_token = multi_parser.token_type;
            break;
        case 'TK_EOF': // print anything read before EOF so unfinished input is not lost
        case 'TK_CONTENT':
            if (multi_parser.token_text !== '') {
                if (multi_parser.last_token === 'TK_TAG_INLINE') {
                    // After an inline element only break the line when the source
                    // had whitespace there; otherwise keep the text adjacent.
                    if (multi_parser.found_leading_whitespace) {
                        multi_parser.print_newline(false, multi_parser.output);
                    }
                } else {
                    multi_parser.print_newline(false, multi_parser.output);
                }
                multi_parser.print_token(multi_parser.token_text);
            }
            multi_parser.current_mode = 'TAG';
            break;
        }

        // Remember this token and reset the per-token flags for the next one.
        multi_parser.last_token = multi_parser.token_type;
        multi_parser.last_text = multi_parser.token_text;
        last_token_had_trailing_whitespace = multi_parser.found_trailing_whitespace;
        multi_parser.found_leading_whitespace = false;
        multi_parser.found_trailing_whitespace = false;
    } while (multi_parser.token_type !== 'TK_EOF');

    return multi_parser.output.join(''); // the fully formatted source
}