1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
|
<?php
// https://github.com/mihaeu/html-formatter
namespace Mihaeu;
class HtmlFormatter
{
    /**
     * Formats HTML by re-indenting the tags and removing unnecessary whitespace.
     *
     * Line breaks and tabs in the input are collapsed first. The markup is then split into
     * tag, comment and text tokens and every token is written on its own line, indented by
     * its nesting depth. A closing tag that directly follows its own opening tag stays on
     * the same line (e.g. "<p></p>"). Whitespace-only text between tags is dropped.
     *
     * @param string $html HTML string.
     * @param string $indentWith Characters that are being used for one level of indentation (default = a single space).
     * @param string $tagsWithoutIndentation Comma-separated list of HTML tags that should not be indented (default = html,link,img,meta).
     *                                       Entries are trimmed and compared case-insensitively.
     * @return string Re-indented HTML. The input is returned unchanged if the regular expression engine reports an error.
     */
    public static function format($html, $indentWith = ' ', $tagsWithoutIndentation = 'html,link,img,meta')
    {
        // replace newlines (CRLF and LF), followed by a non-whitespace character, with a space
        $flattened = preg_replace('/\\r?\\n([^\s])/', ' $1', $html);
        if ($flattened === null)
        {
            // formatting is cosmetic only, so hand back the input rather than an empty string
            return $html;
        }

        // remove all remaining line feeds and replace tabs with spaces
        $flattened = str_replace(["\n", "\r", "\t"], ['', '', ' '], $flattened);

        // comments are tried first so that a ">" inside a comment (e.g. commented-out markup)
        // does not end the token early
        $elements = preg_split('/(<!--.*-->|<.+>)/U', $flattened, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
        if ($elements === false)
        {
            return $html;
        }

        // whitespace-only text between tags would otherwise be emitted as a line of its own
        $tokens = array();
        foreach ($elements as $element)
        {
            if (trim($element) !== '')
            {
                $tokens[] = $element;
            }
        }

        $dom = self::parseDom($tokens);

        // loop-invariant: normalise the list of tags that must not change the indentation once
        $unindentedTags = array_map('strtolower', array_map('trim', explode(',', $tagsWithoutIndentation)));

        $indent = 0;
        $output = array();
        foreach ($dom as $index => $element)
        {
            $type = strtolower($element['type']);

            if ($element['opening'])
            {
                $output[] = "\n".str_repeat($indentWith, $indent).trim($element['content']);

                // make sure that only the elements who have not been blacklisted are being indented
                if ( ! in_array($type, $unindentedTags, true))
                {
                    ++$indent;
                }
            }
            else if ($element['closing'])
            {
                // mirror the opening branch: only tags that increased the indentation decrease it
                // again, and a stray closing tag must never push the level below zero
                if ( ! in_array($type, $unindentedTags, true))
                {
                    $indent = max(0, $indent - 1);
                }

                $lf = "\n".str_repeat($indentWith, $indent);

                // keep an empty element on one line, but only if the previous token really is
                // the matching opening tag and not e.g. an unclosed <img> or <link>
                if (isset($dom[$index - 1])
                    && $dom[$index - 1]['opening']
                    && strtolower($dom[$index - 1]['type']) === $type)
                {
                    $lf = '';
                }

                $output[] = $lf.trim($element['content']);
            }
            else if ($element['text'])
            {
                // text is not trimmed, only runs of spaces are collapsed into a single space
                $output[] = "\n".str_repeat($indentWith, $indent).preg_replace('/ [ \t]*/', ' ', $element['content']);
            }
            else if ($element['standalone'] || $element['comment'])
            {
                $output[] = "\n".str_repeat($indentWith, $indent).trim($element['content']);
            }
        }

        return trim(implode('', $output));
    }

    /**
     * Parses an array of HTML tokens and adds basic information about the type of
     * tag the token represents.
     *
     * Every returned entry has exactly one of the flags 'text', 'comment', 'closing',
     * 'opening' and 'standalone' set to true. 'content' is the untouched token and 'type'
     * is the tag name extracted from it; if no tag name can be extracted (text, comments,
     * tags such as "<br/>") 'type' is the token itself.
     *
     * @param Array $elements Array of HTML tokens (tags and text tokens).
     * @return Array HTML elements with extra information.
     */
    public static function parseDom(Array $elements)
    {
        $dom = array();
        foreach ($elements as $element)
        {
            $currentElement = trim($element);

            // only tokens that start with "<" are markup; everything else is text,
            // even if it happens to end with "/>"
            $isTag = strpos($currentElement, '<') === 0;

            // "<!" covers comments as well as declarations like <!DOCTYPE ...>
            $isComment = $isTag && strpos($currentElement, '<!') === 0;
            $isClosing = $isTag && ! $isComment && strpos($currentElement, '</') === 0;
            $isStandalone = $isTag && ! $isComment && ! $isClosing && substr($currentElement, -2) === '/>';
            $isOpening = $isTag && ! $isComment && ! $isClosing && ! $isStandalone;

            $dom[] = array(
                'text' => ! $isTag,
                'comment' => $isComment,
                'closing' => $isClosing,
                'opening' => $isOpening,
                'standalone' => $isStandalone,
                'content' => $element,
                'type' => preg_replace('/^<\/?(\w+)[ >].*$/U', '$1', $element)
            );
        }
        return $dom;
    }
}
|