summaryrefslogtreecommitdiff
path: root/lib/htmlawed
diff options
context:
space:
mode:
author polo <ordipolo@gmx.fr> 2022-02-17 18:13:00 +0100
committer polo <ordipolo@gmx.fr> 2022-02-17 18:13:00 +0100
commit 787d03e48471ba62cd830379428f04d996f0b74b (patch)
tree e9f98c7b9288c4530b50985688dd82622106ba2d /lib/htmlawed
parent 29df6f1362745eabf4fbcaedf309eb63795152fa (diff)
download melaine-787d03e48471ba62cd830379428f04d996f0b74b.zip
model update
Diffstat (limited to 'lib/htmlawed')
-rwxr-xr-x lib/htmlawed/htmLawed.php 1458
-rwxr-xr-x lib/htmlawed/htmLawedTest.php 1354
-rw-r--r-- lib/htmlawed/htmLawed_README.htm 4576
-rwxr-xr-x lib/htmlawed/htmLawed_README.txt 3634
-rwxr-xr-x lib/htmlawed/htmLawed_TESTCASE.txt 910
5 files changed, 5966 insertions, 5966 deletions
diff --git a/lib/htmlawed/htmLawed.php b/lib/htmlawed/htmLawed.php
index b384d98..a370d76 100755
--- a/lib/htmlawed/htmLawed.php
+++ b/lib/htmlawed/htmLawed.php
@@ -1,729 +1,729 @@
1<?php 1<?php
2 2
3/* 3/*
4htmLawed 1.2.5, 24 September 2019 4htmLawed 1.2.5, 24 September 2019
5Copyright Santosh Patnaik 5Copyright Santosh Patnaik
6Dual licensed with LGPL 3 and GPL 2+ 6Dual licensed with LGPL 3 and GPL 2+
7A PHP Labware internal utility - www.bioinformatics.org/phplabware/internal_utilities/htmLawed 7A PHP Labware internal utility - www.bioinformatics.org/phplabware/internal_utilities/htmLawed
8 8
9See htmLawed_README.txt/htm 9See htmLawed_README.txt/htm
10*/ 10*/
11 11
function htmLawed($t, $C=1, $S=array()){
/*
Main entry point: normalizes the configuration, then filters the markup in $t.

$t - input text/markup to process
$C - configuration array; any non-array value is treated as an empty array, so every setting takes its default
$S - attribute spec; used as-is when an array, otherwise passed to hl_spec() for parsing

Returns the processed text.

While running, the normalized $C and $S are published as $GLOBALS['C'] and $GLOBALS['S'] for the callback functions; any earlier values of those globals are put back before returning.
*/
$C = is_array($C) ? $C : array();
if(!empty($C['valid_xhtml'])){
 $C['elements'] = empty($C['elements']) ? '*-acronym-big-center-dir-font-isindex-s-strike-tt' : $C['elements'];
 $C['make_tag_strict'] = isset($C['make_tag_strict']) ? $C['make_tag_strict'] : 2;
 $C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 2;
}
// config eles
$e = array('a'=>1, 'abbr'=>1, 'acronym'=>1, 'address'=>1, 'applet'=>1, 'area'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'blockquote'=>1, 'br'=>1, 'button'=>1, 'canvas'=>1, 'caption'=>1, 'center'=>1, 'cite'=>1, 'code'=>1, 'col'=>1, 'colgroup'=>1, 'command'=>1, 'data'=>1, 'datalist'=>1, 'dd'=>1, 'del'=>1, 'details'=>1, 'dfn'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'dt'=>1, 'em'=>1, 'embed'=>1, 'fieldset'=>1, 'figcaption'=>1, 'figure'=>1, 'font'=>1, 'footer'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hgroup'=>1, 'hr'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'isindex'=>1, 'kbd'=>1, 'keygen'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'link'=>1, 'main'=>1, 'map'=>1, 'mark'=>1, 'menu'=>1, 'meta'=>1, 'meter'=>1, 'nav'=>1, 'noscript'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'output'=>1, 'p'=>1, 'param'=>1, 'pre'=>1, 'progress'=>1, 'q'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'script'=>1, 'section'=>1, 'select'=>1, 'small'=>1, 'source'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'style'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'time'=>1, 'tr'=>1, 'track'=>1, 'tt'=>1, 'u'=>1, 'ul'=>1, 'var'=>1, 'video'=>1, 'wbr'=>1); // 118 incl. deprecated & some Ruby

if(!empty($C['safe'])){
 unset($e['applet'], $e['audio'], $e['canvas'], $e['embed'], $e['iframe'], $e['object'], $e['script'], $e['video']);
}
// 'elements' is either '-*' (none), a comma list without '*' (exactly those), or '*' followed by +name/-name adjustments
$x = !empty($C['elements']) ? str_replace(array("\n", "\r", "\t", ' '), '', $C['elements']) : '*';
if($x == '-*'){$e = array();}
elseif(strpos($x, '*') === false){$e = array_flip(explode(',', $x));}
else{
 if(isset($x[1])){
  preg_match_all('`(?:^|-|\+)[^\-+]+?(?=-|\+|$)`', $x, $m, PREG_SET_ORDER);
  for($i=count($m); --$i>=0;){$m[$i] = $m[$i][0];}
  foreach($m as $v){
   if($v[0] == '+'){$e[substr($v, 1)] = 1;}
   // a '-name' removal is ignored when '+name' also appears in the list
   if($v[0] == '-' && isset($e[($v = substr($v, 1))]) && !in_array('+'. $v, $m)){unset($e[$v]);}
  }
 }
}
$C['elements'] =& $e;
// config attrs
$x = !empty($C['deny_attribute']) ? strtolower(str_replace(array("\n", "\r", "\t", ' '), '', $C['deny_attribute'])) : '';
// in the '*-a-b' form, 'data-' is swapped for '/' before splitting on '-' so that data-* names survive the split
$x = array_flip((isset($x[0]) && $x[0] == '*') ? str_replace('/', 'data-', explode('-', str_replace('data-', '/', $x))) : explode(',', $x. (!empty($C['safe']) ? ',on*' : '')));
$C['deny_attribute'] = $x;
// config URLs
$x = (isset($C['schemes'][2]) && strpos($C['schemes'], ':')) ? strtolower($C['schemes']) : 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet'. (empty($C['safe']) ? ', app, javascript; *: data, javascript, ' : '; *:'). 'file, http, https';
$C['schemes'] = array();
foreach(explode(';', trim(str_replace(array(' ', "\t", "\r", "\n"), '', $x), ';')) as $v){
 // A segment may lack ':' (e.g. an empty segment produced by ';;'); pad to two items so the
 // scheme-list lookup never reads a missing index. Such segments are skipped.
 $v = array_pad(explode(':', $v, 2), 2, null);
 if($v[1]){$C['schemes'][$v[0]] = array_flip(explode(',', $v[1]));}
}
if(!isset($C['schemes']['*'])){
 $C['schemes']['*'] = array('file'=>1, 'http'=>1, 'https'=>1);
 if(empty($C['safe'])){$C['schemes']['*'] += array('data'=>1, 'javascript'=>1);}
}
if(!empty($C['safe']) && empty($C['schemes']['style'])){$C['schemes']['style'] = array('!'=>1);}
$C['abs_url'] = isset($C['abs_url']) ? $C['abs_url'] : 0;
// without a well-formed base_url (scheme://host/[path/]) absolute-URL conversion is switched off
if(!isset($C['base_url']) or !preg_match('`^[a-zA-Z\d.+\-]+://[^/]+/(.+?/)?$`', $C['base_url'])){
 $C['base_url'] = $C['abs_url'] = 0;
}
// config rest
$C['and_mark'] = empty($C['and_mark']) ? 0 : 1;
$C['anti_link_spam'] = (isset($C['anti_link_spam']) && is_array($C['anti_link_spam']) && count($C['anti_link_spam']) == 2 && (empty($C['anti_link_spam'][0]) or hl_regex($C['anti_link_spam'][0])) && (empty($C['anti_link_spam'][1]) or hl_regex($C['anti_link_spam'][1]))) ? $C['anti_link_spam'] : 0;
$C['anti_mail_spam'] = isset($C['anti_mail_spam']) ? $C['anti_mail_spam'] : 0;
$C['balance'] = isset($C['balance']) ? (bool)$C['balance'] : 1;
$C['cdata'] = isset($C['cdata']) ? $C['cdata'] : (empty($C['safe']) ? 3 : 0);
$C['clean_ms_char'] = empty($C['clean_ms_char']) ? 0 : $C['clean_ms_char'];
$C['comment'] = isset($C['comment']) ? $C['comment'] : (empty($C['safe']) ? 3 : 0);
$C['css_expression'] = empty($C['css_expression']) ? 0 : 1;
$C['direct_list_nest'] = empty($C['direct_list_nest']) ? 0 : 1;
$C['hexdec_entity'] = isset($C['hexdec_entity']) ? $C['hexdec_entity'] : 1;
$C['hook'] = (!empty($C['hook']) && function_exists($C['hook'])) ? $C['hook'] : 0;
$C['hook_tag'] = (!empty($C['hook_tag']) && function_exists($C['hook_tag'])) ? $C['hook_tag'] : 0;
$C['keep_bad'] = isset($C['keep_bad']) ? $C['keep_bad'] : 6;
$C['lc_std_val'] = isset($C['lc_std_val']) ? (bool)$C['lc_std_val'] : 1;
$C['make_tag_strict'] = isset($C['make_tag_strict']) ? $C['make_tag_strict'] : 1;
$C['named_entity'] = isset($C['named_entity']) ? (bool)$C['named_entity'] : 1;
$C['no_deprecated_attr'] = isset($C['no_deprecated_attr']) ? $C['no_deprecated_attr'] : 1;
$C['parent'] = isset($C['parent'][0]) ? strtolower($C['parent']) : 'body';
$C['show_setting'] = !empty($C['show_setting']) ? $C['show_setting'] : 0;
$C['style_pass'] = empty($C['style_pass']) ? 0 : 1;
$C['tidy'] = empty($C['tidy']) ? 0 : $C['tidy'];
$C['unique_ids'] = isset($C['unique_ids']) && (!preg_match('`\W`', $C['unique_ids'])) ? $C['unique_ids'] : 1;
$C['xml:lang'] = isset($C['xml:lang']) ? $C['xml:lang'] : 0;

// publish config & spec as globals for the callbacks, remembering any earlier values for restoration at the end
if(isset($GLOBALS['C'])){$reC = $GLOBALS['C'];}
$GLOBALS['C'] = $C;
$S = is_array($S) ? $S : hl_spec($S);
if(isset($GLOBALS['S'])){$reS = $GLOBALS['S'];}
$GLOBALS['S'] = $S;

// strip control characters other than tab, LF & CR
$t = preg_replace('`[\x00-\x08\x0b-\x0c\x0e-\x1f]`', '', $t);
if($C['clean_ms_char']){
 $x = array("\x7f"=>'', "\x80"=>'&#8364;', "\x81"=>'', "\x83"=>'&#402;', "\x85"=>'&#8230;', "\x86"=>'&#8224;', "\x87"=>'&#8225;', "\x88"=>'&#710;', "\x89"=>'&#8240;', "\x8a"=>'&#352;', "\x8b"=>'&#8249;', "\x8c"=>'&#338;', "\x8d"=>'', "\x8e"=>'&#381;', "\x8f"=>'', "\x90"=>'', "\x95"=>'&#8226;', "\x96"=>'&#8211;', "\x97"=>'&#8212;', "\x98"=>'&#732;', "\x99"=>'&#8482;', "\x9a"=>'&#353;', "\x9b"=>'&#8250;', "\x9c"=>'&#339;', "\x9d"=>'', "\x9e"=>'&#382;', "\x9f"=>'&#376;');
 // setting 1 maps the quote bytes to numeric entities; any other non-empty setting maps them to plain ' and "
 $x = $x + ($C['clean_ms_char'] == 1 ? array("\x82"=>'&#8218;', "\x84"=>'&#8222;', "\x91"=>'&#8216;', "\x92"=>'&#8217;', "\x93"=>'&#8220;', "\x94"=>'&#8221;') : array("\x82"=>'\'', "\x84"=>'"', "\x91"=>'\'', "\x92"=>'\'', "\x93"=>'"', "\x94"=>'"'));
 $t = strtr($t, $x);
}
if($C['cdata'] or $C['comment']){$t = preg_replace_callback('`<!(?:(?:--.*?--)|(?:\[CDATA\[.*?\]\]))>`sm', 'hl_cmtcd', $t);}
// escape every '&', then let hl_ent decide about the sequences that look like entities
$t = preg_replace_callback('`&amp;([a-zA-Z][a-zA-Z0-9]{1,30}|#(?:[0-9]{1,8}|[Xx][0-9A-Fa-f]{1,7}));`', 'hl_ent', str_replace('&', '&amp;', $t));
if($C['unique_ids'] && !isset($GLOBALS['hl_Ids'])){$GLOBALS['hl_Ids'] = array();}
if($C['hook']){$t = $C['hook']($t, $C, $S);}
if($C['show_setting'] && preg_match('`^[a-z][a-z0-9_]*$`i', $C['show_setting'])){
 $GLOBALS[$C['show_setting']] = array('config'=>$C, 'spec'=>$S, 'time'=>microtime());
}
// main
$t = preg_replace_callback('`<(?:(?:\s|$)|(?:[^>]*(?:>|$)))|>`m', 'hl_tag', $t);
$t = $C['balance'] ? hl_bal($t, $C['keep_bad'], $C['parent']) : $t;
// bytes \x01-\x05 are placeholders; after processing, \x01 & \x02 are dropped and \x03-\x05 become & < >
$t = (($C['cdata'] or $C['comment']) && strpos($t, "\x01") !== false) ? str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05"), array('', '', '&', '<', '>'), $t) : $t;
$t = $C['tidy'] ? hl_tidy($t, $C['tidy'], $C['parent']) : $t;
unset($C, $e);
if(isset($reC)){$GLOBALS['C'] = $reC;}
if(isset($reS)){$GLOBALS['S'] = $reS;}
return $t;
}
113 113
function hl_attrval($a, $t, $p){
/*
Check an attribute value against the rules given for it.

$a - attribute name; accesskey, class, itemtype & rel values are split on spaces and srcset on commas, with each piece checked separately
$t - attribute value
$p - rules: maxlen, minlen, maxval, minval, match, nomatch, oneof, noneof; optional 'default' is the fallback value

Returns the pieces that pass every rule, re-joined (' ' or ', '); if nothing is left, returns $p['default'] when set, else 0.
*/
static $ma = array('accesskey', 'class', 'itemtype', 'rel');
$s = in_array($a, $ma, true) ? ' ' : ($a == 'srcset' ? ',': '');
$r = array();
$t = !empty($s) ? explode($s, $t) : array($t);
foreach($t as $tk=>$tv){
 $o = 1; $tv = trim($tv); $l = strlen($tv);
 foreach($p as $k=>$v){
  if(!$l){continue;} // empty piece: no rule is applied
  switch($k){
   case 'maxlen': if($l > $v){$o = 0;}
   break; case 'minlen': if($l < $v){$o = 0;}
   break; case 'maxval': if((float)($tv) > $v){$o = 0;}
   break; case 'minval': if((float)($tv) < $v){$o = 0;}
   break; case 'match': if(!preg_match($v, $tv)){$o = 0;}
   break; case 'nomatch': if(preg_match($v, $tv)){$o = 0;}
   break; case 'oneof':
    // strict string comparison: with loose ==, numeric strings compare by value, so '1e3' would pass for '1000'
    $o = in_array($tv, explode('|', $v), true) ? 1 : 0;
   break; case 'noneof':
    $o = in_array($tv, explode('|', $v), true) ? 0 : 1;
   break; default:
   break;
  }
  if(!$o){break;}
 }
 if($o){$r[] = $tv;}
}
if($s == ','){$s = ', ';}
$r = implode($s, $r);
return (isset($r[0]) ? $r : (isset($p['default']) ? $p['default'] : 0));
}
150 150
151function hl_bal($t, $do=1, $in='div'){ 151function hl_bal($t, $do=1, $in='div'){
152// balance tags 152// balance tags
153// by content 153// by content
154$cB = array('blockquote'=>1, 'form'=>1, 'map'=>1, 'noscript'=>1); // Block 154$cB = array('blockquote'=>1, 'form'=>1, 'map'=>1, 'noscript'=>1); // Block
155$cE = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); // Empty 155$cE = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); // Empty
156$cF = array('a'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'button'=>1, 'canvas'=>1, 'del'=>1, 'details'=>1, 'div'=>1, 'dd'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'header'=>1, 'iframe'=>1, 'ins'=>1, 'li'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'object'=>1, 'section'=>1, 'style'=>1, 'td'=>1, 'th'=>1, 'video'=>1); // Flow; later context-wise dynamic move of ins & del to $cI 156$cF = array('a'=>1, 'article'=>1, 'aside'=>1, 'audio'=>1, 'button'=>1, 'canvas'=>1, 'del'=>1, 'details'=>1, 'div'=>1, 'dd'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'header'=>1, 'iframe'=>1, 'ins'=>1, 'li'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'object'=>1, 'section'=>1, 'style'=>1, 'td'=>1, 'th'=>1, 'video'=>1); // Flow; later context-wise dynamic move of ins & del to $cI
157$cI = array('abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'caption'=>1, 'cite'=>1, 'code'=>1, 'data'=>1, 'datalist'=>1, 'dfn'=>1, 'dt'=>1, 'em'=>1, 'figcaption'=>1, 'font'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hgroup'=>1, 'i'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'mark'=>1, 'meter'=>1, 'output'=>1, 'p'=>1, 'pre'=>1, 'progress'=>1, 'q'=>1, 'rb'=>1, 'rt'=>1, 's'=>1, 'samp'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); // Inline 157$cI = array('abbr'=>1, 'acronym'=>1, 'address'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'caption'=>1, 'cite'=>1, 'code'=>1, 'data'=>1, 'datalist'=>1, 'dfn'=>1, 'dt'=>1, 'em'=>1, 'figcaption'=>1, 'font'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hgroup'=>1, 'i'=>1, 'kbd'=>1, 'label'=>1, 'legend'=>1, 'mark'=>1, 'meter'=>1, 'output'=>1, 'p'=>1, 'pre'=>1, 'progress'=>1, 'q'=>1, 'rb'=>1, 'rt'=>1, 's'=>1, 'samp'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1); // Inline
158$cN = array('a'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'address'=>array('address'=>1, 'article'=>1, 'aside'=>1, 'header'=>1, 'keygen'=>1, 'footer'=>1, 'nav'=>1, 'section'=>1), 'button'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'fieldset'=>array('fieldset'=>1), 'footer'=>array('header'=>1, 'footer'=>1), 'form'=>array('form'=>1), 'header'=>array('header'=>1, 'footer'=>1), 'label'=>array('label'=>1), 'main'=>array('main'=>1), 'meter'=>array('meter'=>1), 'noscript'=>array('script'=>1), 'pre'=>array('big'=>1, 'font'=>1, 'img'=>1, 'object'=>1, 'script'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1), 'progress'=>array('progress'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1), 'time'=>array('time'=>1), ); // Illegal 158$cN = array('a'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'address'=>array('address'=>1, 'article'=>1, 'aside'=>1, 'header'=>1, 'keygen'=>1, 'footer'=>1, 'nav'=>1, 'section'=>1), 'button'=>array('a'=>1, 'address'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'select'=>1, 'textarea'=>1), 'fieldset'=>array('fieldset'=>1), 'footer'=>array('header'=>1, 'footer'=>1), 'form'=>array('form'=>1), 'header'=>array('header'=>1, 'footer'=>1), 'label'=>array('label'=>1), 'main'=>array('main'=>1), 'meter'=>array('meter'=>1), 'noscript'=>array('script'=>1), 'pre'=>array('big'=>1, 'font'=>1, 'img'=>1, 'object'=>1, 'script'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1), 'progress'=>array('progress'=>1), 'rb'=>array('ruby'=>1), 'rt'=>array('ruby'=>1), 'time'=>array('time'=>1), ); // Illegal
159$cN2 = array_keys($cN); 159$cN2 = array_keys($cN);
160$cS = array('colgroup'=>array('col'=>1), 'datalist'=>array('option'=>1), 'dir'=>array('li'=>1), 'dl'=>array('dd'=>1, 'dt'=>1), 'hgroup'=>array('h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child 160$cS = array('colgroup'=>array('col'=>1), 'datalist'=>array('option'=>1), 'dir'=>array('li'=>1), 'dl'=>array('dd'=>1, 'dt'=>1), 'hgroup'=>array('h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1), 'menu'=>array('li'=>1), 'ol'=>array('li'=>1), 'optgroup'=>array('option'=>1), 'option'=>array('#pcdata'=>1), 'rbc'=>array('rb'=>1), 'rp'=>array('#pcdata'=>1), 'rtc'=>array('rt'=>1), 'ruby'=>array('rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1), 'select'=>array('optgroup'=>1, 'option'=>1), 'script'=>array('#pcdata'=>1), 'table'=>array('caption'=>1, 'col'=>1, 'colgroup'=>1, 'tfoot'=>1, 'tbody'=>1, 'tr'=>1, 'thead'=>1), 'tbody'=>array('tr'=>1), 'tfoot'=>array('tr'=>1), 'textarea'=>array('#pcdata'=>1), 'thead'=>array('tr'=>1), 'tr'=>array('td'=>1, 'th'=>1), 'ul'=>array('li'=>1)); // Specific - immediate parent-child
161if($GLOBALS['C']['direct_list_nest']){$cS['ol'] = $cS['ul'] = $cS['menu'] += array('menu'=>1, 'ol'=>1, 'ul'=>1);} 161if($GLOBALS['C']['direct_list_nest']){$cS['ol'] = $cS['ul'] = $cS['menu'] += array('menu'=>1, 'ol'=>1, 'ul'=>1);}
162$cO = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'audio'=>array('source'=>1, 'track'=>1), 'blockquote'=>array('script'=>1), 'details'=>array('summary'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 'figure'=>array('figcaption'=>1),'form'=>array('script'=>1), 'map'=>array('area'=>1), 'object'=>array('param'=>1, 'embed'=>1), 'video'=>array('source'=>1, 'track'=>1)); // Other 162$cO = array('address'=>array('p'=>1), 'applet'=>array('param'=>1), 'audio'=>array('source'=>1, 'track'=>1), 'blockquote'=>array('script'=>1), 'details'=>array('summary'=>1), 'fieldset'=>array('legend'=>1, '#pcdata'=>1), 'figure'=>array('figcaption'=>1),'form'=>array('script'=>1), 'map'=>array('area'=>1), 'object'=>array('param'=>1, 'embed'=>1), 'video'=>array('source'=>1, 'track'=>1)); // Other
163$cT = array('colgroup'=>1, 'dd'=>1, 'dt'=>1, 'li'=>1, 'option'=>1, 'p'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1); // Omitable closing 163$cT = array('colgroup'=>1, 'dd'=>1, 'dt'=>1, 'li'=>1, 'option'=>1, 'p'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1); // Omitable closing
164// block/inline type; a/ins/del both type; #pcdata: text 164// block/inline type; a/ins/del both type; #pcdata: text
165$eB = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'details'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hr'=>1, 'isindex'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'section'=>1, 'style'=>1, 'table'=>1, 'ul'=>1); 165$eB = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'blockquote'=>1, 'center'=>1, 'del'=>1, 'details'=>1, 'dir'=>1, 'dl'=>1, 'div'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'form'=>1, 'ins'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'header'=>1, 'hr'=>1, 'isindex'=>1, 'main'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'section'=>1, 'style'=>1, 'table'=>1, 'ul'=>1);
166$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'audio'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'canvas'=>1, 'cite'=>1, 'code'=>1, 'command'=>1, 'data'=>1, 'datalist'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'figcaption'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'link'=>1, 'map'=>1, 'mark'=>1, 'meta'=>1, 'meter'=>1, 'object'=>1, 'output'=>1, 'progress'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'textarea'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1, 'video'=>1, 'wbr'=>1); 166$eI = array('#pcdata'=>1, 'a'=>1, 'abbr'=>1, 'acronym'=>1, 'applet'=>1, 'audio'=>1, 'b'=>1, 'bdi'=>1, 'bdo'=>1, 'big'=>1, 'br'=>1, 'button'=>1, 'canvas'=>1, 'cite'=>1, 'code'=>1, 'command'=>1, 'data'=>1, 'datalist'=>1, 'del'=>1, 'dfn'=>1, 'em'=>1, 'embed'=>1, 'figcaption'=>1, 'font'=>1, 'i'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'ins'=>1, 'kbd'=>1, 'label'=>1, 'link'=>1, 'map'=>1, 'mark'=>1, 'meta'=>1, 'meter'=>1, 'object'=>1, 'output'=>1, 'progress'=>1, 'q'=>1, 'ruby'=>1, 's'=>1, 'samp'=>1, 'select'=>1, 'script'=>1, 'small'=>1, 'span'=>1, 'strike'=>1, 'strong'=>1, 'sub'=>1, 'summary'=>1, 'sup'=>1, 'textarea'=>1, 'time'=>1, 'tt'=>1, 'u'=>1, 'var'=>1, 'video'=>1, 'wbr'=>1);
167$eN = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'big'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'fieldset'=>1, 'font'=>1, 'footer'=>1, 'form'=>1, 'header'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'meter'=>1, 'nav'=>1, 'object'=>1, 'progress'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'time'=>1); // Exclude from specific ele; $cN values 167$eN = array('a'=>1, 'address'=>1, 'article'=>1, 'aside'=>1, 'big'=>1, 'button'=>1, 'details'=>1, 'embed'=>1, 'fieldset'=>1, 'font'=>1, 'footer'=>1, 'form'=>1, 'header'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'meter'=>1, 'nav'=>1, 'object'=>1, 'progress'=>1, 'ruby'=>1, 'script'=>1, 'select'=>1, 'small'=>1, 'sub'=>1, 'sup'=>1, 'textarea'=>1, 'time'=>1); // Exclude from specific ele; $cN values
168$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'command'=>1, 'dd'=>1, 'dt'=>1, 'hgroup'=>1, 'keygen'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'source'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1, 'track'=>1); // Missing in $eB & $eI 168$eO = array('area'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'command'=>1, 'dd'=>1, 'dt'=>1, 'hgroup'=>1, 'keygen'=>1, 'legend'=>1, 'li'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'rb'=>1, 'rbc'=>1, 'rp'=>1, 'rt'=>1, 'rtc'=>1, 'script'=>1, 'source'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'thead'=>1, 'th'=>1, 'tr'=>1, 'track'=>1); // Missing in $eB & $eI
169$eF = $eB + $eI; 169$eF = $eB + $eI;
170 170
171// $in sets allowed child 171// $in sets allowed child
172$in = ((isset($eF[$in]) && $in != '#pcdata') or isset($eO[$in])) ? $in : 'div'; 172$in = ((isset($eF[$in]) && $in != '#pcdata') or isset($eO[$in])) ? $in : 'div';
173if(isset($cE[$in])){ 173if(isset($cE[$in])){
174 return (!$do ? '' : str_replace(array('<', '>'), array('&lt;', '&gt;'), $t)); 174 return (!$do ? '' : str_replace(array('<', '>'), array('&lt;', '&gt;'), $t));
175} 175}
176if(isset($cS[$in])){$inOk = $cS[$in];} 176if(isset($cS[$in])){$inOk = $cS[$in];}
177elseif(isset($cI[$in])){$inOk = $eI; $cI['del'] = 1; $cI['ins'] = 1;} 177elseif(isset($cI[$in])){$inOk = $eI; $cI['del'] = 1; $cI['ins'] = 1;}
178elseif(isset($cF[$in])){$inOk = $eF; unset($cI['del'], $cI['ins']);} 178elseif(isset($cF[$in])){$inOk = $eF; unset($cI['del'], $cI['ins']);}
179elseif(isset($cB[$in])){$inOk = $eB; unset($cI['del'], $cI['ins']);} 179elseif(isset($cB[$in])){$inOk = $eB; unset($cI['del'], $cI['ins']);}
180if(isset($cO[$in])){$inOk = $inOk + $cO[$in];} 180if(isset($cO[$in])){$inOk = $inOk + $cO[$in];}
181if(isset($cN[$in])){$inOk = array_diff_assoc($inOk, $cN[$in]);} 181if(isset($cN[$in])){$inOk = array_diff_assoc($inOk, $cN[$in]);}
182 182
183$t = explode('<', $t); 183$t = explode('<', $t);
184$ok = $q = array(); // $q seq list of open non-empty ele 184$ok = $q = array(); // $q seq list of open non-empty ele
185ob_start(); 185ob_start();
186 186
187for($i=-1, $ci=count($t); ++$i<$ci;){ 187for($i=-1, $ci=count($t); ++$i<$ci;){
188 // allowed $ok in parent $p 188 // allowed $ok in parent $p
189 if($ql = count($q)){ 189 if($ql = count($q)){
190 $p = array_pop($q); 190 $p = array_pop($q);
191 $q[] = $p; 191 $q[] = $p;
192 if(isset($cS[$p])){$ok = $cS[$p];} 192 if(isset($cS[$p])){$ok = $cS[$p];}
193 elseif(isset($cI[$p])){$ok = $eI; $cI['del'] = 1; $cI['ins'] = 1;} 193 elseif(isset($cI[$p])){$ok = $eI; $cI['del'] = 1; $cI['ins'] = 1;}
194 elseif(isset($cF[$p])){$ok = $eF; unset($cI['del'], $cI['ins']);} 194 elseif(isset($cF[$p])){$ok = $eF; unset($cI['del'], $cI['ins']);}
195 elseif(isset($cB[$p])){$ok = $eB; unset($cI['del'], $cI['ins']);} 195 elseif(isset($cB[$p])){$ok = $eB; unset($cI['del'], $cI['ins']);}
196 if(isset($cO[$p])){$ok = $ok + $cO[$p];} 196 if(isset($cO[$p])){$ok = $ok + $cO[$p];}
197 if(isset($cN[$p])){$ok = array_diff_assoc($ok, $cN[$p]);} 197 if(isset($cN[$p])){$ok = array_diff_assoc($ok, $cN[$p]);}
198 }else{$ok = $inOk; unset($cI['del'], $cI['ins']);} 198 }else{$ok = $inOk; unset($cI['del'], $cI['ins']);}
199 // bad tags, & ele content 199 // bad tags, & ele content
200 if(isset($e) && ($do == 1 or (isset($ok['#pcdata']) && ($do == 3 or $do == 5)))){ 200 if(isset($e) && ($do == 1 or (isset($ok['#pcdata']) && ($do == 3 or $do == 5)))){
201 echo '&lt;', $s, $e, $a, '&gt;'; 201 echo '&lt;', $s, $e, $a, '&gt;';
202 } 202 }
203 if(isset($x[0])){ 203 if(isset($x[0])){
204 if(strlen(trim($x)) && (($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql))){ 204 if(strlen(trim($x)) && (($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql))){
205 echo '<div>', $x, '</div>'; 205 echo '<div>', $x, '</div>';
206 } 206 }
207 elseif($do < 3 or isset($ok['#pcdata'])){echo $x;} 207 elseif($do < 3 or isset($ok['#pcdata'])){echo $x;}
208 elseif(strpos($x, "\x02\x04")){ 208 elseif(strpos($x, "\x02\x04")){
209 foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){ 209 foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){
210 echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : '')); 210 echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : ''));
211 } 211 }
212 }elseif($do > 4){echo preg_replace('`\S`', '', $x);} 212 }elseif($do > 4){echo preg_replace('`\S`', '', $x);}
213 } 213 }
214 // get markup 214 // get markup
215 if(!preg_match('`^(/?)([a-z1-6]+)([^>]*)>(.*)`sm', $t[$i], $r)){$x = $t[$i]; continue;} 215 if(!preg_match('`^(/?)([a-z1-6]+)([^>]*)>(.*)`sm', $t[$i], $r)){$x = $t[$i]; continue;}
216 $s = null; $e = null; $a = null; $x = null; list($all, $s, $e, $a, $x) = $r; 216 $s = null; $e = null; $a = null; $x = null; list($all, $s, $e, $a, $x) = $r;
217 // close tag 217 // close tag
218 if($s){ 218 if($s){
219 if(isset($cE[$e]) or !in_array($e, $q)){continue;} // Empty/unopen 219 if(isset($cE[$e]) or !in_array($e, $q)){continue;} // Empty/unopen
220 if($p == $e){array_pop($q); echo '</', $e, '>'; unset($e); continue;} // Last open 220 if($p == $e){array_pop($q); echo '</', $e, '>'; unset($e); continue;} // Last open
221 $add = ''; // Nesting - close open tags that need to be 221 $add = ''; // Nesting - close open tags that need to be
222 for($j=-1, $cj=count($q); ++$j<$cj;){ 222 for($j=-1, $cj=count($q); ++$j<$cj;){
223 if(($d = array_pop($q)) == $e){break;} 223 if(($d = array_pop($q)) == $e){break;}
224 else{$add .= "</{$d}>";} 224 else{$add .= "</{$d}>";}
225 } 225 }
226 echo $add, '</', $e, '>'; unset($e); continue; 226 echo $add, '</', $e, '>'; unset($e); continue;
227 } 227 }
228 // open tag 228 // open tag
229 // $cB ele needs $eB ele as child 229 // $cB ele needs $eB ele as child
230 if(isset($cB[$e]) && strlen(trim($x))){ 230 if(isset($cB[$e]) && strlen(trim($x))){
231 $t[$i] = "{$e}{$a}>"; 231 $t[$i] = "{$e}{$a}>";
232 array_splice($t, $i+1, 0, 'div>'. $x); unset($e, $x); ++$ci; --$i; continue; 232 array_splice($t, $i+1, 0, 'div>'. $x); unset($e, $x); ++$ci; --$i; continue;
233 } 233 }
234 if((($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql)) && !isset($eB[$e]) && !isset($ok[$e])){ 234 if((($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql)) && !isset($eB[$e]) && !isset($ok[$e])){
235 array_splice($t, $i, 0, 'div>'); unset($e, $x); ++$ci; --$i; continue; 235 array_splice($t, $i, 0, 'div>'); unset($e, $x); ++$ci; --$i; continue;
236 } 236 }
237 // if no open ele, $in = parent; mostly immediate parent-child relation should hold 237 // if no open ele, $in = parent; mostly immediate parent-child relation should hold
238 if(!$ql or !isset($eN[$e]) or !array_intersect($q, $cN2)){ 238 if(!$ql or !isset($eN[$e]) or !array_intersect($q, $cN2)){
239 if(!isset($ok[$e])){ 239 if(!isset($ok[$e])){
240 if($ql && isset($cT[$p])){echo '</', array_pop($q), '>'; unset($e, $x); --$i;} 240 if($ql && isset($cT[$p])){echo '</', array_pop($q), '>'; unset($e, $x); --$i;}
241 continue; 241 continue;
242 } 242 }
243 if(!isset($cE[$e])){$q[] = $e;} 243 if(!isset($cE[$e])){$q[] = $e;}
244 echo '<', $e, $a, '>'; unset($e); continue; 244 echo '<', $e, $a, '>'; unset($e); continue;
245 } 245 }
246 // specific parent-child 246 // specific parent-child
247 if(isset($cS[$p][$e])){ 247 if(isset($cS[$p][$e])){
248 if(!isset($cE[$e])){$q[] = $e;} 248 if(!isset($cE[$e])){$q[] = $e;}
249 echo '<', $e, $a, '>'; unset($e); continue; 249 echo '<', $e, $a, '>'; unset($e); continue;
250 } 250 }
251 // nesting 251 // nesting
252 $add = ''; 252 $add = '';
253 $q2 = array(); 253 $q2 = array();
254 for($k=-1, $kc=count($q); ++$k<$kc;){ 254 for($k=-1, $kc=count($q); ++$k<$kc;){
255 $d = $q[$k]; 255 $d = $q[$k];
256 $ok2 = array(); 256 $ok2 = array();
257 if(isset($cS[$d])){$q2[] = $d; continue;} 257 if(isset($cS[$d])){$q2[] = $d; continue;}
258 $ok2 = isset($cI[$d]) ? $eI : $eF; 258 $ok2 = isset($cI[$d]) ? $eI : $eF;
259 if(isset($cO[$d])){$ok2 = $ok2 + $cO[$d];} 259 if(isset($cO[$d])){$ok2 = $ok2 + $cO[$d];}
260 if(isset($cN[$d])){$ok2 = array_diff_assoc($ok2, $cN[$d]);} 260 if(isset($cN[$d])){$ok2 = array_diff_assoc($ok2, $cN[$d]);}
261 if(!isset($ok2[$e])){ 261 if(!isset($ok2[$e])){
262 if(!$k && !isset($inOk[$e])){continue 2;} 262 if(!$k && !isset($inOk[$e])){continue 2;}
263 $add = "</{$d}>"; 263 $add = "</{$d}>";
264 for(;++$k<$kc;){$add = "</{$q[$k]}>{$add}";} 264 for(;++$k<$kc;){$add = "</{$q[$k]}>{$add}";}
265 break; 265 break;
266 } 266 }
267 else{$q2[] = $d;} 267 else{$q2[] = $d;}
268 } 268 }
269 $q = $q2; 269 $q = $q2;
270 if(!isset($cE[$e])){$q[] = $e;} 270 if(!isset($cE[$e])){$q[] = $e;}
271 echo $add, '<', $e, $a, '>'; unset($e); continue; 271 echo $add, '<', $e, $a, '>'; unset($e); continue;
272} 272}
273 273
274// end 274// end
275if($ql = count($q)){ 275if($ql = count($q)){
276 $p = array_pop($q); 276 $p = array_pop($q);
277 $q[] = $p; 277 $q[] = $p;
278 if(isset($cS[$p])){$ok = $cS[$p];} 278 if(isset($cS[$p])){$ok = $cS[$p];}
279 elseif(isset($cI[$p])){$ok = $eI; $cI['del'] = 1; $cI['ins'] = 1;} 279 elseif(isset($cI[$p])){$ok = $eI; $cI['del'] = 1; $cI['ins'] = 1;}
280 elseif(isset($cF[$p])){$ok = $eF; unset($cI['del'], $cI['ins']);} 280 elseif(isset($cF[$p])){$ok = $eF; unset($cI['del'], $cI['ins']);}
281 elseif(isset($cB[$p])){$ok = $eB; unset($cI['del'], $cI['ins']);} 281 elseif(isset($cB[$p])){$ok = $eB; unset($cI['del'], $cI['ins']);}
282 if(isset($cO[$p])){$ok = $ok + $cO[$p];} 282 if(isset($cO[$p])){$ok = $ok + $cO[$p];}
283 if(isset($cN[$p])){$ok = array_diff_assoc($ok, $cN[$p]);} 283 if(isset($cN[$p])){$ok = array_diff_assoc($ok, $cN[$p]);}
284}else{$ok = $inOk; unset($cI['del'], $cI['ins']);} 284}else{$ok = $inOk; unset($cI['del'], $cI['ins']);}
285if(isset($e) && ($do == 1 or (isset($ok['#pcdata']) && ($do == 3 or $do == 5)))){ 285if(isset($e) && ($do == 1 or (isset($ok['#pcdata']) && ($do == 3 or $do == 5)))){
286 echo '&lt;', $s, $e, $a, '&gt;'; 286 echo '&lt;', $s, $e, $a, '&gt;';
287} 287}
288if(isset($x[0])){ 288if(isset($x[0])){
289 if(strlen(trim($x)) && (($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql))){ 289 if(strlen(trim($x)) && (($ql && isset($cB[$p])) or (isset($cB[$in]) && !$ql))){
290 echo '<div>', $x, '</div>'; 290 echo '<div>', $x, '</div>';
291 } 291 }
292 elseif($do < 3 or isset($ok['#pcdata'])){echo $x;} 292 elseif($do < 3 or isset($ok['#pcdata'])){echo $x;}
293 elseif(strpos($x, "\x02\x04")){ 293 elseif(strpos($x, "\x02\x04")){
294 foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){ 294 foreach(preg_split('`(\x01\x02[^\x01\x02]+\x02\x01)`', $x, -1, PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_NO_EMPTY) as $v){
295 echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : '')); 295 echo (substr($v, 0, 2) == "\x01\x02" ? $v : ($do > 4 ? preg_replace('`\S`', '', $v) : ''));
296 } 296 }
297 }elseif($do > 4){echo preg_replace('`\S`', '', $x);} 297 }elseif($do > 4){echo preg_replace('`\S`', '', $x);}
298} 298}
299while(!empty($q) && ($e = array_pop($q))){echo '</', $e, '>';} 299while(!empty($q) && ($e = array_pop($q))){echo '</', $e, '>';}
300$o = ob_get_contents(); 300$o = ob_get_contents();
301ob_end_clean(); 301ob_end_clean();
302return $o; 302return $o;
303} 303}
304 304
function hl_cmtcd($t){
// Callback for HTML comments and CDATA sections.
// $t is a match array; $t[0] holds the complete '<!--...-->' or '<![CDATA[...]]>' markup.
// The handling mode is read from the global config: $C['comment'] or $C['cdata'].
//   0 (falsy) - the markup is returned untouched
//   1         - the markup is removed (empty string returned)
//   2         - kept, with &, < and > inside converted to entities
//   other     - kept as is
// Kept markup is returned wrapped in \x01\x02...\x02\x01 (comment) or
// \x01\x01...\x01\x01 (CDATA) markers, with every &, < and > replaced by the
// placeholder bytes \x03, \x04 and \x05.
global $C;
$markup = $t[0];
// The 4th character is '-' for '<!--' and '[' for '<![CDATA['
$kind = ($markup[3] == '-') ? 'comment' : 'cdata';
$mode = $C[$kind];
if(!$mode){return $markup;}
if($mode == 1){return '';}
if($kind == 'comment' && $mode < 4){
 // Drop the '<!--' and '-->' delimiters, collapse runs of hyphens, ensure a trailing space
 $body = preg_replace('`--+`', '-', substr($markup, 4, -3));
 if(substr($body, -1) != ' '){$body .= ' ';}
}else{
 // Only the outer '<' and '>' are dropped
 $body = substr($markup, 1, -1);
}
if($mode == 2){
 $body = str_replace(array('&', '<', '>'), array('&amp;', '&lt;', '&gt;'), $body);
}
if($kind == 'comment'){
 $wrapped = "\x01\x02\x04!--{$body}--\x05\x02\x01";
}else{
 $wrapped = "\x01\x01\x04{$body}\x05\x01\x01";
}
return str_replace(array('&', '<', '>'), array("\x03", "\x04", "\x05"), $wrapped);
}
318 318
function hl_ent($t){
// Entity handler (regex callback).
// $t[1] is the entity reference without the leading '&' and trailing ';',
// e.g. 'amp', 'copy', '#169' or '#xa9'.
// Uses the global config keys $C['and_mark'], $C['named_entity'] and $C['hexdec_entity'].
// Returns the rewritten entity; unknown names and disallowed code points get
// their ampersand neutralized as 'amp;'.
global $C;
// Entities that are always passed through unchanged
static $U = array('quot'=>1,'amp'=>1,'lt'=>1,'gt'=>1);
// Known named entities => decimal code point
static $N = array(
 'fnof'=>'402', 'Alpha'=>'913', 'Beta'=>'914', 'Gamma'=>'915', 'Delta'=>'916', 'Epsilon'=>'917', 'Zeta'=>'918', 'Eta'=>'919', 'Theta'=>'920', 'Iota'=>'921', 'Kappa'=>'922', 'Lambda'=>'923', 'Mu'=>'924', 'Nu'=>'925', 'Xi'=>'926', 'Omicron'=>'927', 'Pi'=>'928', 'Rho'=>'929', 'Sigma'=>'931', 'Tau'=>'932', 'Upsilon'=>'933', 'Phi'=>'934', 'Chi'=>'935', 'Psi'=>'936', 'Omega'=>'937',
 'alpha'=>'945', 'beta'=>'946', 'gamma'=>'947', 'delta'=>'948', 'epsilon'=>'949', 'zeta'=>'950', 'eta'=>'951', 'theta'=>'952', 'iota'=>'953', 'kappa'=>'954', 'lambda'=>'955', 'mu'=>'956', 'nu'=>'957', 'xi'=>'958', 'omicron'=>'959', 'pi'=>'960', 'rho'=>'961', 'sigmaf'=>'962', 'sigma'=>'963', 'tau'=>'964', 'upsilon'=>'965', 'phi'=>'966', 'chi'=>'967', 'psi'=>'968', 'omega'=>'969', 'thetasym'=>'977', 'upsih'=>'978', 'piv'=>'982',
 'bull'=>'8226', 'hellip'=>'8230', 'prime'=>'8242', 'Prime'=>'8243', 'oline'=>'8254', 'frasl'=>'8260', 'weierp'=>'8472', 'image'=>'8465', 'real'=>'8476', 'trade'=>'8482', 'alefsym'=>'8501',
 'larr'=>'8592', 'uarr'=>'8593', 'rarr'=>'8594', 'darr'=>'8595', 'harr'=>'8596', 'crarr'=>'8629', 'lArr'=>'8656', 'uArr'=>'8657', 'rArr'=>'8658', 'dArr'=>'8659', 'hArr'=>'8660',
 'forall'=>'8704', 'part'=>'8706', 'exist'=>'8707', 'empty'=>'8709', 'nabla'=>'8711', 'isin'=>'8712', 'notin'=>'8713', 'ni'=>'8715', 'prod'=>'8719', 'sum'=>'8721', 'minus'=>'8722', 'lowast'=>'8727', 'radic'=>'8730', 'prop'=>'8733', 'infin'=>'8734', 'ang'=>'8736', 'and'=>'8743', 'or'=>'8744', 'cap'=>'8745', 'cup'=>'8746', 'int'=>'8747', 'there4'=>'8756', 'sim'=>'8764', 'cong'=>'8773', 'asymp'=>'8776', 'ne'=>'8800', 'equiv'=>'8801', 'le'=>'8804', 'ge'=>'8805', 'sub'=>'8834', 'sup'=>'8835', 'nsub'=>'8836', 'sube'=>'8838', 'supe'=>'8839', 'oplus'=>'8853', 'otimes'=>'8855', 'perp'=>'8869', 'sdot'=>'8901',
 'lceil'=>'8968', 'rceil'=>'8969', 'lfloor'=>'8970', 'rfloor'=>'8971', 'lang'=>'9001', 'rang'=>'9002', 'loz'=>'9674', 'spades'=>'9824', 'clubs'=>'9827', 'hearts'=>'9829', 'diams'=>'9830',
 'apos'=>'39', 'OElig'=>'338', 'oelig'=>'339', 'Scaron'=>'352', 'scaron'=>'353', 'Yuml'=>'376', 'circ'=>'710', 'tilde'=>'732', 'ensp'=>'8194', 'emsp'=>'8195', 'thinsp'=>'8201', 'zwnj'=>'8204', 'zwj'=>'8205', 'lrm'=>'8206', 'rlm'=>'8207', 'ndash'=>'8211', 'mdash'=>'8212', 'lsquo'=>'8216', 'rsquo'=>'8217', 'sbquo'=>'8218', 'ldquo'=>'8220', 'rdquo'=>'8221', 'bdquo'=>'8222', 'dagger'=>'8224', 'Dagger'=>'8225', 'permil'=>'8240', 'lsaquo'=>'8249', 'rsaquo'=>'8250', 'euro'=>'8364',
 'nbsp'=>'160', 'iexcl'=>'161', 'cent'=>'162', 'pound'=>'163', 'curren'=>'164', 'yen'=>'165', 'brvbar'=>'166', 'sect'=>'167', 'uml'=>'168', 'copy'=>'169', 'ordf'=>'170', 'laquo'=>'171', 'not'=>'172', 'shy'=>'173', 'reg'=>'174', 'macr'=>'175', 'deg'=>'176', 'plusmn'=>'177', 'sup2'=>'178', 'sup3'=>'179', 'acute'=>'180', 'micro'=>'181', 'para'=>'182', 'middot'=>'183', 'cedil'=>'184', 'sup1'=>'185', 'ordm'=>'186', 'raquo'=>'187', 'frac14'=>'188', 'frac12'=>'189', 'frac34'=>'190', 'iquest'=>'191',
 'Agrave'=>'192', 'Aacute'=>'193', 'Acirc'=>'194', 'Atilde'=>'195', 'Auml'=>'196', 'Aring'=>'197', 'AElig'=>'198', 'Ccedil'=>'199', 'Egrave'=>'200', 'Eacute'=>'201', 'Ecirc'=>'202', 'Euml'=>'203', 'Igrave'=>'204', 'Iacute'=>'205', 'Icirc'=>'206', 'Iuml'=>'207', 'ETH'=>'208', 'Ntilde'=>'209', 'Ograve'=>'210', 'Oacute'=>'211', 'Ocirc'=>'212', 'Otilde'=>'213', 'Ouml'=>'214', 'times'=>'215', 'Oslash'=>'216', 'Ugrave'=>'217', 'Uacute'=>'218', 'Ucirc'=>'219', 'Uuml'=>'220', 'Yacute'=>'221', 'THORN'=>'222', 'szlig'=>'223',
 'agrave'=>'224', 'aacute'=>'225', 'acirc'=>'226', 'atilde'=>'227', 'auml'=>'228', 'aring'=>'229', 'aelig'=>'230', 'ccedil'=>'231', 'egrave'=>'232', 'eacute'=>'233', 'ecirc'=>'234', 'euml'=>'235', 'igrave'=>'236', 'iacute'=>'237', 'icirc'=>'238', 'iuml'=>'239', 'eth'=>'240', 'ntilde'=>'241', 'ograve'=>'242', 'oacute'=>'243', 'ocirc'=>'244', 'otilde'=>'245', 'ouml'=>'246', 'divide'=>'247', 'oslash'=>'248', 'ugrave'=>'249', 'uacute'=>'250', 'ucirc'=>'251', 'uuml'=>'252', 'yacute'=>'253', 'thorn'=>'254', 'yuml'=>'255'
);
$ref = $t[1];
// Ampersand is emitted either literally or as the \x06 marker
$amp = $C['and_mark'] ? "\x06" : '&';

// Named entity
if($ref[0] != '#'){
 if(isset($U[$ref])){$out = $ref;}
 elseif(!isset($N[$ref])){$out = 'amp;'. $ref;} // Unknown name: neutralize the ampersand
 elseif($C['named_entity']){$out = $ref;}
 elseif($C['hexdec_entity'] > 1){$out = '#x'. dechex($N[$ref]);}
 else{$out = '#'. $N[$ref];}
 return $amp. $out. ';';
}

// Numeric entity: all digits means decimal, otherwise the part after the 'x' is read as hex
$num = substr($ref, 1);
$isDec = ctype_digit($num);
$n = $isDec ? intval($num) : hexdec(substr($num, 1));
$lowCtrl = ($n < 9) || ($n > 13 && $n < 32) || $n == 11 || $n == 12;
$highCtrl = ($n > 126 && $n < 160 && $n != 133);
$beyond = ($n > 55295) && ($n < 57344 || ($n > 64975 && $n < 64992) || $n == 65534 || $n == 65535 || $n > 1114111);
if($lowCtrl || $highCtrl || $beyond){
 // Disallowed code point: neutralize the ampersand
 return $amp. "amp;#{$num};";
}
// Decimal form is kept for decimal input when hexdec_entity < 2, and always when hexdec_entity is off
if(($isDec && $C['hexdec_entity'] < 2) || !$C['hexdec_entity']){$val = $n;}
else{$val = 'x'. dechex($n);}
return $amp. '#'. $val. ';';
}
333 333
function hl_prot($p, $c=null){
// Check the scheme (protocol) of a URL and optionally rewrite the URL.
// $p - the URL when $c is given; otherwise a match array whose [1], [2] and [3]
//      are the text before the URL, the URL itself and the text after it
// $c - key used to look up the allowed schemes in $C['schemes'];
//      when omitted, 'style' is used
// Returns the URL (with any before/after text re-attached); a URL whose scheme
// is not allowed gets 'denied:' prefixed.
// Also uses the global config keys $C['abs_url'] and $C['base_url'].
global $C;
static $d = 'denied:';
$b = '';
$a = '';
if($c == null){
 $c = 'style';
 $b = $p[1];
 $a = $p[3];
 $p = trim($p[2]);
}
$c = isset($C['schemes'][$c]) ? $C['schemes'][$c] : $C['schemes']['*'];

// '!' in the scheme list denies every URL
$isDenied = (substr($p, 0, 7) == $d);
if(isset($c['!']) && !$isDenied){
 $p = "$d$p";
 $isDenied = true;
}
// Nothing more to do: all schemes ok, URL starts with '#', '?' or ';', or is already denied
if(isset($c['*']) or !strcspn($p, '#?;') or $isDenied){
 return "{$b}{$p}{$a}";
}
// Scheme present (colon may be entity-, percent- or backslash-encoded) but not in the allowed list
$hasScheme = preg_match('`^([^:?[@!$()*,=/\'\]]+?)(:|&#(58|x3a);|%3a|\\\\0{0,4}3a).`i', $p, $m);
if($hasScheme && !isset($c[strtolower($m[1])])){
 return "{$b}{$d}{$p}{$a}";
}
if(!$C['abs_url']){
 return "{$b}{$p}{$a}";
}

if($C['abs_url'] == -1 && strpos($p, $C['base_url']) === 0){
 // Make URL relative by dropping the base URL prefix
 $p = substr($p, strlen($C['base_url']));
}elseif(empty($m[1])){
 // No scheme found: make URL absolute
 if(substr($p, 0, 2) == '//'){
  // Scheme-relative: borrow the base URL's scheme
  $p = substr($C['base_url'], 0, strpos($C['base_url'], ':')+1). $p;
 }elseif($p[0] == '/'){
  // Host-relative: borrow scheme and host
  $p = preg_replace('`(^.+?://[^/]+)(.*)`', '$1', $C['base_url']). $p;
 }elseif(strcspn($p, './')){
  // Does not start with '.' or '/': append to the base URL
  $p = $C['base_url']. $p;
 }else{
  // Starts with '.': resolve './' and '../' against the base URL's path
  preg_match('`^([a-zA-Z\d\-+.]+://[^/]+)(.*)`', $C['base_url'], $u);
  $p = preg_replace('`(?<=/)\./`', '', $u[2]. $p);
  $up = '`(?<=/)([^/]{3,}|[^/.]+?|\.[^/.]|[^/.]\.)/\.\./`';
  do{
   $p = preg_replace($up, '', $p, -1, $hits);
  }while($hits);
  $p = $u[1]. $p;
 }
}
return "{$b}{$p}{$a}";
}
365 365
function hl_regex($p){
// Check whether $p is a usable PCRE pattern.
// $p - candidate pattern, including delimiters and modifiers
// Returns 1 if preg_match() accepts the pattern, 0 otherwise (also 0 for an
// empty or non-string value).
// The verdict comes from preg_match()'s own return value (false on a pattern
// error), so it does not depend on error_get_last(), the track_errors /
// display_errors ini settings, or whatever error handler the application has
// installed.
if(empty($p) or !is_string($p)){return 0;}
// Temporarily swallow the warning that preg_match() raises for a bad pattern,
// so it neither shows up in output nor reaches the application's own handler
set_error_handler(function(){return true;});
$r = preg_match($p, '');
restore_error_handler();
return $r === false ? 0 : 1;
}
386 386
function hl_spec($t){
// Parse a $spec string into the final spec array.
// $t - rule sets separated by ';', each of the form
//      element[,element...]=attr[(rule=value[/rule=value...])][,-attr...]
//      Double-quoted parts may hold the delimiter characters ; | ~ space , / ( )
//      and, written as `", a double quote.
// Returns an array keyed by lower-cased element name. For each element:
//   [attr] = 1                          attribute listed without (valid) rules
//   [attr] = array(rule => value, ...)  attribute with rules
//   ['n']  = array(attr => 1, ...)      attributes listed with a leading '-'
//                                       ('-*' is stored under '*')
// Rule sets are processed from last to first and merged with array_merge(),
// so for a repeated element the set appearing earlier in the string wins.
// An invalid rule fragment (empty, no '=', or rule name shorter than 5
// characters) is ignored; it neither discards the attribute's other rules nor
// causes a scalar-as-array error.
$s = array();
// Quoted parts: hide delimiter characters behind \x01-\x08 and strip the quotes
if(!function_exists('hl_aux1')){function hl_aux1($m){
 return substr(str_replace(array(";", "|", "~", " ", ",", "/", "(", ")", '`"'), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08", '"'), $m[0]), 1, -1);
}}
$t = str_replace(array("\t", "\r", "\n", ' '), '', preg_replace_callback('/"(?>(`.|[^"])*)"/sm', 'hl_aux1', trim($t)));
for($i = count(($t = explode(';', $t))); --$i>=0;){
 $w = $t[$i];
 // Need 'elements=attributes' with a non-empty right-hand side
 if(empty($w) or ($e = strpos($w, '=')) === false or !strlen(($a = substr($w, $e+1)))){continue;}
 $y = $n = array(); // $y permitted attr (with rules), $n denied attr
 foreach(explode(',', $a) as $v){
  if(!preg_match('`^([a-z:\-\*]+)(?:\((.*?)\))?`i', $v, $m)){continue;}
  if(($x = strtolower($m[1])) == '-*'){$n['*'] = 1; continue;}
  if($x[0] == '-'){$n[substr($x, 1)] = 1; continue;}
  if(!isset($m[2])){$y[$x] = 1; continue;}
  foreach(explode('/', $m[2]) as $rule){
   $p = strpos($rule, '=');
   if(empty($rule) or $p === false or $p < 5){
    // Invalid fragment: just make sure the attribute is listed
    if(!isset($y[$x])){$y[$x] = 1;}
    continue;
   }
   if(!isset($y[$x]) or !is_array($y[$x])){$y[$x] = array();}
   // Restore the delimiter characters hidden by hl_aux1
   $y[$x][strtolower(substr($rule, 0, $p))] = str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x06", "\x07", "\x08"), array(";", "|", "~", " ", ",", "/", "(", ")"), substr($rule, $p+1));
  }
  // Drop match/nomatch rules whose regex does not pass hl_regex()
  if(isset($y[$x]['match']) && !hl_regex($y[$x]['match'])){unset($y[$x]['match']);}
  if(isset($y[$x]['nomatch']) && !hl_regex($y[$x]['nomatch'])){unset($y[$x]['nomatch']);}
 }
 if(!count($y) && !count($n)){continue;}
 foreach(explode(',', substr($w, 0, $e)) as $v){
  if(!strlen(($v = strtolower($v)))){continue;}
  if(count($y)){if(!isset($s[$v])){$s[$v] = $y;} else{$s[$v] = array_merge($s[$v], $y);}}
  if(count($n)){if(!isset($s[$v]['n'])){$s[$v]['n'] = $n;} else{$s[$v]['n'] = array_merge($s[$v]['n'], $n);}}
 }
}
return $s;
}
419 419
420function hl_tag($t){ 420function hl_tag($t){
421// tag/attribute handler 421// tag/attribute handler
422global $C; 422global $C;
423$t = $t[0]; 423$t = $t[0];
424// invalid < > 424// invalid < >
425if($t == '< '){return '&lt; ';} 425if($t == '< '){return '&lt; ';}
426if($t == '>'){return '&gt;';} 426if($t == '>'){return '&gt;';}
427if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){ 427if(!preg_match('`^<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>$`m', $t, $m)){
428 return str_replace(array('<', '>'), array('&lt;', '&gt;'), $t); 428 return str_replace(array('<', '>'), array('&lt;', '&gt;'), $t);
429}elseif(!isset($C['elements'][($e = strtolower($m[2]))])){ 429}elseif(!isset($C['elements'][($e = strtolower($m[2]))])){
430 return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : ''); 430 return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : '');
431} 431}
432// attr string 432// attr string
433$a = str_replace(array("\n", "\r", "\t"), ' ', trim($m[3])); 433$a = str_replace(array("\n", "\r", "\t"), ' ', trim($m[3]));
434// tag transform 434// tag transform
435static $eD = array('acronym'=>1, 'applet'=>1, 'big'=>1, 'center'=>1, 'dir'=>1, 'font'=>1, 'isindex'=>1, 's'=>1, 'strike'=>1, 'tt'=>1); // Deprecated 435static $eD = array('acronym'=>1, 'applet'=>1, 'big'=>1, 'center'=>1, 'dir'=>1, 'font'=>1, 'isindex'=>1, 's'=>1, 'strike'=>1, 'tt'=>1); // Deprecated
436if($C['make_tag_strict'] && isset($eD[$e])){ 436if($C['make_tag_strict'] && isset($eD[$e])){
437 $trt = hl_tag2($e, $a, $C['make_tag_strict']); 437 $trt = hl_tag2($e, $a, $C['make_tag_strict']);
438 if(!$e){return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : '');} 438 if(!$e){return (($C['keep_bad']%2) ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : '');}
439} 439}
440// close tag 440// close tag
441static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); // Empty ele 441static $eE = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); // Empty ele
442if(!empty($m[1])){ 442if(!empty($m[1])){
443 return (!isset($eE[$e]) ? (empty($C['hook_tag']) ? "</$e>" : $C['hook_tag']($e)) : (($C['keep_bad'])%2 ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : '')); 443 return (!isset($eE[$e]) ? (empty($C['hook_tag']) ? "</$e>" : $C['hook_tag']($e)) : (($C['keep_bad'])%2 ? str_replace(array('<', '>'), array('&lt;', '&gt;'), $t) : ''));
444} 444}
445 445
446// open tag & attr 446// open tag & attr
447static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accept-charset'=>array('form'=>1), 'action'=>array('form'=>1), 'align'=>array('applet'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'div'=>1, 'embed'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'allowfullscreen'=>array('iframe'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'async'=>array('script'=>1), 'autocomplete'=>array('form'=>1, 'input'=>1), 'autofocus'=>array('button'=>1, 'input'=>1, 'keygen'=>1, 'select'=>1, 'textarea'=>1), 'autoplay'=>array('audio'=>1, 'video'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('img'=>1, 'object'=>1, 'table'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'challenge'=>array('keygen'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('command'=>1, 'input'=>1), 'cite'=>array('blockquote'=>1, 'del'=>1, 'ins'=>1, 'q'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('applet'=>1, 'object'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'content'=>array('meta'=>1), 'controls'=>array('audio'=>1, 'video'=>1), 'coords'=>array('a'=>1, 'area'=>1), 'crossorigin'=>array('img'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1, 'time'=>1), 
'declare'=>array('object'=>1), 'default'=>array('track'=>1), 'defer'=>array('script'=>1), 'dirname'=>array('input'=>1, 'textarea'=>1), 'disabled'=>array('button'=>1, 'command'=>1, 'fieldset'=>1, 'input'=>1, 'keygen'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'download'=>array('a'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'flashvars'=>array('embed'=>1), 'for'=>array('label'=>1, 'output'=>1), 'form'=>array('button'=>1, 'fieldset'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'object'=>1, 'output'=>1, 'select'=>1, 'textarea'=>1), 'formaction'=>array('button'=>1, 'input'=>1), 'formenctype'=>array('button'=>1, 'input'=>1), 'formmethod'=>array('button'=>1, 'input'=>1), 'formnovalidate'=>array('button'=>1, 'input'=>1), 'formtarget'=>array('button'=>1, 'input'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('applet'=>1, 'canvas'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'high'=>array('meter'=>1), 'href'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hreflang'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 'object'=>1), 'icon'=>array('command'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'keyparams'=>array('keygen'=>1), 'keytype'=>array('keygen'=>1), 'kind'=>array('track'=>1), 'label'=>array('command'=>1, 'menu'=>1, 'option'=>1, 'optgroup'=>1, 'track'=>1), 'language'=>array('script'=>1), 'list'=>array('input'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'loop'=>array('audio'=>1, 'video'=>1), 'low'=>array('meter'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'max'=>array('input'=>1, 'meter'=>1, 'progress'=>1), 'maxlength'=>array('input'=>1, 'textarea'=>1), 'media'=>array('a'=>1, 'area'=>1, 'link'=>1, 'source'=>1, 'style'=>1), 'mediagroup'=>array('audio'=>1, 'video'=>1), 'method'=>array('form'=>1), 'min'=>array('input'=>1, 'meter'=>1), 
'model'=>array('embed'=>1), 'multiple'=>array('input'=>1, 'select'=>1), 'muted'=>array('audio'=>1, 'video'=>1), 'name'=>array('a'=>1, 'applet'=>1, 'button'=>1, 'embed'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'keygen'=>1, 'map'=>1, 'object'=>1, 'output'=>1, 'param'=>1, 'select'=>1, 'textarea'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'novalidate'=>array('form'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'open'=>array('details'=>1), 'optimum'=>array('meter'=>1), 'pattern'=>array('input'=>1), 'ping'=>array('a'=>1, 'area'=>1), 'placeholder'=>array('input'=>1, 'textarea'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'poster'=>array('video'=>1), 'pqg'=>array('keygen'=>1), 'preload'=>array('audio'=>1, 'video'=>1), 'prompt'=>array('isindex'=>1), 'pubdate'=>array('time'=>1), 'radiogroup'=>array('command'=>1), 'readonly'=>array('input'=>1, 'textarea'=>1), 'rel'=>array('a'=>1, 'area'=>1, 'link'=>1), 'required'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'rev'=>array('a'=>1), 'reversed'=>array('ol'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 'sandbox'=>array('iframe'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scoped'=>array('style'=>1), 'scrolling'=>array('iframe'=>1), 'seamless'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('a'=>1, 'area'=>1), 'size'=>array('font'=>1, 'hr'=>1, 'input'=>1, 'select'=>1), 'sizes'=>array('link'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('audio'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'script'=>1, 'source'=>1, 'track'=>1, 'video'=>1), 'srcdoc'=>array('iframe'=>1), 'srclang'=>array('track'=>1), 'srcset'=>array('img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'step'=>array('input'=>1), 'summary'=>array('table'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'area'=>1, 'button'=>1, 'command'=>1, 'embed'=>1, 
'input'=>1, 'li'=>1, 'link'=>1, 'menu'=>1, 'object'=>1, 'ol'=>1, 'param'=>1, 'script'=>1, 'source'=>1, 'style'=>1, 'ul'=>1), 'typemustmatch'=>array('object'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('button'=>1, 'data'=>1, 'input'=>1, 'li'=>1, 'meter'=>1, 'option'=>1, 'param'=>1, 'progress'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 'object'=>1), 'width'=>array('applet'=>1, 'canvas'=>1, 'col'=>1, 'colgroup'=>1, 'embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'wmode'=>array('embed'=>1), 'wrap'=>array('textarea'=>1)); // Ele-specific 447static $aN = array('abbr'=>array('td'=>1, 'th'=>1), 'accept'=>array('form'=>1, 'input'=>1), 'accept-charset'=>array('form'=>1), 'action'=>array('form'=>1), 'align'=>array('applet'=>1, 'caption'=>1, 'col'=>1, 'colgroup'=>1, 'div'=>1, 'embed'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'allowfullscreen'=>array('iframe'=>1), 'alt'=>array('applet'=>1, 'area'=>1, 'img'=>1, 'input'=>1), 'archive'=>array('applet'=>1, 'object'=>1), 'async'=>array('script'=>1), 'autocomplete'=>array('form'=>1, 'input'=>1), 'autofocus'=>array('button'=>1, 'input'=>1, 'keygen'=>1, 'select'=>1, 'textarea'=>1), 'autoplay'=>array('audio'=>1, 'video'=>1), 'axis'=>array('td'=>1, 'th'=>1), 'bgcolor'=>array('embed'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('img'=>1, 'object'=>1, 'table'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellpadding'=>array('table'=>1), 'cellspacing'=>array('table'=>1), 'challenge'=>array('keygen'=>1), 'char'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 
'tr'=>1), 'charoff'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'charset'=>array('a'=>1, 'script'=>1), 'checked'=>array('command'=>1, 'input'=>1), 'cite'=>array('blockquote'=>1, 'del'=>1, 'ins'=>1, 'q'=>1), 'classid'=>array('object'=>1), 'clear'=>array('br'=>1), 'code'=>array('applet'=>1), 'codebase'=>array('applet'=>1, 'object'=>1), 'codetype'=>array('object'=>1), 'color'=>array('font'=>1), 'cols'=>array('textarea'=>1), 'colspan'=>array('td'=>1, 'th'=>1), 'compact'=>array('dir'=>1, 'dl'=>1, 'menu'=>1, 'ol'=>1, 'ul'=>1), 'content'=>array('meta'=>1), 'controls'=>array('audio'=>1, 'video'=>1), 'coords'=>array('a'=>1, 'area'=>1), 'crossorigin'=>array('img'=>1), 'data'=>array('object'=>1), 'datetime'=>array('del'=>1, 'ins'=>1, 'time'=>1), 'declare'=>array('object'=>1), 'default'=>array('track'=>1), 'defer'=>array('script'=>1), 'dirname'=>array('input'=>1, 'textarea'=>1), 'disabled'=>array('button'=>1, 'command'=>1, 'fieldset'=>1, 'input'=>1, 'keygen'=>1, 'optgroup'=>1, 'option'=>1, 'select'=>1, 'textarea'=>1), 'download'=>array('a'=>1), 'enctype'=>array('form'=>1), 'face'=>array('font'=>1), 'flashvars'=>array('embed'=>1), 'for'=>array('label'=>1, 'output'=>1), 'form'=>array('button'=>1, 'fieldset'=>1, 'input'=>1, 'keygen'=>1, 'label'=>1, 'object'=>1, 'output'=>1, 'select'=>1, 'textarea'=>1), 'formaction'=>array('button'=>1, 'input'=>1), 'formenctype'=>array('button'=>1, 'input'=>1), 'formmethod'=>array('button'=>1, 'input'=>1), 'formnovalidate'=>array('button'=>1, 'input'=>1), 'formtarget'=>array('button'=>1, 'input'=>1), 'frame'=>array('table'=>1), 'frameborder'=>array('iframe'=>1), 'headers'=>array('td'=>1, 'th'=>1), 'height'=>array('applet'=>1, 'canvas'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'high'=>array('meter'=>1), 'href'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hreflang'=>array('a'=>1, 'area'=>1, 'link'=>1), 'hspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 
'object'=>1), 'icon'=>array('command'=>1), 'ismap'=>array('img'=>1, 'input'=>1), 'keyparams'=>array('keygen'=>1), 'keytype'=>array('keygen'=>1), 'kind'=>array('track'=>1), 'label'=>array('command'=>1, 'menu'=>1, 'option'=>1, 'optgroup'=>1, 'track'=>1), 'language'=>array('script'=>1), 'list'=>array('input'=>1), 'longdesc'=>array('img'=>1, 'iframe'=>1), 'loop'=>array('audio'=>1, 'video'=>1), 'low'=>array('meter'=>1), 'marginheight'=>array('iframe'=>1), 'marginwidth'=>array('iframe'=>1), 'max'=>array('input'=>1, 'meter'=>1, 'progress'=>1), 'maxlength'=>array('input'=>1, 'textarea'=>1), 'media'=>array('a'=>1, 'area'=>1, 'link'=>1, 'source'=>1, 'style'=>1), 'mediagroup'=>array('audio'=>1, 'video'=>1), 'method'=>array('form'=>1), 'min'=>array('input'=>1, 'meter'=>1), 'model'=>array('embed'=>1), 'multiple'=>array('input'=>1, 'select'=>1), 'muted'=>array('audio'=>1, 'video'=>1), 'name'=>array('a'=>1, 'applet'=>1, 'button'=>1, 'embed'=>1, 'fieldset'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'keygen'=>1, 'map'=>1, 'object'=>1, 'output'=>1, 'param'=>1, 'select'=>1, 'textarea'=>1), 'nohref'=>array('area'=>1), 'noshade'=>array('hr'=>1), 'novalidate'=>array('form'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'object'=>array('applet'=>1), 'open'=>array('details'=>1), 'optimum'=>array('meter'=>1), 'pattern'=>array('input'=>1), 'ping'=>array('a'=>1, 'area'=>1), 'placeholder'=>array('input'=>1, 'textarea'=>1), 'pluginspage'=>array('embed'=>1), 'pluginurl'=>array('embed'=>1), 'poster'=>array('video'=>1), 'pqg'=>array('keygen'=>1), 'preload'=>array('audio'=>1, 'video'=>1), 'prompt'=>array('isindex'=>1), 'pubdate'=>array('time'=>1), 'radiogroup'=>array('command'=>1), 'readonly'=>array('input'=>1, 'textarea'=>1), 'rel'=>array('a'=>1, 'area'=>1, 'link'=>1), 'required'=>array('input'=>1, 'select'=>1, 'textarea'=>1), 'rev'=>array('a'=>1), 'reversed'=>array('ol'=>1), 'rows'=>array('textarea'=>1), 'rowspan'=>array('td'=>1, 'th'=>1), 'rules'=>array('table'=>1), 
'sandbox'=>array('iframe'=>1), 'scope'=>array('td'=>1, 'th'=>1), 'scoped'=>array('style'=>1), 'scrolling'=>array('iframe'=>1), 'seamless'=>array('iframe'=>1), 'selected'=>array('option'=>1), 'shape'=>array('a'=>1, 'area'=>1), 'size'=>array('font'=>1, 'hr'=>1, 'input'=>1, 'select'=>1), 'sizes'=>array('link'=>1), 'span'=>array('col'=>1, 'colgroup'=>1), 'src'=>array('audio'=>1, 'embed'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'script'=>1, 'source'=>1, 'track'=>1, 'video'=>1), 'srcdoc'=>array('iframe'=>1), 'srclang'=>array('track'=>1), 'srcset'=>array('img'=>1), 'standby'=>array('object'=>1), 'start'=>array('ol'=>1), 'step'=>array('input'=>1), 'summary'=>array('table'=>1), 'target'=>array('a'=>1, 'area'=>1, 'form'=>1), 'type'=>array('a'=>1, 'area'=>1, 'button'=>1, 'command'=>1, 'embed'=>1, 'input'=>1, 'li'=>1, 'link'=>1, 'menu'=>1, 'object'=>1, 'ol'=>1, 'param'=>1, 'script'=>1, 'source'=>1, 'style'=>1, 'ul'=>1), 'typemustmatch'=>array('object'=>1), 'usemap'=>array('img'=>1, 'input'=>1, 'object'=>1), 'valign'=>array('col'=>1, 'colgroup'=>1, 'tbody'=>1, 'td'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1), 'value'=>array('button'=>1, 'data'=>1, 'input'=>1, 'li'=>1, 'meter'=>1, 'option'=>1, 'param'=>1, 'progress'=>1), 'valuetype'=>array('param'=>1), 'vspace'=>array('applet'=>1, 'embed'=>1, 'img'=>1, 'object'=>1), 'width'=>array('applet'=>1, 'canvas'=>1, 'col'=>1, 'colgroup'=>1, 'embed'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'video'=>1), 'wmode'=>array('embed'=>1), 'wrap'=>array('textarea'=>1)); // Ele-specific
448static $aNA = array('aria-activedescendant'=>1, 'aria-atomic'=>1, 'aria-autocomplete'=>1, 'aria-busy'=>1, 'aria-checked'=>1, 'aria-controls'=>1, 'aria-describedby'=>1, 'aria-disabled'=>1, 'aria-dropeffect'=>1, 'aria-expanded'=>1, 'aria-flowto'=>1, 'aria-grabbed'=>1, 'aria-haspopup'=>1, 'aria-hidden'=>1, 'aria-invalid'=>1, 'aria-label'=>1, 'aria-labelledby'=>1, 'aria-level'=>1, 'aria-live'=>1, 'aria-multiline'=>1, 'aria-multiselectable'=>1, 'aria-orientation'=>1, 'aria-owns'=>1, 'aria-posinset'=>1, 'aria-pressed'=>1, 'aria-readonly'=>1, 'aria-relevant'=>1, 'aria-required'=>1, 'aria-selected'=>1, 'aria-setsize'=>1, 'aria-sort'=>1, 'aria-valuemax'=>1, 'aria-valuemin'=>1, 'aria-valuenow'=>1, 'aria-valuetext'=>1); // ARIA 448static $aNA = array('aria-activedescendant'=>1, 'aria-atomic'=>1, 'aria-autocomplete'=>1, 'aria-busy'=>1, 'aria-checked'=>1, 'aria-controls'=>1, 'aria-describedby'=>1, 'aria-disabled'=>1, 'aria-dropeffect'=>1, 'aria-expanded'=>1, 'aria-flowto'=>1, 'aria-grabbed'=>1, 'aria-haspopup'=>1, 'aria-hidden'=>1, 'aria-invalid'=>1, 'aria-label'=>1, 'aria-labelledby'=>1, 'aria-level'=>1, 'aria-live'=>1, 'aria-multiline'=>1, 'aria-multiselectable'=>1, 'aria-orientation'=>1, 'aria-owns'=>1, 'aria-posinset'=>1, 'aria-pressed'=>1, 'aria-readonly'=>1, 'aria-relevant'=>1, 'aria-required'=>1, 'aria-selected'=>1, 'aria-setsize'=>1, 'aria-sort'=>1, 'aria-valuemax'=>1, 'aria-valuemin'=>1, 'aria-valuenow'=>1, 'aria-valuetext'=>1); // ARIA
449static $aNE = array('allowfullscreen'=>1, 'checkbox'=>1, 'checked'=>1, 'command'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'default'=>1, 'disabled'=>1, 'hidden'=>1, 'inert'=>1, 'ismap'=>1, 'itemscope'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'open'=>1, 'radio'=>1, 'readonly'=>1, 'required'=>1, 'reversed'=>1, 'selected'=>1); // Empty 449static $aNE = array('allowfullscreen'=>1, 'checkbox'=>1, 'checked'=>1, 'command'=>1, 'compact'=>1, 'declare'=>1, 'defer'=>1, 'default'=>1, 'disabled'=>1, 'hidden'=>1, 'inert'=>1, 'ismap'=>1, 'itemscope'=>1, 'multiple'=>1, 'nohref'=>1, 'noresize'=>1, 'noshade'=>1, 'nowrap'=>1, 'open'=>1, 'radio'=>1, 'readonly'=>1, 'required'=>1, 'reversed'=>1, 'selected'=>1); // Empty
450static $aNO = array('onabort'=>1, 'onblur'=>1, 'oncanplay'=>1, 'oncanplaythrough'=>1, 'onchange'=>1, 'onclick'=>1, 'oncontextmenu'=>1, 'oncopy'=>1, 'oncuechange'=>1, 'oncut'=>1, 'ondblclick'=>1, 'ondrag'=>1, 'ondragend'=>1, 'ondragenter'=>1, 'ondragleave'=>1, 'ondragover'=>1, 'ondragstart'=>1, 'ondrop'=>1, 'ondurationchange'=>1, 'onemptied'=>1, 'onended'=>1, 'onerror'=>1, 'onfocus'=>1, 'onformchange'=>1, 'onforminput'=>1, 'oninput'=>1, 'oninvalid'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onload'=>1, 'onloadeddata'=>1, 'onloadedmetadata'=>1, 'onloadstart'=>1, 'onlostpointercapture'=>1, 'onmousedown'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 'onmouseover'=>1, 'onmouseup'=>1, 'onmousewheel'=>1, 'onpaste'=>1, 'onpause'=>1, 'onplay'=>1, 'onplaying'=>1, 'onpointercancel'=>1, 'ongotpointercapture'=>1, 'onpointerdown'=>1, 'onpointerenter'=>1, 'onpointerleave'=>1, 'onpointermove'=>1, 'onpointerout'=>1, 'onpointerover'=>1, 'onpointerup'=>1, 'onprogress'=>1, 'onratechange'=>1, 'onreadystatechange'=>1, 'onreset'=>1, 'onsearch'=>1, 'onscroll'=>1, 'onseeked'=>1, 'onseeking'=>1, 'onselect'=>1, 'onshow'=>1, 'onstalled'=>1, 'onsubmit'=>1, 'onsuspend'=>1, 'ontimeupdate'=>1, 'ontoggle'=>1, 'ontouchcancel'=>1, 'ontouchend'=>1, 'ontouchmove'=>1, 'ontouchstart'=>1, 'onvolumechange'=>1, 'onwaiting'=>1, 'onwheel'=>1); // Event 450static $aNO = array('onabort'=>1, 'onblur'=>1, 'oncanplay'=>1, 'oncanplaythrough'=>1, 'onchange'=>1, 'onclick'=>1, 'oncontextmenu'=>1, 'oncopy'=>1, 'oncuechange'=>1, 'oncut'=>1, 'ondblclick'=>1, 'ondrag'=>1, 'ondragend'=>1, 'ondragenter'=>1, 'ondragleave'=>1, 'ondragover'=>1, 'ondragstart'=>1, 'ondrop'=>1, 'ondurationchange'=>1, 'onemptied'=>1, 'onended'=>1, 'onerror'=>1, 'onfocus'=>1, 'onformchange'=>1, 'onforminput'=>1, 'oninput'=>1, 'oninvalid'=>1, 'onkeydown'=>1, 'onkeypress'=>1, 'onkeyup'=>1, 'onload'=>1, 'onloadeddata'=>1, 'onloadedmetadata'=>1, 'onloadstart'=>1, 'onlostpointercapture'=>1, 'onmousedown'=>1, 'onmousemove'=>1, 'onmouseout'=>1, 
'onmouseover'=>1, 'onmouseup'=>1, 'onmousewheel'=>1, 'onpaste'=>1, 'onpause'=>1, 'onplay'=>1, 'onplaying'=>1, 'onpointercancel'=>1, 'ongotpointercapture'=>1, 'onpointerdown'=>1, 'onpointerenter'=>1, 'onpointerleave'=>1, 'onpointermove'=>1, 'onpointerout'=>1, 'onpointerover'=>1, 'onpointerup'=>1, 'onprogress'=>1, 'onratechange'=>1, 'onreadystatechange'=>1, 'onreset'=>1, 'onsearch'=>1, 'onscroll'=>1, 'onseeked'=>1, 'onseeking'=>1, 'onselect'=>1, 'onshow'=>1, 'onstalled'=>1, 'onsubmit'=>1, 'onsuspend'=>1, 'ontimeupdate'=>1, 'ontoggle'=>1, 'ontouchcancel'=>1, 'ontouchend'=>1, 'ontouchmove'=>1, 'ontouchstart'=>1, 'onvolumechange'=>1, 'onwaiting'=>1, 'onwheel'=>1); // Event
451static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'itemtype'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'src'=>1, 'srcset'=>1, 'usemap'=>1); // Need scheme check; excludes style, on* 451static $aNP = array('action'=>1, 'cite'=>1, 'classid'=>1, 'codebase'=>1, 'data'=>1, 'href'=>1, 'itemtype'=>1, 'longdesc'=>1, 'model'=>1, 'pluginspage'=>1, 'pluginurl'=>1, 'src'=>1, 'srcset'=>1, 'usemap'=>1); // Need scheme check; excludes style, on*
452static $aNU = array('accesskey'=>1, 'class'=>1, 'contenteditable'=>1, 'contextmenu'=>1, 'dir'=>1, 'draggable'=>1, 'dropzone'=>1, 'hidden'=>1, 'id'=>1, 'inert'=>1, 'itemid'=>1, 'itemprop'=>1, 'itemref'=>1, 'itemscope'=>1, 'itemtype'=>1, 'lang'=>1, 'role'=>1, 'spellcheck'=>1, 'style'=>1, 'tabindex'=>1, 'title'=>1, 'translate'=>1, 'xmlns'=>1, 'xml:base'=>1, 'xml:lang'=>1, 'xml:space'=>1); // Univ; excludes on*, aria* 452static $aNU = array('accesskey'=>1, 'class'=>1, 'contenteditable'=>1, 'contextmenu'=>1, 'dir'=>1, 'draggable'=>1, 'dropzone'=>1, 'hidden'=>1, 'id'=>1, 'inert'=>1, 'itemid'=>1, 'itemprop'=>1, 'itemref'=>1, 'itemscope'=>1, 'itemtype'=>1, 'lang'=>1, 'role'=>1, 'spellcheck'=>1, 'style'=>1, 'tabindex'=>1, 'title'=>1, 'translate'=>1, 'xmlns'=>1, 'xml:base'=>1, 'xml:lang'=>1, 'xml:space'=>1); // Univ; excludes on*, aria*
453 453
454if($C['lc_std_val']){ 454if($C['lc_std_val']){
455 // predef attr vals for $eAL & $aNE ele 455 // predef attr vals for $eAL & $aNE ele
456 static $aNL = array('all'=>1, 'auto'=>1, 'baseline'=>1, 'bottom'=>1, 'button'=>1, 'captions'=>1, 'center'=>1, 'chapters'=>1, 'char'=>1, 'checkbox'=>1, 'circle'=>1, 'col'=>1, 'colgroup'=>1, 'color'=>1, 'cols'=>1, 'data'=>1, 'date'=>1, 'datetime'=>1, 'datetime-local'=>1, 'default'=>1, 'descriptions'=>1, 'email'=>1, 'file'=>1, 'get'=>1, 'groups'=>1, 'hidden'=>1, 'image'=>1, 'justify'=>1, 'left'=>1, 'ltr'=>1, 'metadata'=>1, 'middle'=>1, 'month'=>1, 'none'=>1, 'number'=>1, 'object'=>1, 'password'=>1, 'poly'=>1, 'post'=>1, 'preserve'=>1, 'radio'=>1, 'range'=>1, 'rect'=>1, 'ref'=>1, 'reset'=>1, 'right'=>1, 'row'=>1, 'rowgroup'=>1, 'rows'=>1, 'rtl'=>1, 'search'=>1, 'submit'=>1, 'subtitles'=>1, 'tel'=>1, 'text'=>1, 'time'=>1, 'top'=>1, 'url'=>1, 'week'=>1); 456 static $aNL = array('all'=>1, 'auto'=>1, 'baseline'=>1, 'bottom'=>1, 'button'=>1, 'captions'=>1, 'center'=>1, 'chapters'=>1, 'char'=>1, 'checkbox'=>1, 'circle'=>1, 'col'=>1, 'colgroup'=>1, 'color'=>1, 'cols'=>1, 'data'=>1, 'date'=>1, 'datetime'=>1, 'datetime-local'=>1, 'default'=>1, 'descriptions'=>1, 'email'=>1, 'file'=>1, 'get'=>1, 'groups'=>1, 'hidden'=>1, 'image'=>1, 'justify'=>1, 'left'=>1, 'ltr'=>1, 'metadata'=>1, 'middle'=>1, 'month'=>1, 'none'=>1, 'number'=>1, 'object'=>1, 'password'=>1, 'poly'=>1, 'post'=>1, 'preserve'=>1, 'radio'=>1, 'range'=>1, 'rect'=>1, 'ref'=>1, 'reset'=>1, 'right'=>1, 'row'=>1, 'rowgroup'=>1, 'rows'=>1, 'rtl'=>1, 'search'=>1, 'submit'=>1, 'subtitles'=>1, 'tel'=>1, 'text'=>1, 'time'=>1, 'top'=>1, 'url'=>1, 'week'=>1);
457 static $eAL = array('a'=>1, 'area'=>1, 'bdo'=>1, 'button'=>1, 'col'=>1, 'fieldset'=>1, 'form'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'track'=>1, 'xml:space'=>1); 457 static $eAL = array('a'=>1, 'area'=>1, 'bdo'=>1, 'button'=>1, 'col'=>1, 'fieldset'=>1, 'form'=>1, 'img'=>1, 'input'=>1, 'object'=>1, 'ol'=>1, 'optgroup'=>1, 'option'=>1, 'param'=>1, 'script'=>1, 'select'=>1, 'table'=>1, 'td'=>1, 'textarea'=>1, 'tfoot'=>1, 'th'=>1, 'thead'=>1, 'tr'=>1, 'track'=>1, 'xml:space'=>1);
458 $lcase = isset($eAL[$e]) ? 1 : 0; 458 $lcase = isset($eAL[$e]) ? 1 : 0;
459} 459}
460 460
461$depTr = 0; 461$depTr = 0;
462if($C['no_deprecated_attr']){ 462if($C['no_deprecated_attr']){
463 // depr attr:applicable ele 463 // depr attr:applicable ele
464 static $aND = array('align'=>array('caption'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1), 'bgcolor'=>array('table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellspacing'=>array('table'=>1), 'clear'=>array('br'=>1), 'compact'=>array('dl'=>1, 'ol'=>1, 'ul'=>1), 'height'=>array('td'=>1, 'th'=>1), 'hspace'=>array('img'=>1, 'object'=>1), 'language'=>array('script'=>1), 'name'=>array('a'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'map'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'size'=>array('hr'=>1), 'vspace'=>array('img'=>1, 'object'=>1), 'width'=>array('hr'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1)); 464 static $aND = array('align'=>array('caption'=>1, 'div'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'object'=>1, 'p'=>1, 'table'=>1), 'bgcolor'=>array('table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1), 'border'=>array('object'=>1), 'bordercolor'=>array('table'=>1, 'td'=>1, 'tr'=>1), 'cellspacing'=>array('table'=>1), 'clear'=>array('br'=>1), 'compact'=>array('dl'=>1, 'ol'=>1, 'ul'=>1), 'height'=>array('td'=>1, 'th'=>1), 'hspace'=>array('img'=>1, 'object'=>1), 'language'=>array('script'=>1), 'name'=>array('a'=>1, 'form'=>1, 'iframe'=>1, 'img'=>1, 'map'=>1), 'noshade'=>array('hr'=>1), 'nowrap'=>array('td'=>1, 'th'=>1), 'size'=>array('hr'=>1), 'vspace'=>array('img'=>1, 'object'=>1), 'width'=>array('hr'=>1, 'pre'=>1, 'table'=>1, 'td'=>1, 'th'=>1));
465 static $eAD = array('a'=>1, 'br'=>1, 'caption'=>1, 'div'=>1, 'dl'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'map'=>1, 'object'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'script'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1, 'ul'=>1); 465 static $eAD = array('a'=>1, 'br'=>1, 'caption'=>1, 'div'=>1, 'dl'=>1, 'form'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'hr'=>1, 'iframe'=>1, 'img'=>1, 'input'=>1, 'legend'=>1, 'map'=>1, 'object'=>1, 'ol'=>1, 'p'=>1, 'pre'=>1, 'script'=>1, 'table'=>1, 'td'=>1, 'th'=>1, 'tr'=>1, 'ul'=>1);
466 $depTr = isset($eAD[$e]) ? 1 : 0; 466 $depTr = isset($eAD[$e]) ? 1 : 0;
467} 467}
468 468
469// attr name-vals 469// attr name-vals
470if(strpos($a, "\x01") !== false){$a = preg_replace('`\x01[^\x01]*\x01`', '', $a);} // No comment/CDATA sec 470if(strpos($a, "\x01") !== false){$a = preg_replace('`\x01[^\x01]*\x01`', '', $a);} // No comment/CDATA sec
471$mode = 0; $a = trim($a, ' /'); $aA = array(); 471$mode = 0; $a = trim($a, ' /'); $aA = array();
472while(strlen($a)){ 472while(strlen($a)){
473 $w = 0; 473 $w = 0;
474 switch($mode){ 474 switch($mode){
475 case 0: // Name 475 case 0: // Name
476 if(preg_match('`^[a-zA-Z][^\s=/]+`', $a, $m)){ 476 if(preg_match('`^[a-zA-Z][^\s=/]+`', $a, $m)){
477 $nm = strtolower($m[0]); 477 $nm = strtolower($m[0]);
478 $w = $mode = 1; $a = ltrim(substr_replace($a, '', 0, strlen($m[0]))); 478 $w = $mode = 1; $a = ltrim(substr_replace($a, '', 0, strlen($m[0])));
479 } 479 }
480 break; case 1: 480 break; case 1:
481 if($a[0] == '='){ // = 481 if($a[0] == '='){ // =
482 $w = 1; $mode = 2; $a = ltrim($a, '= '); 482 $w = 1; $mode = 2; $a = ltrim($a, '= ');
483 }else{ // No val 483 }else{ // No val
484 $w = 1; $mode = 0; $a = ltrim($a); 484 $w = 1; $mode = 0; $a = ltrim($a);
485 $aA[$nm] = ''; 485 $aA[$nm] = '';
486 } 486 }
487 break; case 2: // Val 487 break; case 2: // Val
488 if(preg_match('`^((?:"[^"]*")|(?:\'[^\']*\')|(?:\s*[^\s"\']+))(.*)`', $a, $m)){ 488 if(preg_match('`^((?:"[^"]*")|(?:\'[^\']*\')|(?:\s*[^\s"\']+))(.*)`', $a, $m)){
489 $a = ltrim($m[2]); $m = $m[1]; $w = 1; $mode = 0; 489 $a = ltrim($m[2]); $m = $m[1]; $w = 1; $mode = 0;
490 $aA[$nm] = trim(str_replace('<', '&lt;', ($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m)); 490 $aA[$nm] = trim(str_replace('<', '&lt;', ($m[0] == '"' or $m[0] == '\'') ? substr($m, 1, -1) : $m));
491 } 491 }
492 break; 492 break;
493 } 493 }
494 if($w == 0){ // Parse errs, deal with space, " & ' 494 if($w == 0){ // Parse errs, deal with space, " & '
495 $a = preg_replace('`^(?:"[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*`', '', $a); 495 $a = preg_replace('`^(?:"[^"]*("|$)|\'[^\']*(\'|$)|\S)*\s*`', '', $a);
496 $mode = 0; 496 $mode = 0;
497 } 497 }
498} 498}
499if($mode == 1){$aA[$nm] = '';} 499if($mode == 1){$aA[$nm] = '';}
500 500
501// clean attrs 501// clean attrs
502global $S; 502global $S;
503$rl = isset($S[$e]) ? $S[$e] : array(); 503$rl = isset($S[$e]) ? $S[$e] : array();
504$a = array(); $nfr = 0; $d = $C['deny_attribute']; 504$a = array(); $nfr = 0; $d = $C['deny_attribute'];
505foreach($aA as $k=>$v){ 505foreach($aA as $k=>$v){
506 if(((isset($d['*']) ? isset($d[$k]) : !isset($d[$k])) && (isset($aN[$k][$e]) or isset($aNU[$k]) or (isset($aNO[$k]) && !isset($d['on*'])) or (isset($aNA[$k]) && !isset($d['aria*'])) or (!isset($d['data*']) && preg_match('`data-((?!xml)[^:]+$)`', $k))) && !isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])){ 506 if(((isset($d['*']) ? isset($d[$k]) : !isset($d[$k])) && (isset($aN[$k][$e]) or isset($aNU[$k]) or (isset($aNO[$k]) && !isset($d['on*'])) or (isset($aNA[$k]) && !isset($d['aria*'])) or (!isset($d['data*']) && preg_match('`data-((?!xml)[^:]+$)`', $k))) && !isset($rl['n'][$k]) && !isset($rl['n']['*'])) or isset($rl[$k])){
507 if(isset($aNE[$k])){$v = $k;} 507 if(isset($aNE[$k])){$v = $k;}
508 elseif(!empty($lcase) && (($e != 'button' or $e != 'input') or $k == 'type')){ // Rather loose but ?not cause issues 508 elseif(!empty($lcase) && (($e != 'button' or $e != 'input') or $k == 'type')){ // Rather loose but ?not cause issues
509 $v = (isset($aNL[($v2 = strtolower($v))])) ? $v2 : $v; 509 $v = (isset($aNL[($v2 = strtolower($v))])) ? $v2 : $v;
510 } 510 }
511 if($k == 'style' && !$C['style_pass']){ 511 if($k == 'style' && !$C['style_pass']){
512 if(false !== strpos($v, '&#')){ 512 if(false !== strpos($v, '&#')){
513 static $sC = array('&#x20;'=>' ', '&#32;'=>' ', '&#x45;'=>'e', '&#69;'=>'e', '&#x65;'=>'e', '&#101;'=>'e', '&#x58;'=>'x', '&#88;'=>'x', '&#x78;'=>'x', '&#120;'=>'x', '&#x50;'=>'p', '&#80;'=>'p', '&#x70;'=>'p', '&#112;'=>'p', '&#x53;'=>'s', '&#83;'=>'s', '&#x73;'=>'s', '&#115;'=>'s', '&#x49;'=>'i', '&#73;'=>'i', '&#x69;'=>'i', '&#105;'=>'i', '&#x4f;'=>'o', '&#79;'=>'o', '&#x6f;'=>'o', '&#111;'=>'o', '&#x4e;'=>'n', '&#78;'=>'n', '&#x6e;'=>'n', '&#110;'=>'n', '&#x55;'=>'u', '&#85;'=>'u', '&#x75;'=>'u', '&#117;'=>'u', '&#x52;'=>'r', '&#82;'=>'r', '&#x72;'=>'r', '&#114;'=>'r', '&#x4c;'=>'l', '&#76;'=>'l', '&#x6c;'=>'l', '&#108;'=>'l', '&#x28;'=>'(', '&#40;'=>'(', '&#x29;'=>')', '&#41;'=>')', '&#x20;'=>':', '&#32;'=>':', '&#x22;'=>'"', '&#34;'=>'"', '&#x27;'=>"'", '&#39;'=>"'", '&#x2f;'=>'/', '&#47;'=>'/', '&#x2a;'=>'*', '&#42;'=>'*', '&#x5c;'=>'\\', '&#92;'=>'\\'); 513 static $sC = array('&#x20;'=>' ', '&#32;'=>' ', '&#x45;'=>'e', '&#69;'=>'e', '&#x65;'=>'e', '&#101;'=>'e', '&#x58;'=>'x', '&#88;'=>'x', '&#x78;'=>'x', '&#120;'=>'x', '&#x50;'=>'p', '&#80;'=>'p', '&#x70;'=>'p', '&#112;'=>'p', '&#x53;'=>'s', '&#83;'=>'s', '&#x73;'=>'s', '&#115;'=>'s', '&#x49;'=>'i', '&#73;'=>'i', '&#x69;'=>'i', '&#105;'=>'i', '&#x4f;'=>'o', '&#79;'=>'o', '&#x6f;'=>'o', '&#111;'=>'o', '&#x4e;'=>'n', '&#78;'=>'n', '&#x6e;'=>'n', '&#110;'=>'n', '&#x55;'=>'u', '&#85;'=>'u', '&#x75;'=>'u', '&#117;'=>'u', '&#x52;'=>'r', '&#82;'=>'r', '&#x72;'=>'r', '&#114;'=>'r', '&#x4c;'=>'l', '&#76;'=>'l', '&#x6c;'=>'l', '&#108;'=>'l', '&#x28;'=>'(', '&#40;'=>'(', '&#x29;'=>')', '&#41;'=>')', '&#x20;'=>':', '&#32;'=>':', '&#x22;'=>'"', '&#34;'=>'"', '&#x27;'=>"'", '&#39;'=>"'", '&#x2f;'=>'/', '&#47;'=>'/', '&#x2a;'=>'*', '&#42;'=>'*', '&#x5c;'=>'\\', '&#92;'=>'\\');
514 $v = strtr($v, $sC); 514 $v = strtr($v, $sC);
515 } 515 }
516 $v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'hl_prot', $v); 516 $v = preg_replace_callback('`(url(?:\()(?: )*(?:\'|"|&(?:quot|apos);)?)(.+?)((?:\'|"|&(?:quot|apos);)?(?: )*(?:\)))`iS', 'hl_prot', $v);
517 $v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v; 517 $v = !$C['css_expression'] ? preg_replace('`expression`i', ' ', preg_replace('`\\\\\S|(/|(%2f))(\*|(%2a))`i', ' ', $v)) : $v;
518 }elseif(isset($aNP[$k]) or isset($aNO[$k])){ 518 }elseif(isset($aNP[$k]) or isset($aNO[$k])){
519 $v = str_replace("­", ' ', (strpos($v, '&') !== false ? str_replace(array('&#xad;', '&#173;', '&shy;'), ' ', $v) : $v)); # double-quoted char: soft-hyphen; appears here as "­" or hyphen or something else depending on viewing software 519 $v = str_replace("­", ' ', (strpos($v, '&') !== false ? str_replace(array('&#xad;', '&#173;', '&shy;'), ' ', $v) : $v)); # double-quoted char: soft-hyphen; appears here as "­" or hyphen or something else depending on viewing software
520 if($k == 'srcset'){ 520 if($k == 'srcset'){
521 $v2 = ''; 521 $v2 = '';
522 foreach(explode(',', $v) as $k1=>$v1){ 522 foreach(explode(',', $v) as $k1=>$v1){
523 $v1 = explode(' ', ltrim($v1), 2); 523 $v1 = explode(' ', ltrim($v1), 2);
524 $k1 = isset($v1[1]) ? trim($v1[1]) : ''; 524 $k1 = isset($v1[1]) ? trim($v1[1]) : '';
525 $v1 = trim($v1[0]); 525 $v1 = trim($v1[0]);
526 if(isset($v1[0])){$v2 .= hl_prot($v1, $k). (empty($k1) ? '' : ' '. $k1). ', ';} 526 if(isset($v1[0])){$v2 .= hl_prot($v1, $k). (empty($k1) ? '' : ' '. $k1). ', ';}
527 } 527 }
528 $v = trim($v2, ', '); 528 $v = trim($v2, ', ');
529 } 529 }
530 if($k == 'itemtype'){ 530 if($k == 'itemtype'){
531 $v2 = ''; 531 $v2 = '';
532 foreach(explode(' ', $v) as $v1){ 532 foreach(explode(' ', $v) as $v1){
533 if(isset($v1[0])){$v2 .= hl_prot($v1, $k). ' ';} 533 if(isset($v1[0])){$v2 .= hl_prot($v1, $k). ' ';}
534 } 534 }
535 $v = trim($v2, ' '); 535 $v = trim($v2, ' ');
536 } 536 }
537 else{$v = hl_prot($v, $k);} 537 else{$v = hl_prot($v, $k);}
538 if($k == 'href'){ // X-spam 538 if($k == 'href'){ // X-spam
539 if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){ 539 if($C['anti_mail_spam'] && strpos($v, 'mailto:') === 0){
540 $v = str_replace('@', htmlspecialchars($C['anti_mail_spam']), $v); 540 $v = str_replace('@', htmlspecialchars($C['anti_mail_spam']), $v);
541 }elseif($C['anti_link_spam']){ 541 }elseif($C['anti_link_spam']){
542 $r1 = $C['anti_link_spam'][1]; 542 $r1 = $C['anti_link_spam'][1];
543 if(!empty($r1) && preg_match($r1, $v)){continue;} 543 if(!empty($r1) && preg_match($r1, $v)){continue;}
544 $r0 = $C['anti_link_spam'][0]; 544 $r0 = $C['anti_link_spam'][0];
545 if(!empty($r0) && preg_match($r0, $v)){ 545 if(!empty($r0) && preg_match($r0, $v)){
546 if(isset($a['rel'])){ 546 if(isset($a['rel'])){
547 if(!preg_match('`\bnofollow\b`i', $a['rel'])){$a['rel'] .= ' nofollow';} 547 if(!preg_match('`\bnofollow\b`i', $a['rel'])){$a['rel'] .= ' nofollow';}
548 }elseif(isset($aA['rel'])){ 548 }elseif(isset($aA['rel'])){
549 if(!preg_match('`\bnofollow\b`i', $aA['rel'])){$nfr = 1;} 549 if(!preg_match('`\bnofollow\b`i', $aA['rel'])){$nfr = 1;}
550 }else{$a['rel'] = 'nofollow';} 550 }else{$a['rel'] = 'nofollow';}
551 } 551 }
552 } 552 }
553 } 553 }
554 } 554 }
555 if(isset($rl[$k]) && is_array($rl[$k]) && ($v = hl_attrval($k, $v, $rl[$k])) === 0){continue;} 555 if(isset($rl[$k]) && is_array($rl[$k]) && ($v = hl_attrval($k, $v, $rl[$k])) === 0){continue;}
556 $a[$k] = str_replace('"', '&quot;', $v); 556 $a[$k] = str_replace('"', '&quot;', $v);
557 } 557 }
558} 558}
559if($nfr){$a['rel'] = isset($a['rel']) ? $a['rel']. ' nofollow' : 'nofollow';} 559if($nfr){$a['rel'] = isset($a['rel']) ? $a['rel']. ' nofollow' : 'nofollow';}
560 560
561// rqd attr 561// rqd attr
562static $eAR = array('area'=>array('alt'=>'area'), 'bdo'=>array('dir'=>'ltr'), 'command'=>array('label'=>''), 'form'=>array('action'=>''), 'img'=>array('src'=>'', 'alt'=>'image'), 'map'=>array('name'=>''), 'optgroup'=>array('label'=>''), 'param'=>array('name'=>''), 'style'=>array('scoped'=>''), 'textarea'=>array('rows'=>'10', 'cols'=>'50')); 562static $eAR = array('area'=>array('alt'=>'area'), 'bdo'=>array('dir'=>'ltr'), 'command'=>array('label'=>''), 'form'=>array('action'=>''), 'img'=>array('src'=>'', 'alt'=>'image'), 'map'=>array('name'=>''), 'optgroup'=>array('label'=>''), 'param'=>array('name'=>''), 'style'=>array('scoped'=>''), 'textarea'=>array('rows'=>'10', 'cols'=>'50'));
563if(isset($eAR[$e])){ 563if(isset($eAR[$e])){
564 foreach($eAR[$e] as $k=>$v){ 564 foreach($eAR[$e] as $k=>$v){
565 if(!isset($a[$k])){$a[$k] = isset($v[0]) ? $v : $k;} 565 if(!isset($a[$k])){$a[$k] = isset($v[0]) ? $v : $k;}
566 } 566 }
567} 567}
568 568
569// depr attr 569// depr attr
570if($depTr){ 570if($depTr){
571 $c = array(); 571 $c = array();
572 foreach($a as $k=>$v){ 572 foreach($a as $k=>$v){
573 if($k == 'style' or !isset($aND[$k][$e])){continue;} 573 if($k == 'style' or !isset($aND[$k][$e])){continue;}
574 $v = str_replace(array('\\', ':', ';', '&#'), '', $v); 574 $v = str_replace(array('\\', ':', ';', '&#'), '', $v);
575 if($k == 'align'){ 575 if($k == 'align'){
576 unset($a['align']); 576 unset($a['align']);
577 if($e == 'img' && ($v == 'left' or $v == 'right')){$c[] = 'float: '. $v;} 577 if($e == 'img' && ($v == 'left' or $v == 'right')){$c[] = 'float: '. $v;}
578 elseif(($e == 'div' or $e == 'table') && $v == 'center'){$c[] = 'margin: auto';} 578 elseif(($e == 'div' or $e == 'table') && $v == 'center'){$c[] = 'margin: auto';}
579 else{$c[] = 'text-align: '. $v;} 579 else{$c[] = 'text-align: '. $v;}
580 }elseif($k == 'bgcolor'){ 580 }elseif($k == 'bgcolor'){
581 unset($a['bgcolor']); 581 unset($a['bgcolor']);
582 $c[] = 'background-color: '. $v; 582 $c[] = 'background-color: '. $v;
583 }elseif($k == 'border'){ 583 }elseif($k == 'border'){
584 unset($a['border']); $c[] = "border: {$v}px"; 584 unset($a['border']); $c[] = "border: {$v}px";
585 }elseif($k == 'bordercolor'){ 585 }elseif($k == 'bordercolor'){
586 unset($a['bordercolor']); $c[] = 'border-color: '. $v; 586 unset($a['bordercolor']); $c[] = 'border-color: '. $v;
587 }elseif($k == 'cellspacing'){ 587 }elseif($k == 'cellspacing'){
588 unset($a['cellspacing']); $c[] = "border-spacing: {$v}px"; 588 unset($a['cellspacing']); $c[] = "border-spacing: {$v}px";
589 }elseif($k == 'clear'){ 589 }elseif($k == 'clear'){
590 unset($a['clear']); $c[] = 'clear: '. ($v != 'all' ? $v : 'both'); 590 unset($a['clear']); $c[] = 'clear: '. ($v != 'all' ? $v : 'both');
591 }elseif($k == 'compact'){ 591 }elseif($k == 'compact'){
592 unset($a['compact']); $c[] = 'font-size: 85%'; 592 unset($a['compact']); $c[] = 'font-size: 85%';
593 }elseif($k == 'height' or $k == 'width'){ 593 }elseif($k == 'height' or $k == 'width'){
594 unset($a[$k]); $c[] = $k. ': '. ($v[0] != '*' ? $v. (ctype_digit($v) ? 'px' : '') : 'auto'); 594 unset($a[$k]); $c[] = $k. ': '. ($v[0] != '*' ? $v. (ctype_digit($v) ? 'px' : '') : 'auto');
595 }elseif($k == 'hspace'){ 595 }elseif($k == 'hspace'){
596 unset($a['hspace']); $c[] = "margin-left: {$v}px; margin-right: {$v}px"; 596 unset($a['hspace']); $c[] = "margin-left: {$v}px; margin-right: {$v}px";
597 }elseif($k == 'language' && !isset($a['type'])){ 597 }elseif($k == 'language' && !isset($a['type'])){
598 unset($a['language']); 598 unset($a['language']);
599 $a['type'] = 'text/'. strtolower($v); 599 $a['type'] = 'text/'. strtolower($v);
600 }elseif($k == 'name'){ 600 }elseif($k == 'name'){
601 if($C['no_deprecated_attr'] == 2 or ($e != 'a' && $e != 'map')){unset($a['name']);} 601 if($C['no_deprecated_attr'] == 2 or ($e != 'a' && $e != 'map')){unset($a['name']);}
602 if(!isset($a['id']) && !preg_match('`\W`', $v)){$a['id'] = $v;} 602 if(!isset($a['id']) && !preg_match('`\W`', $v)){$a['id'] = $v;}
603 }elseif($k == 'noshade'){ 603 }elseif($k == 'noshade'){
604 unset($a['noshade']); $c[] = 'border-style: none; border: 0; background-color: gray; color: gray'; 604 unset($a['noshade']); $c[] = 'border-style: none; border: 0; background-color: gray; color: gray';
605 }elseif($k == 'nowrap'){ 605 }elseif($k == 'nowrap'){
606 unset($a['nowrap']); $c[] = 'white-space: nowrap'; 606 unset($a['nowrap']); $c[] = 'white-space: nowrap';
607 }elseif($k == 'size'){ 607 }elseif($k == 'size'){
608 unset($a['size']); $c[] = 'size: '. $v. 'px'; 608 unset($a['size']); $c[] = 'size: '. $v. 'px';
609 }elseif($k == 'vspace'){ 609 }elseif($k == 'vspace'){
610 unset($a['vspace']); $c[] = "margin-top: {$v}px; margin-bottom: {$v}px"; 610 unset($a['vspace']); $c[] = "margin-top: {$v}px; margin-bottom: {$v}px";
611 } 611 }
612 } 612 }
613 if(count($c)){ 613 if(count($c)){
614 $c = implode('; ', $c); 614 $c = implode('; ', $c);
615 $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;'). '; '. $c. ';': $c. ';'; 615 $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;'). '; '. $c. ';': $c. ';';
616 } 616 }
617} 617}
618// unique ID 618// unique ID
619if($C['unique_ids'] && isset($a['id'])){ 619if($C['unique_ids'] && isset($a['id'])){
620 if(preg_match('`\s`', ($id = $a['id'])) or (isset($GLOBALS['hl_Ids'][$id]) && $C['unique_ids'] == 1)){unset($a['id']); 620 if(preg_match('`\s`', ($id = $a['id'])) or (isset($GLOBALS['hl_Ids'][$id]) && $C['unique_ids'] == 1)){unset($a['id']);
621 }else{ 621 }else{
622 while(isset($GLOBALS['hl_Ids'][$id])){$id = $C['unique_ids']. $id;} 622 while(isset($GLOBALS['hl_Ids'][$id])){$id = $C['unique_ids']. $id;}
623 $GLOBALS['hl_Ids'][($a['id'] = $id)] = 1; 623 $GLOBALS['hl_Ids'][($a['id'] = $id)] = 1;
624 } 624 }
625} 625}
626// xml:lang 626// xml:lang
627if($C['xml:lang'] && isset($a['lang'])){ 627if($C['xml:lang'] && isset($a['lang'])){
628 $a['xml:lang'] = isset($a['xml:lang']) ? $a['xml:lang'] : $a['lang']; 628 $a['xml:lang'] = isset($a['xml:lang']) ? $a['xml:lang'] : $a['lang'];
629 if($C['xml:lang'] == 2){unset($a['lang']);} 629 if($C['xml:lang'] == 2){unset($a['lang']);}
630} 630}
631// for transformed tag 631// for transformed tag
632if(!empty($trt)){ 632if(!empty($trt)){
633 $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;'). '; '. $trt : $trt; 633 $a['style'] = isset($a['style']) ? rtrim($a['style'], ' ;'). '; '. $trt : $trt;
634} 634}
635// return with empty ele / 635// return with empty ele /
636if(empty($C['hook_tag'])){ 636if(empty($C['hook_tag'])){
637 $aA = ''; 637 $aA = '';
638 foreach($a as $k=>$v){$aA .= " {$k}=\"{$v}\"";} 638 foreach($a as $k=>$v){$aA .= " {$k}=\"{$v}\"";}
639 return "<{$e}{$aA}". (isset($eE[$e]) ? ' /' : ''). '>'; 639 return "<{$e}{$aA}". (isset($eE[$e]) ? ' /' : ''). '>';
640} 640}
641else{return $C['hook_tag']($e, $a);} 641else{return $C['hook_tag']($e, $a);}
642} 642}
643 643
/**
 * Replace a deprecated presentational element with a current one.
 *
 * @param string $e Element name; overwritten in place with the replacement name
 *                  (or with 0 when $t is 2 and no replacement is known).
 * @param string $a Raw attribute string; for `font`, the color/size/face attributes
 *                  that were converted are cut out of it in place.
 * @param int    $t When 2, an element without a known replacement is rejected.
 * @return string|int CSS declarations equivalent to the removed markup ('' when there
 *                    are none), or 0 when the element is rejected.
 */
function hl_tag2(&$e, &$a, $t=1){
// transform tag
if($e == 'big'){$e = 'span'; return 'font-size: larger;';}
if($e == 's' or $e == 'strike'){$e = 'span'; return 'text-decoration: line-through;';}
if($e == 'tt'){$e = 'code'; return '';}
if($e == 'center'){$e = 'div'; return 'text-align: center;';}
// font "size" attribute values mapped to CSS font-size values
static $fs = array('0'=>'xx-small', '1'=>'xx-small', '2'=>'small', '3'=>'medium', '4'=>'large', '5'=>'x-large', '6'=>'xx-large', '7'=>'300%', '-1'=>'smaller', '-2'=>'60%', '+1'=>'larger', '+2'=>'150%', '+3'=>'200%', '+4'=>'300%');
if($e == 'font'){
  $a2 = '';
  // pull out every color/size attribute (quoted or bare) and turn it into CSS
  while(preg_match('`(^|\s)(color|size)\s*=\s*(\'|")?(.+?)(\\3|\s|$)`i', $a, $m)){
    $a = str_replace($m[0], ' ', $a);
    if(strtolower($m[2]) == 'color'){
      // characters that could end the declaration or the style attribute become a single quote
      $a2 .= ' color: '. str_replace(array('"', ';', ':'), '\'', trim($m[4])). ';';
    }else{
      // sizes missing from the map are dropped silently
      $size = trim($m[4]);
      if(isset($fs[$size])){$a2 .= ' font-size: '. $fs[$size]. ';';}
    }
  }
  // same for face: first try a quoted value, then an unquoted one
  while(preg_match('`(^|\s)face\s*=\s*(\'|")?([^=]+?)\\2`i', $a, $m) or preg_match('`(^|\s)face\s*=(\s*)(\S+)`i', $a, $m)){
    $a = str_replace($m[0], ' ', $a);
    $a2 .= ' font-family: '. str_replace(array('"', ';', ':'), '\'', trim($m[3])). ';';
  }
  $e = 'span'; return ltrim(str_replace('<', '', $a2));
}
if($e == 'acronym'){$e = 'abbr'; return '';}
if($e == 'dir'){$e = 'ul'; return '';}
// no replacement known: reject in mode 2, otherwise leave the element as it is
if($t == 2){$e = 0; return 0;}
return '';
}
668 668
/**
 * Compact or pretty-print an HTML fragment.
 *
 * @param string     $t Markup to format.
 * @param int|string $w Formatting spec. -1 only compacts whitespace. Otherwise the value is
 *                      scanned for: 't' (indent with tabs instead of spaces), the first digit
 *                      (how many indent characters make one level; default 1 tab or 2 spaces),
 *                      a digit 1-9 directly after 't' or 's' (starting indent level), and
 *                      'r' (line breaks become "\r", or "\r\n" when 'n' is present too).
 * @param string     $p Name of the element the fragment is placed in.
 * @return string Formatted markup.
 */
function hl_tidy($t, $w, $p){
// tidy/compact HTM
// nothing is touched when the parent keeps whitespace significant; the list ends with a comma
// so that its last entry ('textarea') can match the "$p," needle like the other entries do
if(strpos(' pre,script,textarea,', "$p,")){return $t;}
if(!function_exists('hl_aux2')){function hl_aux2($m){
  // hide markup and whitespace inside protected content behind control characters
  return $m[1]. str_replace(array("<", ">", "\n", "\r", "\t", ' '), array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), $m[3]). $m[4];
}}
// protect CDATA, comments and pre/script/textarea content first, then collapse whitespace
$t = preg_replace(array('`(<\w[^>]*(?<!/)>)\s+`', '`\s+`', '`(<\w[^>]*(?<!/)>) `'), array(' $1', ' ', '$1'), preg_replace_callback(array('`(<(!\[CDATA\[))(.+?)(\]\]>)`sm', '`(<(!--))(.+?)(-->)`sm', '`(<(pre|script|textarea)[^>]*?>)(.+?)(</\2>)`sm'), 'hl_aux2', $t));
if(($w = strtolower($w)) == -1){
  // compact mode: restore the protected characters and stop
  return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t);
}
$s = strpos(" $w", 't') ? "\t" : ' ';
$s = preg_match('`\d`', $w, $m) ? str_repeat($s, $m[0]) : str_repeat($s, ($s == "\t" ? 1 : 2));
$N = preg_match('`[ts]([1-9])`', $w, $m) ? $m[1] : 0;
// element groups, each with its own line-break placement (see the echo statements below)
$a = array('br'=>1);
$b = array('button'=>1, 'command'=>1, 'input'=>1, 'option'=>1, 'param'=>1, 'track'=>1);
$c = array('audio'=>1, 'canvas'=>1, 'caption'=>1, 'dd'=>1, 'dt'=>1, 'figcaption'=>1, 'h1'=>1, 'h2'=>1, 'h3'=>1, 'h4'=>1, 'h5'=>1, 'h6'=>1, 'isindex'=>1, 'label'=>1, 'legend'=>1, 'li'=>1, 'object'=>1, 'p'=>1, 'pre'=>1, 'style'=>1, 'summary'=>1, 'td'=>1, 'textarea'=>1, 'th'=>1, 'video'=>1);
$d = array('address'=>1, 'article'=>1, 'aside'=>1, 'blockquote'=>1, 'center'=>1, 'colgroup'=>1, 'datalist'=>1, 'details'=>1, 'dir'=>1, 'div'=>1, 'dl'=>1, 'fieldset'=>1, 'figure'=>1, 'footer'=>1, 'form'=>1, 'header'=>1, 'hgroup'=>1, 'hr'=>1, 'iframe'=>1, 'main'=>1, 'map'=>1, 'menu'=>1, 'nav'=>1, 'noscript'=>1, 'ol'=>1, 'optgroup'=>1, 'rbc'=>1, 'rtc'=>1, 'ruby'=>1, 'script'=>1, 'section'=>1, 'select'=>1, 'table'=>1, 'tbody'=>1, 'tfoot'=>1, 'thead'=>1, 'tr'=>1, 'ul'=>1);
$T = explode('<', $t);
$X = 1;
// each pass renders the whole fragment into an output buffer; a pass is thrown away and
// restarted one level deeper when a closing tag would push the indent level below zero
while($X){
  $n = $N;
  $t = $T;
  ob_start();
  if(isset($d[$p])){echo str_repeat($s, ++$n);}
  echo ltrim(array_shift($t));
  for($i=-1, $j=count($t); ++$i<$j;){
    // split "tag>text"; a piece without '>' has no trailing text
    $q = explode('>', $t[$i]);
    $e = $q[0];
    $r = isset($q[1]) ? $q[1] : '';
    $e0 = isset($e[0]) ? $e[0] : '';
    // $x: 0 closing tag, 1 self-closed tag, 2 opening tag, -1 '<!' construct
    $x = $e0 == '/' ? 0 : (substr($e, -1) == '/' ? 1 : ($e0 != '!' ? 2 : -1));
    // $y: element name, or 0 for '<!' constructs
    $y = !$x ? ltrim($e, '/') : ($x > 0 ? substr($e, 0, strcspn($e, ' ')) : 0);
    $e = "<$e>";
    if(isset($d[$y])){
      if(!$x){
        if($n){echo "\n", str_repeat($s, --$n), "$e\n", str_repeat($s, $n);}
        else{++$N; ob_end_clean(); continue 2;}
      }
      else{echo "\n", str_repeat($s, $n), "$e\n", str_repeat($s, ($x != 1 ? ++$n : $n));}
      echo $r; continue;
    }
    $f = "\n". str_repeat($s, $n);
    if(isset($c[$y])){
      if(!$x){echo $e, $f, $r;}
      else{echo $f, $e, $r;}
    }elseif(isset($b[$y])){echo $f, $e, $r;
    }elseif(isset($a[$y])){echo $e, $f, $r;
    }elseif(!$y){echo $f, $e, $f, $r;
    }else{echo $e, $r;}
  }
  $X = 0;
}
// drop blank lines and spaces that touch a line break
$t = str_replace(array("\n ", " \n"), "\n", preg_replace('`[\n]\s*?[\n]+`', "\n", ob_get_contents()));
ob_end_clean();
if(($l = strpos(" $w", 'r') ? (strpos(" $w", 'n') ? "\r\n" : "\r") : 0)){
  $t = str_replace("\n", $l, $t);
}
return str_replace(array("\x01", "\x02", "\x03", "\x04", "\x05", "\x07"), array('<', '>', "\n", "\r", "\t", ' '), $t);
}
725 725
/**
 * Report the htmLawed release this file belongs to.
 *
 * @return string Version number.
 */
function hl_version(){
// version
return '1.2.5';
}
diff --git a/lib/htmlawed/htmLawedTest.php b/lib/htmlawed/htmLawedTest.php
index 6475bc8..dfbfb27 100755
--- a/lib/htmlawed/htmLawedTest.php
+++ b/lib/htmlawed/htmLawedTest.php
@@ -1,677 +1,677 @@
1<?php 1<?php
2 2
3/* 3/*
4htmLawedTest.php, 17 May 2017 4htmLawedTest.php, 17 May 2017
5To test htmLawed 5To test htmLawed
6Copyright Santosh Patnaik 6Copyright Santosh Patnaik
7Dual licensed with LGPL 3 and GPL 2+ 7Dual licensed with LGPL 3 and GPL 2+
8A PHP Labware internal utility - www.bioinformatics.org/phplabware/internal_utilities/htmLawed 8A PHP Labware internal utility - www.bioinformatics.org/phplabware/internal_utilities/htmLawed
9 9
10Test htmLawed; user provides text input; input and processed input are shown as highlighted code and rendered HTML; also shown are execution time and peak memory usage 10Test htmLawed; user provides text input; input and processed input are shown as highlighted code and rendered HTML; also shown are execution time and peak memory usage
11*/ 11*/
12 12
// config
$_errs = 0; // display PHP errors
$_limit = 12000; // input character limit
$_hlimit = 2000; // input character limit for showing hexdumps
$_hilite = 1; // 0 turns off slow Javascript-based code-highlighting, e.g., if $_limit is high
$_w3c_validate = 1; // 1 to show buttons to send input/output to w3c validator
$_sid = 'sid'; // session name; alphanum.
$_slife = 30; // session life in min.

// errors
error_reporting(E_ALL | (defined('E_STRICT') ? E_STRICT : 0));
ini_set('display_errors', $_errs);

// session
session_name($_sid);
session_cache_limiter('private');
session_cache_expire($_slife);
ini_set('session.gc_maxlifetime', $_slife * 60);
ini_set('session.use_only_cookies', 1);
ini_set('session.cookie_lifetime', 0);
session_start();
if(!isset($_SESSION['token'])){
  // 32 hex characters either way; the CSPRNG is used wherever PHP offers one
  $_SESSION['token'] = function_exists('random_bytes') ? bin2hex(random_bytes(16)) : md5(uniqid(rand(), 1));
}

// slashes
// the magic-quotes functions no longer exist in current PHP, so each call is guarded
// instead of ending the page with an undefined-function error
if(function_exists('get_magic_quotes_gpc') && get_magic_quotes_gpc()){
  foreach($_POST as $k => $v){
    // array-valued fields are passed through untouched; stripslashes() takes strings only
    $_POST[$k] = is_string($v) ? stripslashes($v) : $v;
  }
  ini_set('magic_quotes_gpc', 0);
}
if(function_exists('get_magic_quotes_runtime') && function_exists('set_magic_quotes_runtime') && get_magic_quotes_runtime()){
  set_magic_quotes_runtime(0);
}

$_POST['enc'] = (isset($_POST['enc']) and preg_match('`^[-\w]+$`', $_POST['enc'])) ? $_POST['enc'] : 'utf-8';

// token for anti-CSRF
// the posted session id is read through isset() so that a plain GET request (no such field)
// does not raise an undefined-index notice; the comparisons themselves are unchanged
if(count($_POST)){
  if((empty($_GET['pre']) and ((!empty($_POST['token']) and !empty($_SESSION['token']) and $_POST['token'] != $_SESSION['token']) or empty($_POST[$_sid]) or $_POST[$_sid] != session_id() or empty($_COOKIE[$_sid]) or $_COOKIE[$_sid] != session_id())) or ((isset($_POST[$_sid]) ? $_POST[$_sid] : null) != session_id())){
    // request failed the check: discard everything that was posted
    $_POST = array('enc'=>'utf-8');
  }
}
if(empty($_GET['pre'])){
  // new token and new session id for every regular page load
  $_SESSION['token'] = function_exists('random_bytes') ? bin2hex(random_bytes(16)) : md5(uniqid(rand(), 1));
  $token = $_SESSION['token'];
  session_regenerate_id(1);
}

// compress
if(function_exists('gzencode') && isset($_SERVER['HTTP_ACCEPT_ENCODING']) && preg_match('`gzip|deflate`i', $_SERVER['HTTP_ACCEPT_ENCODING']) && !ini_get('zlib.output_compression')){
  ob_start('ob_gzhandler');
}

// HTM for unprocessed
// NOTE(review): the posted markup is echoed unescaped on purpose (this view renders it);
// the anti-CSRF check above is the only gate in front of it
if(isset($_POST['inputH'])){
  echo '<html><head><title>htmLawed test: HTML view of unprocessed input</title></head><body style="margin:0; padding: 0;"><p style="background-color: black; color: white; padding: 2px;">&nbsp; Rendering of raw/unprocessed input without an HTML doctype or charset declaration &nbsp; &nbsp; <small><a style="color: white; text-decoration: none;" href="1" onclick="javascript:window.close(this); return false;">close window</a> | <a style="color: white; text-decoration: none;" href="htmLawedTest.php" onclick="javascript: window.open(\'htmLawedTest.php\', \'hlmain\'); window.close(this); return false;">htmLawed test page</a></small></p><div>', $_POST['inputH'], '</div></body></html>';
  exit;
}

// HTM for processed
// the title names the processed input; it used to repeat the "unprocessed" title of the view above
if(isset($_POST['outputH'])){
  echo '<html><head><title>htmLawed test: HTML view of processed input</title></head><body style="margin:0; padding: 0;"><p style="background-color: black; color: white; padding: 2px;">&nbsp; Rendering of filtered/processed input without an HTML doctype or charset declaration &nbsp; &nbsp; <small><a style="color: white; text-decoration: none;" href="1" onclick="javascript:window.close(this); return false;">close window</a> | <a style="color: white; text-decoration: none;" href="htmLawedTest.php" onclick="javascript: window.open(\'htmLawedTest.php\', \'hlmain\'); window.close(this); return false;">htmLawed test page</a></small></p><div>', $_POST['outputH'], '</div></body></html>';
  exit;
}

// main
$_POST['text'] = isset($_POST['text']) ? $_POST['text'] : 'text to process; < '. $_limit. ' characters'. ($_hlimit ? ' (for binary hexdump view, < '. $_hlimit. ')' : '');
// process only when the form carried the session id and the text is non-empty and within the limit
$do = (!empty($_POST[$_sid]) && isset($_POST['text'][0]) && !isset($_POST['text'][$_limit])) ? 1 : 0;
$limit_exceeded = isset($_POST['text'][$_limit]) ? 1 : 0;
$pre_mem = memory_get_usage();
$validation = (!empty($_POST[$_sid]) and isset($_POST['w3c_validate'][0])) ? 1 : 0;
include './htmLawed.php';
87 87
/**
 * Prepare text for display as HTML source code.
 *
 * Escapes &, < and >, marks every line break with a visible "newline" span followed by
 * <br />, and keeps leading spaces and runs of spaces visible with &nbsp;.
 *
 * @param string $t Raw text.
 * @return string HTML-safe text, starting with a line break.
 */
function format($t){
  // order matters: line endings are unified before "\n" is replaced, and & is escaped before < and >
  $t = "\n". str_replace(array("\t", "\r\n", "\r", '&', '<', '>', "\n"), array(' ', "\n", "\n", '&amp;', '&lt;', '&gt;', "<span class=\"newline\">&#172;</span><br />\n"), $t);
  // only pairs of spaces need &nbsp;; matching single spaces would double the spacing between
  // words and also break the <span class="newline"> markup inserted just above
  return str_replace(array('<br />', "\n ", '  '), array("\n<br />\n", "\n&nbsp;", ' &nbsp;'), $t);
}
92 92
/**
 * Render binary data as an HTML <pre> hex dump, 16 bytes per row.
 *
 * Each row shows the offset, the bytes in hexadecimal and the same bytes as text.
 *
 * @param string $d Data to dump.
 * @return string '<pre>...</pre>' markup; the element is empty for empty input.
 */
function hexdump($d){
// Mainly by Aidan Lister <aidan@php.net>, Peter Waller <iridum@php.net>
  $hexi = '';
  $ascii = '';
  $o = '<pre>';
  $offset = 0;
  $len = strlen($d);
  for($i=$j=0; $i<$len; $i++){
    $byte = ord($d[$i]);
    // Convert to hexadecimal
    $hexi .= sprintf("%02X ", $byte);
    // Replace non-viewable bytes with '.'; this includes bytes above 126, because a lone
    // byte of a multi-byte character is not valid input for htmlspecialchars() and would
    // otherwise leave a gap in the text column
    if($byte >= 32 && $byte <= 126){
      $ascii .= htmlspecialchars($d[$i]);
    }else{
      $ascii .= '.';
    }
    // Add extra column spacing after the 8th byte of a row
    if($j == 7){
      $hexi .= ' ';
      $ascii .= ' ';
    }
    // Add row when 16 bytes are collected or the data ends
    if(++$j == 16 || $i == $len-1){
      // Join the hexi / ascii output
      $o .= sprintf("%04X %-49s %s", $offset, $hexi, $ascii);
      // Reset vars
      $hexi = $ascii = '';
      $offset += 16;
      $j = 0;
      // Add newline between rows, not after the last one
      if($i !== $len-1){
        $o .= "\n";
      }
    }
  }
  return $o. '</pre>';
}
135?> 135?>
136 136
137<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" 137<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
138 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> 138 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
139<html lang="en" xml:lang="en"> 139<html lang="en" xml:lang="en">
140<head> 140<head>
141<meta http-equiv="content-type" content="text/html; charset=utf-8" /> 141<meta http-equiv="content-type" content="text/html; charset=utf-8" />
142<meta name="description" content="htmLawed <?php echo hl_version();?> test page" /> 142<meta name="description" content="htmLawed <?php echo hl_version();?> test page" />
143<style type="text/css"><!--/*--><![CDATA[/*><!--*/ 143<style type="text/css"><!--/*--><![CDATA[/*><!--*/
144a, a.resizer{text-decoration:none;} 144a, a.resizer{text-decoration:none;}
145a:hover, a.resizer:hover{color:red;} 145a:hover, a.resizer:hover{color:red;}
146a.resizer{color:green; float:right;} 146a.resizer{color:green; float:right;}
147body{background-color:#efefef;} 147body{background-color:#efefef;}
148body, button, div, html, input, p{font-size:13px; font-family:'Lucida grande', Verdana, Arial, Helvetica, sans-serif;} 148body, button, div, html, input, p{font-size:13px; font-family:'Lucida grande', Verdana, Arial, Helvetica, sans-serif;}
149button, input{font-size: 85%;} 149button, input{font-size: 85%;}
150div.help{border-top: 1px dotted gray; margin-top: 15px; padding-top: 15px; color:#999999;} 150div.help{border-top: 1px dotted gray; margin-top: 15px; padding-top: 15px; color:#999999;}
151#inputC, #inputD, #inputF, #inputR, #outputD, #outputF, #outputR, #settingF, #diff{display:block;} 151#inputC, #inputD, #inputF, #inputR, #outputD, #outputF, #outputR, #settingF, #diff{display:block;}
152#inputC, #settingF{background-color:white; border:1px gray solid; padding:3px;} 152#inputC, #settingF{background-color:white; border:1px gray solid; padding:3px;}
153#inputC li{margin: 0; padding: 0;} 153#inputC li{margin: 0; padding: 0;}
154#inputC ul{margin: 0; padding: 0; margin-left: 14px;} 154#inputC ul{margin: 0; padding: 0; margin-left: 14px;}
155#inputC input{margin: 0; margin-left: 2px; margin-right: 2px; padding: 1px; vertical-align: middle;} 155#inputC input{margin: 0; margin-left: 2px; margin-right: 2px; padding: 1px; vertical-align: middle;}
156#inputD{overflow:auto; background-color:#ffff99; border:1px #cc9966 solid; padding:3px;} 156#inputD{overflow:auto; background-color:#ffff99; border:1px #cc9966 solid; padding:3px;}
157#inputR{overflow:auto; background-color:#ffffcc; border:1px #ffcc99 solid; padding:3px;} 157#inputR{overflow:auto; background-color:#ffffcc; border:1px #ffcc99 solid; padding:3px;}
158#inputC, #settingF, #inputD, #inputR, #outputD, #outputR, #diff, textarea{font-size:100%; font-family:'Bitstream vera sans mono', 'courier new', 'courier', monospace;} 158#inputC, #settingF, #inputD, #inputR, #outputD, #outputR, #diff, textarea{font-size:100%; font-family:'Bitstream vera sans mono', 'courier new', 'courier', monospace;}
159#outputD{overflow:auto; background-color: #99ffcc; border:1px #66cc99 solid; padding:3px;} 159#outputD{overflow:auto; background-color: #99ffcc; border:1px #66cc99 solid; padding:3px;}
160#diff{overflow:auto; background-color: white; border:1px #dcdcdc solid; padding:3px;} 160#diff{overflow:auto; background-color: white; border:1px #dcdcdc solid; padding:3px;}
161#outputR{overflow:auto; background-color: #ccffcc; border:1px #99cc99 solid; padding:3px;} 161#outputR{overflow:auto; background-color: #ccffcc; border:1px #99cc99 solid; padding:3px;}
162span.cmtcdata{color: orange;} 162span.cmtcdata{color: orange;}
163span.ctag{color:red;} 163span.ctag{color:red;}
164span.ent{border-bottom:1px dotted #999999;} 164span.ent{border-bottom:1px dotted #999999;}
165span.etag{color:purple;} 165span.etag{color:purple;}
166span.help{color:#999999;} 166span.help{color:#999999;}
167span.newline{color:#dcdcdc;} 167span.newline{color:#dcdcdc;}
168span.notice{color:green;} 168span.notice{color:green;}
169span.otag{color:blue;} 169span.otag{color:blue;}
170#topmost{margin:auto; width:98%;} 170#topmost{margin:auto; width:98%;}
171/*]]>*/--></style> 171/*]]>*/--></style>
172<script type="text/javascript"><!--//--><![CDATA[//><!-- 172<script type="text/javascript"><!--//--><![CDATA[//><!--
173window.name = 'hlmain'; 173window.name = 'hlmain';
// Syntax-highlights the text inside the element with id `i` by running a
// series of pattern passes over it; each pass wraps matches in a
// <span> carrying the given class. When the server-side $_hilite flag is off,
// the PHP guard below emits `return;` and the function becomes a no-op.
function hl(i){
 <?php if(!$_hilite){echo 'return;'; }?>
 var box = document.getElementById(i);
 if(!box){return;}
 // Order matters: closing tags, then empty tags, then opening tags, then
 // entities, and finally comments and CDATA sections.
 var passes = [
  ['</[a-z1-6]+>', 'ctag'],
  ['<[a-z]+(?:[^>]*)/>', 'etag'],
  ['<[a-z1-6]+(?:[^>]*)>', 'otag'],
  ['&[#a-z0-9]+;', 'ent'],
  ['<!(?:(?:--(?:.|\n)*?--)|(?:\\[CDATA\\[(?:.|\n)*?\\]\\]))>', 'cmtcdata']
 ];
 for(var n = 0; n < passes.length; n++){
  run(box, passes[n][0], passes[n][1]);
 }
}
// Submits the main test form (id 'testform') after appending a hidden field
// named after the PHP session name whose value is read from the cookie of the
// same name.
function sndProc(){
 var form = document.getElementById('testform');
 if(!form){return;}
 // Session name as emitted by the server; used for field name, id and cookie.
 var sidName = '<?php echo htmlspecialchars($_sid); ?>';
 var field = document.createElement('input');
 field.type = 'hidden';
 field.name = sidName;
 field.id = sidName;
 field.value = readCookie(sidName);
 form.appendChild(field);
 form.submit();
}
// Returns the raw (undecoded) value of the cookie named `n`, or null when
// document.cookie holds no entry starting with `n=`.
function readCookie(n){
 var prefix = n + '=';
 var entries = document.cookie.split(';');
 for(var idx = 0; idx < entries.length; idx++){
  // Entries after the first are separated by '; ', so drop leading spaces.
  var entry = entries[idx].replace(/^ +/, '');
  if(entry.indexOf(prefix) == 0){
   return entry.slice(prefix.length);
  }
 }
 return null;
}
// Wraps text matching pattern `q` in <span class="c"> elements.
//   e: a text node, or an element whose direct children are scanned
//   q: regular-expression source string (compiled without flags)
//   c: class name given to each created span
// Only the first match of each text node is wrapped, but the remainder is
// split off into a new sibling text node that the scan reaches later, so
// successive matches among the direct children are all handled. For the 'ent'
// pass the first child of element children is examined as well.
function run(e, q, c){
 var re = new RegExp(q);

 // Splits `textNode` around the first match of `re` and moves the matched
 // part into a new span inserted at the same position.
 function wrapFirstMatch(textNode){
  var hit = re.exec(textNode.data);
  if(!hit){return;}
  var matched = textNode.splitText(hit.index);
  matched.splitText(hit[0].length);
  var span = textNode.ownerDocument.createElement('span');
  textNode.parentNode.replaceChild(span, matched);
  span.className = c;
  span.appendChild(matched);
 }

 // A childless node is treated as the text itself.
 if(e.firstChild == null){
  wrapFirstMatch(e);
 }
 for(var kid = e.firstChild; kid != null; kid = kid.nextSibling){
  if(kid.nodeType == 3){
   wrapFirstMatch(kid);
  }
  else if(c == 'ent' && kid.nodeType == 1 && kid.firstChild){
   wrapFirstMatch(kid.firstChild);
  }
 }
}
// Shows or hides the element with id `i`.
// With a style object: 'none' becomes 'block', anything else becomes 'none'.
// Without one (legacy layer objects): flips `visibility` between 'hidden'
// and 'show', leaving any other value untouched.
function toggle(i){
 var el = document.getElementById(i);
 if(!el){return;}
 if(!el.style){
  var vis = el.visibility;
  if(vis == 'hidden'){el.visibility = 'show';}
  else if(vis == 'show'){el.visibility = 'hidden';}
  return;
 }
 el.style.display = (el.style.display == 'none') ? 'block' : 'none';
}
// Posts the processed output (value of the element with id 'text2') to
// htmLawedTest.php?pre=1 in the window named 'hlposthtm', as field 'outputH',
// together with the anti-forgery token and the session cookie value.
function sndProc2(){
 var src = document.getElementById('text2');
 if(!src){return;}
 var text = src.value;
 var w = window.open('htmLawedTest.php?pre=1', 'hlposthtm');
 var f = document.createElement('form');
 f.enctype = 'application/x-www-form-urlencoded';
 f.method = 'post';
 // NOTE(review): the charset comes from $_POST['enc'] and lands in a JS string;
 // presumably it is validated earlier in the file. Confirm.
 f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>';
 if(f.style){f.style.display = 'none';}
 else{f.visibility = 'hidden';}
 // The cookie value is spliced into markup, so escape it; a stray quote would
 // otherwise truncate the attribute or inject markup. String() keeps the
 // historical 'null' value when the cookie is absent.
 var sid = String(readCookie('<?php echo htmlspecialchars($_sid); ?>')).replace(/&/g, '&amp;').replace(/"/g, '&quot;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
 f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="token" id="token" value="<?php echo $token; ?>" /><input style="display:none;" type="hidden" name="<?php echo htmlspecialchars($_sid); ?>" id="<?php echo htmlspecialchars($_sid); ?>" value="' + sid + '" /></p>';
 f.action = 'htmLawedTest.php?pre=1';
 f.target = 'hlposthtm';
 var t = document.createElement('textarea');
 t.name = 'outputH';
 t.value = text;
 f.appendChild(t);
 document.getElementsByTagName('body')[0].appendChild(f);
 f.submit();
 // window.open returns null when the popup is blocked; focus() must be
 // called (the bare property read did nothing).
 if(w){w.focus();}
}
// Posts the unprocessed input (value of the element with id 'text') to
// htmLawedTest.php?pre=1 in the window named 'hlprehtm', as field 'inputH',
// together with the anti-forgery token and the session cookie value.
function sndUnproc(){
 var src = document.getElementById('text');
 if(!src){return;}
 var text = src.value;
 var w = window.open('htmLawedTest.php?pre=1', 'hlprehtm');
 var f = document.createElement('form');
 f.enctype = 'application/x-www-form-urlencoded';
 f.method = 'post';
 // NOTE(review): the charset comes from $_POST['enc'] and lands in a JS string;
 // presumably it is validated earlier in the file. Confirm.
 f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>';
 if(f.style){f.style.display = 'none';}
 else{f.visibility = 'hidden';}
 // The cookie value is spliced into markup, so escape it; a stray quote would
 // otherwise truncate the attribute or inject markup. String() keeps the
 // historical 'null' value when the cookie is absent.
 var sid = String(readCookie('<?php echo htmlspecialchars($_sid); ?>')).replace(/&/g, '&amp;').replace(/"/g, '&quot;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
 f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="token" id="token" value="<?php echo $token; ?>" /><input style="display:none;" type="hidden" name="<?php echo htmlspecialchars($_sid); ?>" id="<?php echo htmlspecialchars($_sid); ?>" value="' + sid + '" /></p>';
 f.action = 'htmLawedTest.php?pre=1';
 f.target = 'hlprehtm';
 var t = document.createElement('textarea');
 t.name = 'inputH';
 t.value = text;
 f.appendChild(t);
 document.getElementsByTagName('body')[0].appendChild(f);
 f.submit();
 // window.open returns null when the popup is blocked; focus() must be
 // called (the bare property read did nothing).
 if(w){w.focus();}
}
312function sndValidn(id, type){ 312function sndValidn(id, type){
313 var i = document.getElementById(id); 313 var i = document.getElementById(id);
314 if(!i){return;} 314 if(!i){return;}
315 i = i.value; 315 i = i.value;
316 var w = window.open('http://validator.w3.org/check', 'validate'+id+type); 316 var w = window.open('http://validator.w3.org/check', 'validate'+id+type);
317 var f = document.createElement('form'); 317 var f = document.createElement('form');
318 f.enctype = 'application/x-www-form-urlencoded'; 318 f.enctype = 'application/x-www-form-urlencoded';
319 f.method = 'post'; 319 f.method = 'post';
320 f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>'; 320 f.acceptCharset = '<?php echo htmlspecialchars($_POST['enc']); ?>';
321 if(f.style){f.style.display = 'none';} 321 if(f.style){f.style.display = 'none';}
322 else{f.visibility = 'hidden';} 322 else{f.visibility = 'hidden';}
323 f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="prefill" id="prefill" value="1" /><input style="display:none;" type="hidden" name="prefill_doctype" id="prefill_doctype" value="'+ type+ '" /><input style="display:none;" type="hidden" name="group" id="group" value="1" /><input type="hidden" name="ss" id="ss" value="1" /></p>'; 323 f.innerHTML = '<p style="display:none;"><input style="display:none;" type="hidden" name="prefill" id="prefill" value="1" /><input style="display:none;" type="hidden" name="prefill_doctype" id="prefill_doctype" value="'+ type+ '" /><input style="display:none;" type="hidden" name="group" id="group" value="1" /><input type="hidden" name="ss" id="ss" value="1" /></p>';
324 f.action = 'http://validator.w3.org/check'; 324 f.action = 'http://validator.w3.org/check';
325 f.target = 'validate'+id+type; 325 f.target = 'validate'+id+type;
326 var t = document.createElement('textarea'); 326 var t = document.createElement('textarea');
327 t.name = 'fragment'; 327 t.name = 'fragment';
328 t.value = i; 328 t.value = i;
329 f.appendChild(t); 329 f.appendChild(t);
330 var b = document.getElementsByTagName('body')[0]; 330 var b = document.getElementsByTagName('body')[0];
331 b.appendChild(f); 331 b.appendChild(f);
332 f.submit(); 332 f.submit();
333 w.focus; 333 w.focus;
334} 334}
// Click-drag textarea resizer. adBtn() appends a small handle after every
// textarea whose class attribute starts or ends with `resizeClass`; pressing
// the handle starts a drag that changes the height of the field just before
// it until the mouse button is released.
var tRs = {
 // Field currently being resized; set by initResize().
 formEl: null,
 // Class-name prefix or suffix that marks a textarea as resizable.
 resizeClass: 'textarea',
 // Registers handler `fn` for event `ev` on target `t`, falling back to
 // attachEvent where addEventListener is unavailable.
 adEv: function(t,ev,fn){
  if(typeof document.addEventListener != 'undefined'){
   t.addEventListener(ev,fn,false);
  }else{
   t.attachEvent('on' + ev, fn);
  }
 },
 // Counterpart of adEv(): removes handler `fn` for event `ev` from `t`.
 rmEv: function(t,ev,fn){
  if(typeof document.removeEventListener != 'undefined'){
   t.removeEventListener(ev,fn,false);
  }else{
   t.detachEvent('on' + ev, fn);
  }
 },
 // Adds a resize handle to each qualifying textarea in the document.
 adBtn: function(){
  var textareas = document.getElementsByTagName('textarea');
  for(var i = 0; i < textareas.length; i++){
   var txtclass = textareas[i].className;
   if(txtclass.substring(0,tRs.resizeClass.length)==tRs.resizeClass ||
    txtclass.substring(txtclass.length -tRs.resizeClass.length)==tRs.resizeClass){
    var a = document.createElement('a');
    a.appendChild(document.createTextNode("\u2195"));
    a.style.cursor = 'n-resize';
    a.className= 'resizer';
    a.title = 'click-drag to resize textarea';
    tRs.adEv(a, 'mousedown', tRs.initResize);
    textareas[i].parentNode.appendChild(a);
   }
  }
 },
 // mousedown handler on a handle: records the starting height and pointer
 // position of the preceding field and begins tracking the drag.
 initResize: function(event){
  if(typeof event == 'undefined'){
   event = window.event;
  }
  var handle = event.srcElement ? event.srcElement : event.target;
  var target = handle.previousSibling;
  // The handle may have no preceding node at all; nothing to resize then.
  if(!target){return;}
  if(target.nodeName.toLowerCase() == 'textarea' || (target.nodeName.toLowerCase() == 'input' && target.type == 'text')){
   tRs.formEl = target;
   tRs.formEl.startHeight = tRs.formEl.clientHeight;
   tRs.formEl.startY = event.clientY;
   tRs.adEv(document, 'mousemove', tRs.resize);
   tRs.adEv(document, 'mouseup', tRs.stopResize);
   tRs.formEl.parentNode.style.cursor = 'n-resize';
   tRs.formEl.style.cursor = 'n-resize';
   // preventDefault is missing on legacy event objects; ignoring that is
   // deliberate.
   try{
    event.preventDefault();
   }catch(e){
   }
  }
 },
 // mousemove handler during a drag: sets the height from the pointer offset.
 resize: function(event){
  if(typeof event == 'undefined'){
   event = window.event;
  }
  if(tRs.formEl.nodeName.toLowerCase() == 'textarea'){
   tRs.formEl.style.height = event.clientY - tRs.formEl.startY + tRs.formEl.startHeight + 'px';
  }
 },
 // mouseup handler: ends the drag and restores the cursors. Removes exactly
 // the two document listeners initResize() added (the earlier code removed a
 // 'mousedown' listener that was never on document and left 'mouseup'
 // attached).
 stopResize: function(event){
  tRs.rmEv(document, 'mousemove', tRs.resize);
  tRs.rmEv(document, 'mouseup', tRs.stopResize);
  tRs.formEl.style.cursor = 'text';
  tRs.formEl.parentNode.style.cursor = 'auto';
  return false;
 }
};
408tRs.adEv(window, 'load', tRs.adBtn); 408tRs.adEv(window, 'load', tRs.adBtn);
409// Diff Match and Patch javascript code by Neil Fraser; Apache license 2.0; http://code.google.com/p/google-diff-match-patch/ 409// Diff Match and Patch javascript code by Neil Fraser; Apache license 2.0; http://code.google.com/p/google-diff-match-patch/
410(function(){function diff_match_patch(){this.Diff_Timeout=1;this.Diff_EditCost=4;this.Match_Threshold=0.5;this.Match_Distance=1E3;this.Patch_DeleteThreshold=0.5;this.Patch_Margin=4;this.Match_MaxBits=32} 410(function(){function diff_match_patch(){this.Diff_Timeout=1;this.Diff_EditCost=4;this.Match_Threshold=0.5;this.Match_Distance=1E3;this.Patch_DeleteThreshold=0.5;this.Patch_Margin=4;this.Match_MaxBits=32}
411diff_match_patch.prototype.diff_main=function(a,b,c,d){"undefined"==typeof d&&(d=0>=this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. (diff_main)");if(a==b)return a?[[0,a]]:[];"undefined"==typeof c&&(c=!0);var e=c,f=this.diff_commonPrefix(a,b),c=a.substring(0,f),a=a.substring(f),b=b.substring(f),f=this.diff_commonSuffix(a,b),g=a.substring(a.length-f),a=a.substring(0,a.length-f),b=b.substring(0,b.length-f),a=this.diff_compute_(a, 411diff_match_patch.prototype.diff_main=function(a,b,c,d){"undefined"==typeof d&&(d=0>=this.Diff_Timeout?Number.MAX_VALUE:(new Date).getTime()+1E3*this.Diff_Timeout);if(null==a||null==b)throw Error("Null input. (diff_main)");if(a==b)return a?[[0,a]]:[];"undefined"==typeof c&&(c=!0);var e=c,f=this.diff_commonPrefix(a,b),c=a.substring(0,f),a=a.substring(f),b=b.substring(f),f=this.diff_commonSuffix(a,b),g=a.substring(a.length-f),a=a.substring(0,a.length-f),b=b.substring(0,b.length-f),a=this.diff_compute_(a,
412b,e,d);c&&a.unshift([0,c]);g&&a.push([0,g]);this.diff_cleanupMerge(a);return a}; 412b,e,d);c&&a.unshift([0,c]);g&&a.push([0,g]);this.diff_cleanupMerge(a);return a};
413diff_match_patch.prototype.diff_compute_=function(a,b,c,d){if(!a)return[[1,b]];if(!b)return[[-1,a]];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);if(-1!=g)return c=[[1,e.substring(0,g)],[0,f],[1,e.substring(g+f.length)]],a.length>b.length&&(c[0][0]=c[2][0]=-1),c;if(1==f.length)return[[-1,a],[1,b]];return(e=this.diff_halfMatch_(a,b))?(f=e[0],a=e[1],g=e[2],b=e[3],e=e[4],f=this.diff_main(f,g,c,d),c=this.diff_main(a,b,c,d),f.concat([[0,e]],c)):c&&100<a.length&&100<b.length?this.diff_lineMode_(a, 413diff_match_patch.prototype.diff_compute_=function(a,b,c,d){if(!a)return[[1,b]];if(!b)return[[-1,a]];var e=a.length>b.length?a:b,f=a.length>b.length?b:a,g=e.indexOf(f);if(-1!=g)return c=[[1,e.substring(0,g)],[0,f],[1,e.substring(g+f.length)]],a.length>b.length&&(c[0][0]=c[2][0]=-1),c;if(1==f.length)return[[-1,a],[1,b]];return(e=this.diff_halfMatch_(a,b))?(f=e[0],a=e[1],g=e[2],b=e[3],e=e[4],f=this.diff_main(f,g,c,d),c=this.diff_main(a,b,c,d),f.concat([[0,e]],c)):c&&100<a.length&&100<b.length?this.diff_lineMode_(a,
414b,d):this.diff_bisect_(a,b,d)}; 414b,d):this.diff_bisect_(a,b,d)};
415diff_match_patch.prototype.diff_lineMode_=function(a,b,c){var d=this.diff_linesToChars_(a,b),a=d.chars1,b=d.chars2,d=d.lineArray,a=this.diff_main(a,b,!1,c);this.diff_charsToLines_(a,d);this.diff_cleanupSemantic(a);a.push([0,""]);for(var e=d=b=0,f="",g="";b<a.length;){switch(a[b][0]){case 1:e++;g+=a[b][1];break;case -1:d++;f+=a[b][1];break;case 0:if(1<=d&&1<=e){a.splice(b-d-e,d+e);b=b-d-e;d=this.diff_main(f,g,!1,c);for(e=d.length-1;0<=e;e--)a.splice(b,0,d[e]);b+=d.length}d=e=0;g=f=""}b++}a.pop();return a}; 415diff_match_patch.prototype.diff_lineMode_=function(a,b,c){var d=this.diff_linesToChars_(a,b),a=d.chars1,b=d.chars2,d=d.lineArray,a=this.diff_main(a,b,!1,c);this.diff_charsToLines_(a,d);this.diff_cleanupSemantic(a);a.push([0,""]);for(var e=d=b=0,f="",g="";b<a.length;){switch(a[b][0]){case 1:e++;g+=a[b][1];break;case -1:d++;f+=a[b][1];break;case 0:if(1<=d&&1<=e){a.splice(b-d-e,d+e);b=b-d-e;d=this.diff_main(f,g,!1,c);for(e=d.length-1;0<=e;e--)a.splice(b,0,d[e]);b+=d.length}d=e=0;g=f=""}b++}a.pop();return a};
416diff_match_patch.prototype.diff_bisect_=function(a,b,c){for(var d=a.length,e=b.length,f=Math.ceil((d+e)/2),g=f,h=2*f,j=Array(h),i=Array(h),k=0;k<h;k++)j[k]=-1,i[k]=-1;j[g+1]=0;i[g+1]=0;for(var k=d-e,p=0!=k%2,q=0,s=0,o=0,v=0,u=0;u<f&&!((new Date).getTime()>c);u++){for(var n=-u+q;n<=u-s;n+=2){var l=g+n,m;m=n==-u||n!=u&&j[l-1]<j[l+1]?j[l+1]:j[l-1]+1;for(var r=m-n;m<d&&r<e&&a.charAt(m)==b.charAt(r);)m++,r++;j[l]=m;if(m>d)s+=2;else if(r>e)q+=2;else if(p&&(l=g+k-n,0<=l&&l<h&&-1!=i[l])){var t=d-i[l];if(m>= 416diff_match_patch.prototype.diff_bisect_=function(a,b,c){for(var d=a.length,e=b.length,f=Math.ceil((d+e)/2),g=f,h=2*f,j=Array(h),i=Array(h),k=0;k<h;k++)j[k]=-1,i[k]=-1;j[g+1]=0;i[g+1]=0;for(var k=d-e,p=0!=k%2,q=0,s=0,o=0,v=0,u=0;u<f&&!((new Date).getTime()>c);u++){for(var n=-u+q;n<=u-s;n+=2){var l=g+n,m;m=n==-u||n!=u&&j[l-1]<j[l+1]?j[l+1]:j[l-1]+1;for(var r=m-n;m<d&&r<e&&a.charAt(m)==b.charAt(r);)m++,r++;j[l]=m;if(m>d)s+=2;else if(r>e)q+=2;else if(p&&(l=g+k-n,0<=l&&l<h&&-1!=i[l])){var t=d-i[l];if(m>=
417t)return this.diff_bisectSplit_(a,b,m,r,c)}}for(n=-u+o;n<=u-v;n+=2){l=g+n;t=n==-u||n!=u&&i[l-1]<i[l+1]?i[l+1]:i[l-1]+1;for(m=t-n;t<d&&m<e&&a.charAt(d-t-1)==b.charAt(e-m-1);)t++,m++;i[l]=t;if(t>d)v+=2;else if(m>e)o+=2;else if(!p&&(l=g+k-n,0<=l&&l<h&&-1!=j[l]&&(m=j[l],r=g+m-l,t=d-t,m>=t)))return this.diff_bisectSplit_(a,b,m,r,c)}}return[[-1,a],[1,b]]}; 417t)return this.diff_bisectSplit_(a,b,m,r,c)}}for(n=-u+o;n<=u-v;n+=2){l=g+n;t=n==-u||n!=u&&i[l-1]<i[l+1]?i[l+1]:i[l-1]+1;for(m=t-n;t<d&&m<e&&a.charAt(d-t-1)==b.charAt(e-m-1);)t++,m++;i[l]=t;if(t>d)v+=2;else if(m>e)o+=2;else if(!p&&(l=g+k-n,0<=l&&l<h&&-1!=j[l]&&(m=j[l],r=g+m-l,t=d-t,m>=t)))return this.diff_bisectSplit_(a,b,m,r,c)}}return[[-1,a],[1,b]]};
418diff_match_patch.prototype.diff_bisectSplit_=function(a,b,c,d,e){var f=a.substring(0,c),g=b.substring(0,d),a=a.substring(c),b=b.substring(d),f=this.diff_main(f,g,!1,e),e=this.diff_main(a,b,!1,e);return f.concat(e)}; 418diff_match_patch.prototype.diff_bisectSplit_=function(a,b,c,d,e){var f=a.substring(0,c),g=b.substring(0,d),a=a.substring(c),b=b.substring(d),f=this.diff_main(f,g,!1,e),e=this.diff_main(a,b,!1,e);return f.concat(e)};
419diff_match_patch.prototype.diff_linesToChars_=function(a,b){function c(a){for(var b="",c=0,f=-1,g=d.length;f<a.length-1;){f=a.indexOf("\n",c);-1==f&&(f=a.length-1);var q=a.substring(c,f+1),c=f+1;(e.hasOwnProperty?e.hasOwnProperty(q):void 0!==e[q])?b+=String.fromCharCode(e[q]):(b+=String.fromCharCode(g),e[q]=g,d[g++]=q)}return b}var d=[],e={};d[0]="";var f=c(a),g=c(b);return{chars1:f,chars2:g,lineArray:d}}; 419diff_match_patch.prototype.diff_linesToChars_=function(a,b){function c(a){for(var b="",c=0,f=-1,g=d.length;f<a.length-1;){f=a.indexOf("\n",c);-1==f&&(f=a.length-1);var q=a.substring(c,f+1),c=f+1;(e.hasOwnProperty?e.hasOwnProperty(q):void 0!==e[q])?b+=String.fromCharCode(e[q]):(b+=String.fromCharCode(g),e[q]=g,d[g++]=q)}return b}var d=[],e={};d[0]="";var f=c(a),g=c(b);return{chars1:f,chars2:g,lineArray:d}};
420diff_match_patch.prototype.diff_charsToLines_=function(a,b){for(var c=0;c<a.length;c++){for(var d=a[c][1],e=[],f=0;f<d.length;f++)e[f]=b[d.charCodeAt(f)];a[c][1]=e.join("")}};diff_match_patch.prototype.diff_commonPrefix=function(a,b){if(!a||!b||a.charAt(0)!=b.charAt(0))return 0;for(var c=0,d=Math.min(a.length,b.length),e=d,f=0;c<e;)a.substring(f,e)==b.substring(f,e)?f=c=e:d=e,e=Math.floor((d-c)/2+c);return e}; 420diff_match_patch.prototype.diff_charsToLines_=function(a,b){for(var c=0;c<a.length;c++){for(var d=a[c][1],e=[],f=0;f<d.length;f++)e[f]=b[d.charCodeAt(f)];a[c][1]=e.join("")}};diff_match_patch.prototype.diff_commonPrefix=function(a,b){if(!a||!b||a.charAt(0)!=b.charAt(0))return 0;for(var c=0,d=Math.min(a.length,b.length),e=d,f=0;c<e;)a.substring(f,e)==b.substring(f,e)?f=c=e:d=e,e=Math.floor((d-c)/2+c);return e};
421diff_match_patch.prototype.diff_commonSuffix=function(a,b){if(!a||!b||a.charAt(a.length-1)!=b.charAt(b.length-1))return 0;for(var c=0,d=Math.min(a.length,b.length),e=d,f=0;c<e;)a.substring(a.length-e,a.length-f)==b.substring(b.length-e,b.length-f)?f=c=e:d=e,e=Math.floor((d-c)/2+c);return e}; 421diff_match_patch.prototype.diff_commonSuffix=function(a,b){if(!a||!b||a.charAt(a.length-1)!=b.charAt(b.length-1))return 0;for(var c=0,d=Math.min(a.length,b.length),e=d,f=0;c<e;)a.substring(a.length-e,a.length-f)==b.substring(b.length-e,b.length-f)?f=c=e:d=e,e=Math.floor((d-c)/2+c);return e};
422diff_match_patch.prototype.diff_commonOverlap_=function(a,b){var c=a.length,d=b.length;if(0==c||0==d)return 0;c>d?a=a.substring(c-d):c<d&&(b=b.substring(0,c));c=Math.min(c,d);if(a==b)return c;for(var d=0,e=1;;){var f=a.substring(c-e),f=b.indexOf(f);if(-1==f)return d;e+=f;if(0==f||a.substring(c-e)==b.substring(0,e))d=e,e++}}; 422diff_match_patch.prototype.diff_commonOverlap_=function(a,b){var c=a.length,d=b.length;if(0==c||0==d)return 0;c>d?a=a.substring(c-d):c<d&&(b=b.substring(0,c));c=Math.min(c,d);if(a==b)return c;for(var d=0,e=1;;){var f=a.substring(c-e),f=b.indexOf(f);if(-1==f)return d;e+=f;if(0==f||a.substring(c-e)==b.substring(0,e))d=e,e++}};
423diff_match_patch.prototype.diff_halfMatch_=function(a,b){function c(a,b,c){for(var d=a.substring(c,c+Math.floor(a.length/4)),e=-1,g="",h,j,n,l;-1!=(e=b.indexOf(d,e+1));){var m=f.diff_commonPrefix(a.substring(c),b.substring(e)),r=f.diff_commonSuffix(a.substring(0,c),b.substring(0,e));g.length<r+m&&(g=b.substring(e-r,e)+b.substring(e,e+m),h=a.substring(0,c-r),j=a.substring(c+m),n=b.substring(0,e-r),l=b.substring(e+m))}return 2*g.length>=a.length?[h,j,n,l,g]:null}if(0>=this.Diff_Timeout)return null; 423diff_match_patch.prototype.diff_halfMatch_=function(a,b){function c(a,b,c){for(var d=a.substring(c,c+Math.floor(a.length/4)),e=-1,g="",h,j,n,l;-1!=(e=b.indexOf(d,e+1));){var m=f.diff_commonPrefix(a.substring(c),b.substring(e)),r=f.diff_commonSuffix(a.substring(0,c),b.substring(0,e));g.length<r+m&&(g=b.substring(e-r,e)+b.substring(e,e+m),h=a.substring(0,c-r),j=a.substring(c+m),n=b.substring(0,e-r),l=b.substring(e+m))}return 2*g.length>=a.length?[h,j,n,l,g]:null}if(0>=this.Diff_Timeout)return null;
424var d=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>d.length||2*e.length<d.length)return null;var f=this,g=c(d,e,Math.ceil(d.length/4)),d=c(d,e,Math.ceil(d.length/2)),h;if(!g&&!d)return null;h=d?g?g[4].length>d[4].length?g:d:d:g;var j;a.length>b.length?(g=h[0],d=h[1],e=h[2],j=h[3]):(e=h[0],j=h[1],g=h[2],d=h[3]);h=h[4];return[g,d,e,j,h]}; 424var d=a.length>b.length?a:b,e=a.length>b.length?b:a;if(4>d.length||2*e.length<d.length)return null;var f=this,g=c(d,e,Math.ceil(d.length/4)),d=c(d,e,Math.ceil(d.length/2)),h;if(!g&&!d)return null;h=d?g?g[4].length>d[4].length?g:d:d:g;var j;a.length>b.length?(g=h[0],d=h[1],e=h[2],j=h[3]):(e=h[0],j=h[1],g=h[2],d=h[3]);h=h[4];return[g,d,e,j,h]};
425diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=0,h=0,j=0,i=0;f<a.length;)0==a[f][0]?(c[d++]=f,g=j,h=i,i=j=0,e=a[f][1]):(1==a[f][0]?j+=a[f][1].length:i+=a[f][1].length,e&&e.length<=Math.max(g,h)&&e.length<=Math.max(j,i)&&(a.splice(c[d-1],0,[-1,e]),a[c[d-1]+1][0]=1,d--,d--,f=0<d?c[d-1]:-1,i=j=h=g=0,e=null,b=!0)),f++;b&&this.diff_cleanupMerge(a);this.diff_cleanupSemanticLossless(a);for(f=1;f<a.length;){if(-1==a[f-1][0]&&1==a[f][0]){b=a[f-1][1];c=a[f][1]; 425diff_match_patch.prototype.diff_cleanupSemantic=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=0,h=0,j=0,i=0;f<a.length;)0==a[f][0]?(c[d++]=f,g=j,h=i,i=j=0,e=a[f][1]):(1==a[f][0]?j+=a[f][1].length:i+=a[f][1].length,e&&e.length<=Math.max(g,h)&&e.length<=Math.max(j,i)&&(a.splice(c[d-1],0,[-1,e]),a[c[d-1]+1][0]=1,d--,d--,f=0<d?c[d-1]:-1,i=j=h=g=0,e=null,b=!0)),f++;b&&this.diff_cleanupMerge(a);this.diff_cleanupSemanticLossless(a);for(f=1;f<a.length;){if(-1==a[f-1][0]&&1==a[f][0]){b=a[f-1][1];c=a[f][1];
426d=this.diff_commonOverlap_(b,c);e=this.diff_commonOverlap_(c,b);if(d>=e){if(d>=b.length/2||d>=c.length/2)a.splice(f,0,[0,c.substring(0,d)]),a[f-1][1]=b.substring(0,b.length-d),a[f+1][1]=c.substring(d),f++}else if(e>=b.length/2||e>=c.length/2)a.splice(f,0,[0,b.substring(0,e)]),a[f-1][0]=1,a[f-1][1]=c.substring(0,c.length-e),a[f+1][0]=-1,a[f+1][1]=b.substring(e),f++;f++}f++}}; 426d=this.diff_commonOverlap_(b,c);e=this.diff_commonOverlap_(c,b);if(d>=e){if(d>=b.length/2||d>=c.length/2)a.splice(f,0,[0,c.substring(0,d)]),a[f-1][1]=b.substring(0,b.length-d),a[f+1][1]=c.substring(d),f++}else if(e>=b.length/2||e>=c.length/2)a.splice(f,0,[0,b.substring(0,e)]),a[f-1][0]=1,a[f-1][1]=c.substring(0,c.length-e),a[f+1][0]=-1,a[f+1][1]=b.substring(e),f++;f++}f++}};
427diff_match_patch.prototype.diff_cleanupSemanticLossless=function(a){function b(a,b){if(!a||!b)return 6;var c=a.charAt(a.length-1),d=b.charAt(0),e=c.match(diff_match_patch.nonAlphaNumericRegex_),f=d.match(diff_match_patch.nonAlphaNumericRegex_),g=e&&c.match(diff_match_patch.whitespaceRegex_),h=f&&d.match(diff_match_patch.whitespaceRegex_),c=g&&c.match(diff_match_patch.linebreakRegex_),d=h&&d.match(diff_match_patch.linebreakRegex_),i=c&&a.match(diff_match_patch.blanklineEndRegex_),j=d&&b.match(diff_match_patch.blanklineStartRegex_); 427diff_match_patch.prototype.diff_cleanupSemanticLossless=function(a){function b(a,b){if(!a||!b)return 6;var c=a.charAt(a.length-1),d=b.charAt(0),e=c.match(diff_match_patch.nonAlphaNumericRegex_),f=d.match(diff_match_patch.nonAlphaNumericRegex_),g=e&&c.match(diff_match_patch.whitespaceRegex_),h=f&&d.match(diff_match_patch.whitespaceRegex_),c=g&&c.match(diff_match_patch.linebreakRegex_),d=h&&d.match(diff_match_patch.linebreakRegex_),i=c&&a.match(diff_match_patch.blanklineEndRegex_),j=d&&b.match(diff_match_patch.blanklineStartRegex_);
428return i||j?5:c||d?4:e&&!g&&h?3:g||h?2:e||f?1:0}for(var c=1;c<a.length-1;){if(0==a[c-1][0]&&0==a[c+1][0]){var d=a[c-1][1],e=a[c][1],f=a[c+1][1],g=this.diff_commonSuffix(d,e);if(g)var h=e.substring(e.length-g),d=d.substring(0,d.length-g),e=h+e.substring(0,e.length-g),f=h+f;for(var g=d,h=e,j=f,i=b(d,e)+b(e,f);e.charAt(0)===f.charAt(0);){var d=d+e.charAt(0),e=e.substring(1)+f.charAt(0),f=f.substring(1),k=b(d,e)+b(e,f);k>=i&&(i=k,g=d,h=e,j=f)}a[c-1][1]!=g&&(g?a[c-1][1]=g:(a.splice(c-1,1),c--),a[c][1]= 428return i||j?5:c||d?4:e&&!g&&h?3:g||h?2:e||f?1:0}for(var c=1;c<a.length-1;){if(0==a[c-1][0]&&0==a[c+1][0]){var d=a[c-1][1],e=a[c][1],f=a[c+1][1],g=this.diff_commonSuffix(d,e);if(g)var h=e.substring(e.length-g),d=d.substring(0,d.length-g),e=h+e.substring(0,e.length-g),f=h+f;for(var g=d,h=e,j=f,i=b(d,e)+b(e,f);e.charAt(0)===f.charAt(0);){var d=d+e.charAt(0),e=e.substring(1)+f.charAt(0),f=f.substring(1),k=b(d,e)+b(e,f);k>=i&&(i=k,g=d,h=e,j=f)}a[c-1][1]!=g&&(g?a[c-1][1]=g:(a.splice(c-1,1),c--),a[c][1]=
429h,j?a[c+1][1]=j:(a.splice(c+1,1),c--))}c++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/; 429h,j?a[c+1][1]=j:(a.splice(c+1,1),c--))}c++}};diff_match_patch.nonAlphaNumericRegex_=/[^a-zA-Z0-9]/;diff_match_patch.whitespaceRegex_=/\s/;diff_match_patch.linebreakRegex_=/[\r\n]/;diff_match_patch.blanklineEndRegex_=/\n\r?\n$/;diff_match_patch.blanklineStartRegex_=/^\r?\n\r?\n/;
430diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=!1,h=!1,j=!1,i=!1;f<a.length;){if(0==a[f][0])a[f][1].length<this.Diff_EditCost&&(j||i)?(c[d++]=f,g=j,h=i,e=a[f][1]):(d=0,e=null),j=i=!1;else if(-1==a[f][0]?i=!0:j=!0,e&&(g&&h&&j&&i||e.length<this.Diff_EditCost/2&&3==g+h+j+i))a.splice(c[d-1],0,[-1,e]),a[c[d-1]+1][0]=1,d--,e=null,g&&h?(j=i=!0,d=0):(d--,f=0<d?c[d-1]:-1,j=i=!1),b=!0;f++}b&&this.diff_cleanupMerge(a)}; 430diff_match_patch.prototype.diff_cleanupEfficiency=function(a){for(var b=!1,c=[],d=0,e=null,f=0,g=!1,h=!1,j=!1,i=!1;f<a.length;){if(0==a[f][0])a[f][1].length<this.Diff_EditCost&&(j||i)?(c[d++]=f,g=j,h=i,e=a[f][1]):(d=0,e=null),j=i=!1;else if(-1==a[f][0]?i=!0:j=!0,e&&(g&&h&&j&&i||e.length<this.Diff_EditCost/2&&3==g+h+j+i))a.splice(c[d-1],0,[-1,e]),a[c[d-1]+1][0]=1,d--,e=null,g&&h?(j=i=!0,d=0):(d--,f=0<d?c[d-1]:-1,j=i=!1),b=!0;f++}b&&this.diff_cleanupMerge(a)};
431diff_match_patch.prototype.diff_cleanupMerge=function(a){a.push([0,""]);for(var b=0,c=0,d=0,e="",f="",g;b<a.length;)switch(a[b][0]){case 1:d++;f+=a[b][1];b++;break;case -1:c++;e+=a[b][1];b++;break;case 0:1<c+d?(0!==c&&0!==d&&(g=this.diff_commonPrefix(f,e),0!==g&&(0<b-c-d&&0==a[b-c-d-1][0]?a[b-c-d-1][1]+=f.substring(0,g):(a.splice(0,0,[0,f.substring(0,g)]),b++),f=f.substring(g),e=e.substring(g)),g=this.diff_commonSuffix(f,e),0!==g&&(a[b][1]=f.substring(f.length-g)+a[b][1],f=f.substring(0,f.length- 431diff_match_patch.prototype.diff_cleanupMerge=function(a){a.push([0,""]);for(var b=0,c=0,d=0,e="",f="",g;b<a.length;)switch(a[b][0]){case 1:d++;f+=a[b][1];b++;break;case -1:c++;e+=a[b][1];b++;break;case 0:1<c+d?(0!==c&&0!==d&&(g=this.diff_commonPrefix(f,e),0!==g&&(0<b-c-d&&0==a[b-c-d-1][0]?a[b-c-d-1][1]+=f.substring(0,g):(a.splice(0,0,[0,f.substring(0,g)]),b++),f=f.substring(g),e=e.substring(g)),g=this.diff_commonSuffix(f,e),0!==g&&(a[b][1]=f.substring(f.length-g)+a[b][1],f=f.substring(0,f.length-
432g),e=e.substring(0,e.length-g))),0===c?a.splice(b-d,c+d,[1,f]):0===d?a.splice(b-c,c+d,[-1,e]):a.splice(b-c-d,c+d,[-1,e],[1,f]),b=b-c-d+(c?1:0)+(d?1:0)+1):0!==b&&0==a[b-1][0]?(a[b-1][1]+=a[b][1],a.splice(b,1)):b++,c=d=0,f=e=""}""===a[a.length-1][1]&&a.pop();c=!1;for(b=1;b<a.length-1;)0==a[b-1][0]&&0==a[b+1][0]&&(a[b][1].substring(a[b][1].length-a[b-1][1].length)==a[b-1][1]?(a[b][1]=a[b-1][1]+a[b][1].substring(0,a[b][1].length-a[b-1][1].length),a[b+1][1]=a[b-1][1]+a[b+1][1],a.splice(b-1,1),c=!0):a[b][1].substring(0, 432g),e=e.substring(0,e.length-g))),0===c?a.splice(b-d,c+d,[1,f]):0===d?a.splice(b-c,c+d,[-1,e]):a.splice(b-c-d,c+d,[-1,e],[1,f]),b=b-c-d+(c?1:0)+(d?1:0)+1):0!==b&&0==a[b-1][0]?(a[b-1][1]+=a[b][1],a.splice(b,1)):b++,c=d=0,f=e=""}""===a[a.length-1][1]&&a.pop();c=!1;for(b=1;b<a.length-1;)0==a[b-1][0]&&0==a[b+1][0]&&(a[b][1].substring(a[b][1].length-a[b-1][1].length)==a[b-1][1]?(a[b][1]=a[b-1][1]+a[b][1].substring(0,a[b][1].length-a[b-1][1].length),a[b+1][1]=a[b-1][1]+a[b+1][1],a.splice(b-1,1),c=!0):a[b][1].substring(0,
433a[b+1][1].length)==a[b+1][1]&&(a[b-1][1]+=a[b+1][1],a[b][1]=a[b][1].substring(a[b+1][1].length)+a[b+1][1],a.splice(b+1,1),c=!0)),b++;c&&this.diff_cleanupMerge(a)};diff_match_patch.prototype.diff_xIndex=function(a,b){var c=0,d=0,e=0,f=0,g;for(g=0;g<a.length;g++){1!==a[g][0]&&(c+=a[g][1].length);-1!==a[g][0]&&(d+=a[g][1].length);if(c>b)break;e=c;f=d}return a.length!=g&&-1===a[g][0]?f:f+(b-e)}; 433a[b+1][1].length)==a[b+1][1]&&(a[b-1][1]+=a[b+1][1],a[b][1]=a[b][1].substring(a[b+1][1].length)+a[b+1][1],a.splice(b+1,1),c=!0)),b++;c&&this.diff_cleanupMerge(a)};diff_match_patch.prototype.diff_xIndex=function(a,b){var c=0,d=0,e=0,f=0,g;for(g=0;g<a.length;g++){1!==a[g][0]&&(c+=a[g][1].length);-1!==a[g][0]&&(d+=a[g][1].length);if(c>b)break;e=c;f=d}return a.length!=g&&-1===a[g][0]?f:f+(b-e)};
434diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],c=/&/g,d=/</g,e=/>/g,f=/\n/g,g=0;g<a.length;g++){var h=a[g][0],j=a[g][1],j=j.replace(c,"&amp;").replace(d,"&lt;").replace(e,"&gt;").replace(f,"<span style=\"color: #dcdcdc;\">&not;</span><br>");switch(h){case 1:b[g]='<ins style="background:#ccffcc; text-decoration: none;">'+j+"</ins>";break;case -1:b[g]='<del style="background:#ffffcc; text-decoration: line-through; color: orange;">'+j+"</del>";break;case 0:b[g]="<span>"+j+"</span>"}}return b.join("")}; 434diff_match_patch.prototype.diff_prettyHtml=function(a){for(var b=[],c=/&/g,d=/</g,e=/>/g,f=/\n/g,g=0;g<a.length;g++){var h=a[g][0],j=a[g][1],j=j.replace(c,"&amp;").replace(d,"&lt;").replace(e,"&gt;").replace(f,"<span style=\"color: #dcdcdc;\">&not;</span><br>");switch(h){case 1:b[g]='<ins style="background:#ccffcc; text-decoration: none;">'+j+"</ins>";break;case -1:b[g]='<del style="background:#ffffcc; text-decoration: line-through; color: orange;">'+j+"</del>";break;case 0:b[g]="<span>"+j+"</span>"}}return b.join("")};
435diff_match_patch.prototype.diff_text1=function(a){for(var b=[],c=0;c<a.length;c++)1!==a[c][0]&&(b[c]=a[c][1]);return b.join("")};diff_match_patch.prototype.diff_text2=function(a){for(var b=[],c=0;c<a.length;c++)-1!==a[c][0]&&(b[c]=a[c][1]);return b.join("")};diff_match_patch.prototype.diff_levenshtein=function(a){for(var b=0,c=0,d=0,e=0;e<a.length;e++){var f=a[e][0],g=a[e][1];switch(f){case 1:c+=g.length;break;case -1:d+=g.length;break;case 0:b+=Math.max(c,d),d=c=0}}return b+=Math.max(c,d)}; 435diff_match_patch.prototype.diff_text1=function(a){for(var b=[],c=0;c<a.length;c++)1!==a[c][0]&&(b[c]=a[c][1]);return b.join("")};diff_match_patch.prototype.diff_text2=function(a){for(var b=[],c=0;c<a.length;c++)-1!==a[c][0]&&(b[c]=a[c][1]);return b.join("")};diff_match_patch.prototype.diff_levenshtein=function(a){for(var b=0,c=0,d=0,e=0;e<a.length;e++){var f=a[e][0],g=a[e][1];switch(f){case 1:c+=g.length;break;case -1:d+=g.length;break;case 0:b+=Math.max(c,d),d=c=0}}return b+=Math.max(c,d)};
436diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],c=0;c<a.length;c++)switch(a[c][0]){case 1:b[c]="+"+encodeURI(a[c][1]);break;case -1:b[c]="-"+a[c][1].length;break;case 0:b[c]="="+a[c][1].length}return b.join("\t").replace(/%20/g," ")}; 436diff_match_patch.prototype.diff_toDelta=function(a){for(var b=[],c=0;c<a.length;c++)switch(a[c][0]){case 1:b[c]="+"+encodeURI(a[c][1]);break;case -1:b[c]="-"+a[c][1].length;break;case 0:b[c]="="+a[c][1].length}return b.join("\t").replace(/%20/g," ")};
437diff_match_patch.prototype.diff_fromDelta=function(a,b){for(var c=[],d=0,e=0,f=b.split(/\t/g),g=0;g<f.length;g++){var h=f[g].substring(1);switch(f[g].charAt(0)){case "+":try{c[d++]=[1,decodeURI(h)]}catch(j){throw Error("Illegal escape in diff_fromDelta: "+h);}break;case "-":case "=":var i=parseInt(h,10);if(isNaN(i)||0>i)throw Error("Invalid number in diff_fromDelta: "+h);h=a.substring(e,e+=i);"="==f[g].charAt(0)?c[d++]=[0,h]:c[d++]=[-1,h];break;default:if(f[g])throw Error("Invalid diff operation in diff_fromDelta: "+ 437diff_match_patch.prototype.diff_fromDelta=function(a,b){for(var c=[],d=0,e=0,f=b.split(/\t/g),g=0;g<f.length;g++){var h=f[g].substring(1);switch(f[g].charAt(0)){case "+":try{c[d++]=[1,decodeURI(h)]}catch(j){throw Error("Illegal escape in diff_fromDelta: "+h);}break;case "-":case "=":var i=parseInt(h,10);if(isNaN(i)||0>i)throw Error("Invalid number in diff_fromDelta: "+h);h=a.substring(e,e+=i);"="==f[g].charAt(0)?c[d++]=[0,h]:c[d++]=[-1,h];break;default:if(f[g])throw Error("Invalid diff operation in diff_fromDelta: "+
438f[g]);}}if(e!=a.length)throw Error("Delta length ("+e+") does not equal source text length ("+a.length+").");return c};diff_match_patch.prototype.match_main=function(a,b,c){if(null==a||null==b||null==c)throw Error("Null input. (match_main)");c=Math.max(0,Math.min(c,a.length));return a==b?0:a.length?a.substring(c,c+b.length)==b?c:this.match_bitap_(a,b,c):-1}; 438f[g]);}}if(e!=a.length)throw Error("Delta length ("+e+") does not equal source text length ("+a.length+").");return c};diff_match_patch.prototype.match_main=function(a,b,c){if(null==a||null==b||null==c)throw Error("Null input. (match_main)");c=Math.max(0,Math.min(c,a.length));return a==b?0:a.length?a.substring(c,c+b.length)==b?c:this.match_bitap_(a,b,c):-1};
439diff_match_patch.prototype.match_bitap_=function(a,b,c){function d(a,d){var e=a/b.length,g=Math.abs(c-d);return!f.Match_Distance?g?1:e:e+g/f.Match_Distance}if(b.length>this.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,c);-1!=h&&(g=Math.min(d(0,h),g),h=a.lastIndexOf(b,c+b.length),-1!=h&&(g=Math.min(d(0,h),g)));for(var j=1<<b.length-1,h=-1,i,k,p=b.length+a.length,q,s=0;s<b.length;s++){i=0;for(k=p;i<k;)d(s,c+ 439diff_match_patch.prototype.match_bitap_=function(a,b,c){function d(a,d){var e=a/b.length,g=Math.abs(c-d);return!f.Match_Distance?g?1:e:e+g/f.Match_Distance}if(b.length>this.Match_MaxBits)throw Error("Pattern too long for this browser.");var e=this.match_alphabet_(b),f=this,g=this.Match_Threshold,h=a.indexOf(b,c);-1!=h&&(g=Math.min(d(0,h),g),h=a.lastIndexOf(b,c+b.length),-1!=h&&(g=Math.min(d(0,h),g)));for(var j=1<<b.length-1,h=-1,i,k,p=b.length+a.length,q,s=0;s<b.length;s++){i=0;for(k=p;i<k;)d(s,c+
440k)<=g?i=k:p=k,k=Math.floor((p-i)/2+i);p=k;i=Math.max(1,c-k+1);var o=Math.min(c+k,a.length)+b.length;k=Array(o+2);for(k[o+1]=(1<<s)-1;o>=i;o--){var v=e[a.charAt(o-1)];k[o]=0===s?(k[o+1]<<1|1)&v:(k[o+1]<<1|1)&v|(q[o+1]|q[o])<<1|1|q[o+1];if(k[o]&j&&(v=d(s,o-1),v<=g))if(g=v,h=o-1,h>c)i=Math.max(1,2*c-h);else break}if(d(s+1,c)>g)break;q=k}return h}; 440k)<=g?i=k:p=k,k=Math.floor((p-i)/2+i);p=k;i=Math.max(1,c-k+1);var o=Math.min(c+k,a.length)+b.length;k=Array(o+2);for(k[o+1]=(1<<s)-1;o>=i;o--){var v=e[a.charAt(o-1)];k[o]=0===s?(k[o+1]<<1|1)&v:(k[o+1]<<1|1)&v|(q[o+1]|q[o])<<1|1|q[o+1];if(k[o]&j&&(v=d(s,o-1),v<=g))if(g=v,h=o-1,h>c)i=Math.max(1,2*c-h);else break}if(d(s+1,c)>g)break;q=k}return h};
441diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},c=0;c<a.length;c++)b[a.charAt(c)]=0;for(c=0;c<a.length;c++)b[a.charAt(c)]|=1<<a.length-c-1;return b}; 441diff_match_patch.prototype.match_alphabet_=function(a){for(var b={},c=0;c<a.length;c++)b[a.charAt(c)]=0;for(c=0;c<a.length;c++)b[a.charAt(c)]|=1<<a.length-c-1;return b};
442diff_match_patch.prototype.patch_addContext_=function(a,b){if(0!=b.length){for(var c=b.substring(a.start2,a.start2+a.length1),d=0;b.indexOf(c)!=b.lastIndexOf(c)&&c.length<this.Match_MaxBits-this.Patch_Margin-this.Patch_Margin;)d+=this.Patch_Margin,c=b.substring(a.start2-d,a.start2+a.length1+d);d+=this.Patch_Margin;(c=b.substring(a.start2-d,a.start2))&&a.diffs.unshift([0,c]);(d=b.substring(a.start2+a.length1,a.start2+a.length1+d))&&a.diffs.push([0,d]);a.start1-=c.length;a.start2-=c.length;a.length1+= 442diff_match_patch.prototype.patch_addContext_=function(a,b){if(0!=b.length){for(var c=b.substring(a.start2,a.start2+a.length1),d=0;b.indexOf(c)!=b.lastIndexOf(c)&&c.length<this.Match_MaxBits-this.Patch_Margin-this.Patch_Margin;)d+=this.Patch_Margin,c=b.substring(a.start2-d,a.start2+a.length1+d);d+=this.Patch_Margin;(c=b.substring(a.start2-d,a.start2))&&a.diffs.unshift([0,c]);(d=b.substring(a.start2+a.length1,a.start2+a.length1+d))&&a.diffs.push([0,d]);a.start1-=c.length;a.start2-=c.length;a.length1+=
443c.length+d.length;a.length2+=c.length+d.length}}; 443c.length+d.length;a.length2+=c.length+d.length}};
444diff_match_patch.prototype.patch_make=function(a,b,c){var d;if("string"==typeof a&&"string"==typeof b&&"undefined"==typeof c)d=a,b=this.diff_main(d,b,!0),2<b.length&&(this.diff_cleanupSemantic(b),this.diff_cleanupEfficiency(b));else if(a&&"object"==typeof a&&"undefined"==typeof b&&"undefined"==typeof c)b=a,d=this.diff_text1(b);else if("string"==typeof a&&b&&"object"==typeof b&&"undefined"==typeof c)d=a;else if("string"==typeof a&&"string"==typeof b&&c&&"object"==typeof c)d=a,b=c;else throw Error("Unknown call format to patch_make."); 444diff_match_patch.prototype.patch_make=function(a,b,c){var d;if("string"==typeof a&&"string"==typeof b&&"undefined"==typeof c)d=a,b=this.diff_main(d,b,!0),2<b.length&&(this.diff_cleanupSemantic(b),this.diff_cleanupEfficiency(b));else if(a&&"object"==typeof a&&"undefined"==typeof b&&"undefined"==typeof c)b=a,d=this.diff_text1(b);else if("string"==typeof a&&b&&"object"==typeof b&&"undefined"==typeof c)d=a;else if("string"==typeof a&&"string"==typeof b&&c&&"object"==typeof c)d=a,b=c;else throw Error("Unknown call format to patch_make.");
445if(0===b.length)return[];for(var c=[],a=new diff_match_patch.patch_obj,e=0,f=0,g=0,h=d,j=0;j<b.length;j++){var i=b[j][0],k=b[j][1];if(!e&&0!==i)a.start1=f,a.start2=g;switch(i){case 1:a.diffs[e++]=b[j];a.length2+=k.length;d=d.substring(0,g)+k+d.substring(g);break;case -1:a.length1+=k.length;a.diffs[e++]=b[j];d=d.substring(0,g)+d.substring(g+k.length);break;case 0:k.length<=2*this.Patch_Margin&&e&&b.length!=j+1?(a.diffs[e++]=b[j],a.length1+=k.length,a.length2+=k.length):k.length>=2*this.Patch_Margin&& 445if(0===b.length)return[];for(var c=[],a=new diff_match_patch.patch_obj,e=0,f=0,g=0,h=d,j=0;j<b.length;j++){var i=b[j][0],k=b[j][1];if(!e&&0!==i)a.start1=f,a.start2=g;switch(i){case 1:a.diffs[e++]=b[j];a.length2+=k.length;d=d.substring(0,g)+k+d.substring(g);break;case -1:a.length1+=k.length;a.diffs[e++]=b[j];d=d.substring(0,g)+d.substring(g+k.length);break;case 0:k.length<=2*this.Patch_Margin&&e&&b.length!=j+1?(a.diffs[e++]=b[j],a.length1+=k.length,a.length2+=k.length):k.length>=2*this.Patch_Margin&&
446e&&(this.patch_addContext_(a,h),c.push(a),a=new diff_match_patch.patch_obj,e=0,h=d,f=g)}1!==i&&(f+=k.length);-1!==i&&(g+=k.length)}e&&(this.patch_addContext_(a,h),c.push(a));return c};diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],c=0;c<a.length;c++){var d=a[c],e=new diff_match_patch.patch_obj;e.diffs=[];for(var f=0;f<d.diffs.length;f++)e.diffs[f]=d.diffs[f].slice();e.start1=d.start1;e.start2=d.start2;e.length1=d.length1;e.length2=d.length2;b[c]=e}return b}; 446e&&(this.patch_addContext_(a,h),c.push(a),a=new diff_match_patch.patch_obj,e=0,h=d,f=g)}1!==i&&(f+=k.length);-1!==i&&(g+=k.length)}e&&(this.patch_addContext_(a,h),c.push(a));return c};diff_match_patch.prototype.patch_deepCopy=function(a){for(var b=[],c=0;c<a.length;c++){var d=a[c],e=new diff_match_patch.patch_obj;e.diffs=[];for(var f=0;f<d.diffs.length;f++)e.diffs[f]=d.diffs[f].slice();e.start1=d.start1;e.start2=d.start2;e.length1=d.length1;e.length2=d.length2;b[c]=e}return b};
447diff_match_patch.prototype.patch_apply=function(a,b){if(0==a.length)return[b,[]];var a=this.patch_deepCopy(a),c=this.patch_addPadding(a),b=c+b+c;this.patch_splitMax(a);for(var d=0,e=[],f=0;f<a.length;f++){var g=a[f].start2+d,h=this.diff_text1(a[f].diffs),j,i=-1;if(h.length>this.Match_MaxBits){if(j=this.match_main(b,h.substring(0,this.Match_MaxBits),g),-1!=j&&(i=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==i||j>=i))j=-1}else j=this.match_main(b,h,g); 447diff_match_patch.prototype.patch_apply=function(a,b){if(0==a.length)return[b,[]];var a=this.patch_deepCopy(a),c=this.patch_addPadding(a),b=c+b+c;this.patch_splitMax(a);for(var d=0,e=[],f=0;f<a.length;f++){var g=a[f].start2+d,h=this.diff_text1(a[f].diffs),j,i=-1;if(h.length>this.Match_MaxBits){if(j=this.match_main(b,h.substring(0,this.Match_MaxBits),g),-1!=j&&(i=this.match_main(b,h.substring(h.length-this.Match_MaxBits),g+h.length-this.Match_MaxBits),-1==i||j>=i))j=-1}else j=this.match_main(b,h,g);
448if(-1==j)e[f]=!1,d-=a[f].length2-a[f].length1;else if(e[f]=!0,d=j-g,g=-1==i?b.substring(j,j+h.length):b.substring(j,i+this.Match_MaxBits),h==g)b=b.substring(0,j)+this.diff_text2(a[f].diffs)+b.substring(j+h.length);else if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);for(var h=0,k,i=0;i<a[f].diffs.length;i++){var p=a[f].diffs[i];0!==p[0]&&(k=this.diff_xIndex(g,h));1===p[0]?b=b.substring(0, 448if(-1==j)e[f]=!1,d-=a[f].length2-a[f].length1;else if(e[f]=!0,d=j-g,g=-1==i?b.substring(j,j+h.length):b.substring(j,i+this.Match_MaxBits),h==g)b=b.substring(0,j)+this.diff_text2(a[f].diffs)+b.substring(j+h.length);else if(g=this.diff_main(h,g,!1),h.length>this.Match_MaxBits&&this.diff_levenshtein(g)/h.length>this.Patch_DeleteThreshold)e[f]=!1;else{this.diff_cleanupSemanticLossless(g);for(var h=0,k,i=0;i<a[f].diffs.length;i++){var p=a[f].diffs[i];0!==p[0]&&(k=this.diff_xIndex(g,h));1===p[0]?b=b.substring(0,
449j+k)+p[1]+b.substring(j+k):-1===p[0]&&(b=b.substring(0,j+k)+b.substring(j+this.diff_xIndex(g,h+p[1].length)));-1!==p[0]&&(h+=p[1].length)}}}b=b.substring(c.length,b.length-c.length);return[b,e]}; 449j+k)+p[1]+b.substring(j+k):-1===p[0]&&(b=b.substring(0,j+k)+b.substring(j+this.diff_xIndex(g,h+p[1].length)));-1!==p[0]&&(h+=p[1].length)}}}b=b.substring(c.length,b.length-c.length);return[b,e]};
450diff_match_patch.prototype.patch_addPadding=function(a){for(var b=this.Patch_Margin,c="",d=1;d<=b;d++)c+=String.fromCharCode(d);for(d=0;d<a.length;d++)a[d].start1+=b,a[d].start2+=b;var d=a[0],e=d.diffs;if(0==e.length||0!=e[0][0])e.unshift([0,c]),d.start1-=b,d.start2-=b,d.length1+=b,d.length2+=b;else if(b>e[0][1].length){var f=b-e[0][1].length;e[0][1]=c.substring(e[0][1].length)+e[0][1];d.start1-=f;d.start2-=f;d.length1+=f;d.length2+=f}d=a[a.length-1];e=d.diffs;0==e.length||0!=e[e.length-1][0]?(e.push([0, 450diff_match_patch.prototype.patch_addPadding=function(a){for(var b=this.Patch_Margin,c="",d=1;d<=b;d++)c+=String.fromCharCode(d);for(d=0;d<a.length;d++)a[d].start1+=b,a[d].start2+=b;var d=a[0],e=d.diffs;if(0==e.length||0!=e[0][0])e.unshift([0,c]),d.start1-=b,d.start2-=b,d.length1+=b,d.length2+=b;else if(b>e[0][1].length){var f=b-e[0][1].length;e[0][1]=c.substring(e[0][1].length)+e[0][1];d.start1-=f;d.start2-=f;d.length1+=f;d.length2+=f}d=a[a.length-1];e=d.diffs;0==e.length||0!=e[e.length-1][0]?(e.push([0,
451c]),d.length1+=b,d.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=c.substring(0,f),d.length1+=f,d.length2+=f);return c}; 451c]),d.length1+=b,d.length2+=b):b>e[e.length-1][1].length&&(f=b-e[e.length-1][1].length,e[e.length-1][1]+=c.substring(0,f),d.length1+=f,d.length2+=f);return c};
452diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,c=0;c<a.length;c++)if(!(a[c].length1<=b)){var d=a[c];a.splice(c--,1);for(var e=d.start1,f=d.start2,g="";0!==d.diffs.length;){var h=new diff_match_patch.patch_obj,j=!0;h.start1=e-g.length;h.start2=f-g.length;if(""!==g)h.length1=h.length2=g.length,h.diffs.push([0,g]);for(;0!==d.diffs.length&&h.length1<b-this.Patch_Margin;){var g=d.diffs[0][0],i=d.diffs[0][1];1===g?(h.length2+=i.length,f+=i.length,h.diffs.push(d.diffs.shift()), 452diff_match_patch.prototype.patch_splitMax=function(a){for(var b=this.Match_MaxBits,c=0;c<a.length;c++)if(!(a[c].length1<=b)){var d=a[c];a.splice(c--,1);for(var e=d.start1,f=d.start2,g="";0!==d.diffs.length;){var h=new diff_match_patch.patch_obj,j=!0;h.start1=e-g.length;h.start2=f-g.length;if(""!==g)h.length1=h.length2=g.length,h.diffs.push([0,g]);for(;0!==d.diffs.length&&h.length1<b-this.Patch_Margin;){var g=d.diffs[0][0],i=d.diffs[0][1];1===g?(h.length2+=i.length,f+=i.length,h.diffs.push(d.diffs.shift()),
453j=!1):-1===g&&1==h.diffs.length&&0==h.diffs[0][0]&&i.length>2*b?(h.length1+=i.length,e+=i.length,j=!1,h.diffs.push([g,i]),d.diffs.shift()):(i=i.substring(0,b-h.length1-this.Patch_Margin),h.length1+=i.length,e+=i.length,0===g?(h.length2+=i.length,f+=i.length):j=!1,h.diffs.push([g,i]),i==d.diffs[0][1]?d.diffs.shift():d.diffs[0][1]=d.diffs[0][1].substring(i.length))}g=this.diff_text2(h.diffs);g=g.substring(g.length-this.Patch_Margin);i=this.diff_text1(d.diffs).substring(0,this.Patch_Margin);""!==i&& 453j=!1):-1===g&&1==h.diffs.length&&0==h.diffs[0][0]&&i.length>2*b?(h.length1+=i.length,e+=i.length,j=!1,h.diffs.push([g,i]),d.diffs.shift()):(i=i.substring(0,b-h.length1-this.Patch_Margin),h.length1+=i.length,e+=i.length,0===g?(h.length2+=i.length,f+=i.length):j=!1,h.diffs.push([g,i]),i==d.diffs[0][1]?d.diffs.shift():d.diffs[0][1]=d.diffs[0][1].substring(i.length))}g=this.diff_text2(h.diffs);g=g.substring(g.length-this.Patch_Margin);i=this.diff_text1(d.diffs).substring(0,this.Patch_Margin);""!==i&&
454(h.length1+=i.length,h.length2+=i.length,0!==h.diffs.length&&0===h.diffs[h.diffs.length-1][0]?h.diffs[h.diffs.length-1][1]+=i:h.diffs.push([0,i]));j||a.splice(++c,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],c=0;c<a.length;c++)b[c]=a[c];return b.join("")}; 454(h.length1+=i.length,h.length2+=i.length,0!==h.diffs.length&&0===h.diffs[h.diffs.length-1][0]?h.diffs[h.diffs.length-1][1]+=i:h.diffs.push([0,i]));j||a.splice(++c,0,h)}}};diff_match_patch.prototype.patch_toText=function(a){for(var b=[],c=0;c<a.length;c++)b[c]=a[c];return b.join("")};
455diff_match_patch.prototype.patch_fromText=function(a){var b=[];if(!a)return b;for(var a=a.split("\n"),c=0,d=/^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$/;c<a.length;){var e=a[c].match(d);if(!e)throw Error("Invalid patch string: "+a[c]);var f=new diff_match_patch.patch_obj;b.push(f);f.start1=parseInt(e[1],10);""===e[2]?(f.start1--,f.length1=1):"0"==e[2]?f.length1=0:(f.start1--,f.length1=parseInt(e[2],10));f.start2=parseInt(e[3],10);""===e[4]?(f.start2--,f.length2=1):"0"==e[4]?f.length2=0:(f.start2--,f.length2= 455diff_match_patch.prototype.patch_fromText=function(a){var b=[];if(!a)return b;for(var a=a.split("\n"),c=0,d=/^@@ -(\d+),?(\d*) \+(\d+),?(\d*) @@$/;c<a.length;){var e=a[c].match(d);if(!e)throw Error("Invalid patch string: "+a[c]);var f=new diff_match_patch.patch_obj;b.push(f);f.start1=parseInt(e[1],10);""===e[2]?(f.start1--,f.length1=1):"0"==e[2]?f.length1=0:(f.start1--,f.length1=parseInt(e[2],10));f.start2=parseInt(e[3],10);""===e[4]?(f.start2--,f.length2=1):"0"==e[4]?f.length2=0:(f.start2--,f.length2=
456parseInt(e[4],10));for(c++;c<a.length;){e=a[c].charAt(0);try{var g=decodeURI(a[c].substring(1))}catch(h){throw Error("Illegal escape in patch_fromText: "+g);}if("-"==e)f.diffs.push([-1,g]);else if("+"==e)f.diffs.push([1,g]);else if(" "==e)f.diffs.push([0,g]);else if("@"==e)break;else if(""!==e)throw Error('Invalid patch mode "'+e+'" in: '+g);c++}}return b};diff_match_patch.patch_obj=function(){this.diffs=[];this.start2=this.start1=null;this.length2=this.length1=0}; 456parseInt(e[4],10));for(c++;c<a.length;){e=a[c].charAt(0);try{var g=decodeURI(a[c].substring(1))}catch(h){throw Error("Illegal escape in patch_fromText: "+g);}if("-"==e)f.diffs.push([-1,g]);else if("+"==e)f.diffs.push([1,g]);else if(" "==e)f.diffs.push([0,g]);else if("@"==e)break;else if(""!==e)throw Error('Invalid patch mode "'+e+'" in: '+g);c++}}return b};diff_match_patch.patch_obj=function(){this.diffs=[];this.start2=this.start1=null;this.length2=this.length1=0};
457diff_match_patch.patch_obj.prototype.toString=function(){var a,b;a=0===this.length1?this.start1+",0":1==this.length1?this.start1+1:this.start1+1+","+this.length1;b=0===this.length2?this.start2+",0":1==this.length2?this.start2+1:this.start2+1+","+this.length2;a=["@@ -"+a+" +"+b+" @@\n"];var c;for(b=0;b<this.diffs.length;b++){switch(this.diffs[b][0]){case 1:c="+";break;case -1:c="-";break;case 0:c=" "}a[b+1]=c+encodeURI(this.diffs[b][1])+"\n"}return a.join("").replace(/%20/g," ")}; 457diff_match_patch.patch_obj.prototype.toString=function(){var a,b;a=0===this.length1?this.start1+",0":1==this.length1?this.start1+1:this.start1+1+","+this.length1;b=0===this.length2?this.start2+",0":1==this.length2?this.start2+1:this.start2+1+","+this.length2;a=["@@ -"+a+" +"+b+" @@\n"];var c;for(b=0;b<this.diffs.length;b++){switch(this.diffs[b][0]){case 1:c="+";break;case -1:c="-";break;case 0:c=" "}a[b+1]=c+encodeURI(this.diffs[b][1])+"\n"}return a.join("").replace(/%20/g," ")};
458this.diff_match_patch=diff_match_patch;this.DIFF_DELETE=-1;this.DIFF_INSERT=1;this.DIFF_EQUAL=0;})() 458this.diff_match_patch=diff_match_patch;this.DIFF_DELETE=-1;this.DIFF_INSERT=1;this.DIFF_EQUAL=0;})()
459var dmp = new diff_match_patch(); function diffLaunch(){var text1 = document.getElementById('text').value; var text2 = document.getElementById('text2').value; dmp.Diff_Timeout = 0; dmp.Diff_EditCost = 4; var d = dmp.diff_main(text1, text2); var ds = dmp.diff_prettyHtml(d); document.getElementById('diff').innerHTML = ds; 459var dmp = new diff_match_patch(); function diffLaunch(){var text1 = document.getElementById('text').value; var text2 = document.getElementById('text2').value; dmp.Diff_Timeout = 0; dmp.Diff_EditCost = 4; var d = dmp.diff_main(text1, text2); var ds = dmp.diff_prettyHtml(d); document.getElementById('diff').innerHTML = ds;
460} 460}
461//--><!]]></script> 461//--><!]]></script>
462<title>htmLawed (<?php echo hl_version();?>) test</title> 462<title>htmLawed (<?php echo hl_version();?>) test</title>
463</head> 463</head>
464<body> 464<body>
465<div id="topmost"> 465<div id="topmost">
466 466
467<h5 style="float: left; display: inline; margin-top: 0; margin-bottom: 5px;"><a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/index.php" title="htmLawed home">HTM<big><big>L</big></big>AWED</a> <?php echo hl_version();?> <a href="htmLawedTest.php" title="test home">TEST</a></h5> 467<h5 style="float: left; display: inline; margin-top: 0; margin-bottom: 5px;"><a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/index.php" title="htmLawed home">HTM<big><big>L</big></big>AWED</a> <?php echo hl_version();?> <a href="htmLawedTest.php" title="test home">TEST</a></h5>
468<span style="float: right;" class="help"><a href="htmLawed_README.htm"><span class="notice">htm</span></a> / <a href="htmLawed_README.txt"><span class="notice">txt</span></a> documentation</span><br style="clear:both;" /> 468<span style="float: right;" class="help"><a href="htmLawed_README.htm"><span class="notice">htm</span></a> / <a href="htmLawed_README.txt"><span class="notice">txt</span></a> documentation</span><br style="clear:both;" />
469 469
470<a href="htmLawedTest.php" title="[toggle visibility] type or copy-paste" onclick="javascript:toggle('inputF'); return false;"><span class="notice">Input &raquo;</span> <span class="help" title="limit lower with multibyte characters<?php echo (($_hlimit < $_limit && $_hlimit)? '; limit is '. $_hlimit. ' for viewing binaries' : ''); ?>"><small>(max. <?php echo htmlspecialchars($_limit);?> chars)</small></span></a> 470<a href="htmLawedTest.php" title="[toggle visibility] type or copy-paste" onclick="javascript:toggle('inputF'); return false;"><span class="notice">Input &raquo;</span> <span class="help" title="limit lower with multibyte characters<?php echo (($_hlimit < $_limit && $_hlimit)? '; limit is '. $_hlimit. ' for viewing binaries' : ''); ?>"><small>(max. <?php echo htmlspecialchars($_limit);?> chars)</small></span></a>
471 471
472<form id="testform" name="testform" action="htmLawedTest.php" method="post" accept-charset="<?php echo htmlspecialchars($_POST['enc']); ?>" style="padding:0; margin: 0; display:inline;"> 472<form id="testform" name="testform" action="htmLawedTest.php" method="post" accept-charset="<?php echo htmlspecialchars($_POST['enc']); ?>" style="padding:0; margin: 0; display:inline;">
473 473
474<div id="inputF" style="display: block;"> 474<div id="inputF" style="display: block;">
475 475
476<input type="hidden" name="token" id="token" value="<?php echo $token; ?>" /> 476<input type="hidden" name="token" id="token" value="<?php echo $token; ?>" />
477<div><textarea id="text" class="textarea" name="text" rows="5" cols="100" style="width: 100%;"><?php echo htmlspecialchars($_POST['text']);?></textarea></div> 477<div><textarea id="text" class="textarea" name="text" rows="5" cols="100" style="width: 100%;"><?php echo htmlspecialchars($_POST['text']);?></textarea></div>
478<input type="submit" id="submitF" name="submitF" value="Process" style="float:left;" title="filter using htmLawed" onclick="javascript: sndProc(); return false;" onkeypress="javascript: sndProc(); return false;" /> 478<input type="submit" id="submitF" name="submitF" value="Process" style="float:left;" title="filter using htmLawed" onclick="javascript: sndProc(); return false;" onkeypress="javascript: sndProc(); return false;" />
479 479
480<?php 480<?php
481if($do){ 481if($do){
482 if($validation){ 482 if($validation){
483 echo '<input type="hidden" value="1" name="w3c_validate" id="w3c_validate" />'; 483 echo '<input type="hidden" value="1" name="w3c_validate" id="w3c_validate" />';
484 } 484 }
485?> 485?>
486 486
487<button type="button" title="Raw input rendered as web-page without a doctype or charset declaration" style="float: right;" onclick="javascript: sndUnproc(); return false;" onkeypress="javascript: sndUnproc(); return false;">Render in webpage</button> 487<button type="button" title="Raw input rendered as web-page without a doctype or charset declaration" style="float: right;" onclick="javascript: sndUnproc(); return false;" onkeypress="javascript: sndUnproc(); return false;">Render in webpage</button>
488<button type="button" onclick="javascript:document.getElementById('text').focus();document.getElementById('text').select()" title="select all to copy" style="float:right;">Select all</button> 488<button type="button" onclick="javascript:document.getElementById('text').focus();document.getElementById('text').select()" title="select all to copy" style="float:right;">Select all</button>
489 489
490<?php 490<?php
491if($_w3c_validate && $validation){ 491if($_w3c_validate && $validation){
492?> 492?>
493 493
494<button type="button" title="HTML 4.01 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text', 'html401'); return false;" onkeypress="javascript: sndValidn('text', 'html401'); return false;">Check HTML</button> 494<button type="button" title="HTML 4.01 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text', 'html401'); return false;" onkeypress="javascript: sndValidn('text', 'html401'); return false;">Check HTML</button>
495<button type="button" title="XHTML 1.1 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text', 'xhtml110'); return false;" onkeypress="javascript: sndValidn('text', 'xhtml110'); return false;">Check XHTML</button> 495<button type="button" title="XHTML 1.1 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text', 'xhtml110'); return false;" onkeypress="javascript: sndValidn('text', 'xhtml110'); return false;">Check XHTML</button>
496 496
497<?php 497<?php
498 } 498 }
499} 499}
500else{ 500else{
501 if($_w3c_validate){ 501 if($_w3c_validate){
502 echo '<span style="float: right;" class="help" title="for direct submission of input or output code to W3C validator for (X)HTML validation"><span style="font-size: 85%;">&nbsp;Validator tools: </span><input type="checkbox" value="1" name="w3c_validate" id="w3c_validate" style="vertical-align: middle;"', ($validation ? ' checked="checked"' : ''), ' /></span>'; 502 echo '<span style="float: right;" class="help" title="for direct submission of input or output code to W3C validator for (X)HTML validation"><span style="font-size: 85%;">&nbsp;Validator tools: </span><input type="checkbox" value="1" name="w3c_validate" id="w3c_validate" style="vertical-align: middle;"', ($validation ? ' checked="checked"' : ''), ' /></span>';
503 } 503 }
504} 504}
505?> 505?>
506 506
507<span style="float:right;" class="help" title="IANA-recognized name of the input character-set; can be multiple ;- or space-separated values; may not work in some browsers"><span style="font-size: 85%;">Encoding: </span><input type="text" size="8" id="enc" name="enc" style="vertical-align: middle;" value="<?php echo htmlspecialchars($_POST['enc']); ?>" /></span> 507<span style="float:right;" class="help" title="IANA-recognized name of the input character-set; can be multiple ;- or space-separated values; may not work in some browsers"><span style="font-size: 85%;">Encoding: </span><input type="text" size="8" id="enc" name="enc" style="vertical-align: middle;" value="<?php echo htmlspecialchars($_POST['enc']); ?>" /></span>
508 508
509</div> 509</div>
510<br style="clear:both;" /> 510<br style="clear:both;" />
511 511
512<?php 512<?php
513if($limit_exceeded){ 513if($limit_exceeded){
514 echo '<br /><strong>Input text is too long!</strong><br />'; 514 echo '<br /><strong>Input text is too long!</strong><br />';
515} 515}
516?> 516?>
517 517
518<br /> 518<br />
519 519
520<a href="htmLawedTest.php" title="[toggle visibility] htmLawed configuration" onclick="javascript:toggle('inputC'); return false;"><span class="notice">Settings &raquo;</span></a> 520<a href="htmLawedTest.php" title="[toggle visibility] htmLawed configuration" onclick="javascript:toggle('inputC'); return false;"><span class="notice">Settings &raquo;</span></a>
521 521
522<div id="inputC" style="display: none;"> 522<div id="inputC" style="display: none;">
523<table summary="none"> 523<table summary="none">
524<tr> 524<tr>
525<td><span class="help" title="$config argument">Config:</span></td> 525<td><span class="help" title="$config argument">Config:</span></td>
526<td><ul> 526<td><ul>
527 527
528<?php 528<?php
529$cfg = array( 529$cfg = array(
530'abs_url'=>array('3', '0', 'absolute/relative URL conversion', '-1'), 530'abs_url'=>array('3', '0', 'absolute/relative URL conversion', '-1'),
531'and_mark'=>array('2', '0', 'mark original <em>&amp;</em> chars', '0', 'd'=>1), // 'd' to disable 531'and_mark'=>array('2', '0', 'mark original <em>&amp;</em> chars', '0', 'd'=>1), // 'd' to disable
532'anti_link_spam'=>array('1', '0', 'modify <em>href</em> values as an anti-link spam measure', '0', array(array('30', '1', '', 'regex for extra <em>rel</em>'), array('30', '2', '', 'regex for no <em>href</em>'))), 532'anti_link_spam'=>array('1', '0', 'modify <em>href</em> values as an anti-link spam measure', '0', array(array('30', '1', '', 'regex for extra <em>rel</em>'), array('30', '2', '', 'regex for no <em>href</em>'))),
533'anti_mail_spam'=>array('1', '0', 'replace <em>@</em> in <em>mailto:</em> URLs', '0', '8', 'NO@SPAM', 'replacement'), 533'anti_mail_spam'=>array('1', '0', 'replace <em>@</em> in <em>mailto:</em> URLs', '0', '8', 'NO@SPAM', 'replacement'),
534'balance'=>array('2', '1', 'fix nestings and balance tags', '0'), 534'balance'=>array('2', '1', 'fix nestings and balance tags', '0'),
535'base_url'=>array('', '', 'base URL', '25'), 535'base_url'=>array('', '', 'base URL', '25'),
536'cdata'=>array('4', 'nil', 'allow <em>CDATA</em> sections', 'nil'), 536'cdata'=>array('4', 'nil', 'allow <em>CDATA</em> sections', 'nil'),
537'clean_ms_char'=>array('3', '0', 'replace bad characters introduced by Microsoft apps. like <em>Word</em>', '0'), 537'clean_ms_char'=>array('3', '0', 'replace bad characters introduced by Microsoft apps. like <em>Word</em>', '0'),
538'comment'=>array('4', 'nil', 'allow HTML comments', 'nil'), 538'comment'=>array('4', 'nil', 'allow HTML comments', 'nil'),
539'css_expression'=>array('2', 'nil', 'allow dynamic expressions in CSS style properties', 'nil'), 539'css_expression'=>array('2', 'nil', 'allow dynamic expressions in CSS style properties', 'nil'),
540'deny_attribute'=>array('1', '0', 'denied attributes', '0', '50', '', 'these'), 540'deny_attribute'=>array('1', '0', 'denied attributes', '0', '50', '', 'these'),
541'direct_list_nest'=>array('2', 'nil', 'allow direct nesting of a list within another without requiring it to be a list item', 'nil'), 541'direct_list_nest'=>array('2', 'nil', 'allow direct nesting of a list within another without requiring it to be a list item', 'nil'),
542'elements'=>array('', '', 'allowed elements', '50'), 542'elements'=>array('', '', 'allowed elements', '50'),
543'hexdec_entity'=>array('3', '1', 'convert hexadecimal numeric entities to decimal ones, or vice versa', '0'), 543'hexdec_entity'=>array('3', '1', 'convert hexadecimal numeric entities to decimal ones, or vice versa', '0'),
544'hook'=>array('', '', 'name of hook function', '25'), 544'hook'=>array('', '', 'name of hook function', '25'),
545'hook_tag'=>array('', '', 'name of custom function to further check attribute values', '25'), 545'hook_tag'=>array('', '', 'name of custom function to further check attribute values', '25'),
546'keep_bad'=>array('7', '6', 'keep, or remove <em>bad</em> tag content', '0'), 546'keep_bad'=>array('7', '6', 'keep, or remove <em>bad</em> tag content', '0'),
547'lc_std_val'=>array('2', '1', 'lower-case std. attribute values like <em>radio</em>', '0'), 547'lc_std_val'=>array('2', '1', 'lower-case std. attribute values like <em>radio</em>', '0'),
548'make_tag_strict'=>array('3', 'nil', 'transform deprecated elements', 'nil'), 548'make_tag_strict'=>array('3', 'nil', 'transform deprecated elements', 'nil'),
549'named_entity'=>array('2', '1', 'allow named entities, or convert numeric ones', '0'), 549'named_entity'=>array('2', '1', 'allow named entities, or convert numeric ones', '0'),
550'no_deprecated_attr'=>array('3', '1', 'allow deprecated attributes, or transform them', '0'), 550'no_deprecated_attr'=>array('3', '1', 'allow deprecated attributes, or transform them', '0'),
551'parent'=>array('', 'div', 'name of parent element', '25'), 551'parent'=>array('', 'div', 'name of parent element', '25'),
552'safe'=>array('2', '0', 'for most <em>safe</em> HTML', '0'), 552'safe'=>array('2', '0', 'for most <em>safe</em> HTML', '0'),
553'schemes'=>array('', 'href: aim, app, feed, file, ftp, gopher, http, https, irc, javascript, mailto, news, nntp, sftp, ssh, telnet, tel; *:data, file, http, https, javascript', 'allowed URL protocols', '50'), 553'schemes'=>array('', 'href: aim, app, feed, file, ftp, gopher, http, https, irc, javascript, mailto, news, nntp, sftp, ssh, telnet, tel; *:data, file, http, https, javascript', 'allowed URL protocols', '50'),
554'show_setting'=>array('', 'htmLawed_setting', 'variable name to record <em>finalized</em> htmLawed settings', '25', 'd'=>1), 554'show_setting'=>array('', 'htmLawed_setting', 'variable name to record <em>finalized</em> htmLawed settings', '25', 'd'=>1),
555'style_pass'=>array('2', 'nil', 'do not look at <em>style</em> attribute values', 'nil'), 555'style_pass'=>array('2', 'nil', 'do not look at <em>style</em> attribute values', 'nil'),
556'tidy'=>array('3', '0', 'beautify/compact', '-1', '8', '1t1', 'format'), 556'tidy'=>array('3', '0', 'beautify/compact', '-1', '8', '1t1', 'format'),
557'unique_ids'=>array('2', '1', 'unique <em>id</em> values', '0', '8', 'my_', 'prefix'), 557'unique_ids'=>array('2', '1', 'unique <em>id</em> values', '0', '8', 'my_', 'prefix'),
558'valid_xhtml'=>array('2', 'nil', 'auto-set various parameters for most valid XHTML', 'nil'), 558'valid_xhtml'=>array('2', 'nil', 'auto-set various parameters for most valid XHTML', 'nil'),
559'xml:lang'=>array('3', 'nil', 'auto-add <em>xml:lang</em> attribute', '0'), 559'xml:lang'=>array('3', 'nil', 'auto-add <em>xml:lang</em> attribute', '0'),
560); 560);
561foreach($cfg as $k=>$v){ 561foreach($cfg as $k=>$v){
562 echo '<li>', $k, ': '; 562 echo '<li>', $k, ': ';
563 if(!empty($v[0])){ // input radio 563 if(!empty($v[0])){ // input radio
564 $j = $v[3]; 564 $j = $v[3];
565 for($i = $j-1; ++$i < $v[0]+$v[3];++$j){ 565 for($i = $j-1; ++$i < $v[0]+$v[3];++$j){
566 echo '<input type="radio" name="h', $k, '" value="', $i, '"', (!isset($_POST['h'. $k]) ? ($v[1] == $i ? ' checked="checked"' : '') : ($_POST['h'. $k] == $i ? ' checked="checked"' : '')), (isset($v['d']) ? ' disabled="disabled"' : ''), ' />', $i, ' '; 566 echo '<input type="radio" name="h', $k, '" value="', $i, '"', (!isset($_POST['h'. $k]) ? ($v[1] == $i ? ' checked="checked"' : '') : ($_POST['h'. $k] == $i ? ' checked="checked"' : '')), (isset($v['d']) ? ' disabled="disabled"' : ''), ' />', $i, ' ';
567 } 567 }
568 if($v[1] == 'nil'){ 568 if($v[1] == 'nil'){
569 echo '<input type="radio" name="h', $k, '" value="nil"', ((!isset($_POST['h'. $k]) or $_POST['h'. $k] == 'nil') ? ' checked="checked"' : ''), (isset($v['d']) ? ' disabled="disabled"' : ''), ' />not set '; 569 echo '<input type="radio" name="h', $k, '" value="nil"', ((!isset($_POST['h'. $k]) or $_POST['h'. $k] == 'nil') ? ' checked="checked"' : ''), (isset($v['d']) ? ' disabled="disabled"' : ''), ' />not set ';
570 } 570 }
571 if(!empty($v[4])){ // + input text box 571 if(!empty($v[4])){ // + input text box
572 echo '<input type="radio" name="h', $k, '" value="', $j, '"', (((isset($_POST['h'. $k]) && $_POST['h'. $k] == $j) or (!isset($_POST['h'. $k]) && $j == $v[1])) ? ' checked="checked"' : ''), (isset($v['d']) ? ' disabled="disabled"' : ''), ' />'; 572 echo '<input type="radio" name="h', $k, '" value="', $j, '"', (((isset($_POST['h'. $k]) && $_POST['h'. $k] == $j) or (!isset($_POST['h'. $k]) && $j == $v[1])) ? ' checked="checked"' : ''), (isset($v['d']) ? ' disabled="disabled"' : ''), ' />';
573 if(!is_array($v[4])){ 573 if(!is_array($v[4])){
574 echo $v[6], ': <input type="text" size="', $v[4], '" name="h', $k. $j, '" value="', htmlspecialchars(isset($_POST['h'. $k. $j][0]) ? $_POST['h'. $k. $j] : $v[5]), '"', (isset($v['d']) ? ' disabled="disabled"' : ''), ' />'; 574 echo $v[6], ': <input type="text" size="', $v[4], '" name="h', $k. $j, '" value="', htmlspecialchars(isset($_POST['h'. $k. $j][0]) ? $_POST['h'. $k. $j] : $v[5]), '"', (isset($v['d']) ? ' disabled="disabled"' : ''), ' />';
575 } 575 }
576 else{ 576 else{
577 foreach($v[4] as $z){ 577 foreach($v[4] as $z){
578 echo ' ', $z[3], ': <input type="text" size="', $z[0], '" name="h', $k. $j. $z[1], '" value="', htmlspecialchars(isset($_POST['h'. $k. $j. $z[1]][0]) ? $_POST['h'. $k. $j. $z[1]] : $z[2]), '"', (isset($v['d']) ? ' disabled="disabled"' : ''), ' />'; 578 echo ' ', $z[3], ': <input type="text" size="', $z[0], '" name="h', $k. $j. $z[1], '" value="', htmlspecialchars(isset($_POST['h'. $k. $j. $z[1]][0]) ? $_POST['h'. $k. $j. $z[1]] : $z[2]), '"', (isset($v['d']) ? ' disabled="disabled"' : ''), ' />';
579 } 579 }
580 } 580 }
581 } 581 }
582 } 582 }
583 elseif(ctype_digit($v[3])){ // input text 583 elseif(ctype_digit($v[3])){ // input text
584 echo '<input type="text" size="', $v[3], '" name="h', $k, '" value="', htmlspecialchars(isset($_POST['h'. $k][0]) ? $_POST['h'. $k] : $v[1]), '"', (isset($v['d']) ? ' disabled="disabled"' : ''), ' />'; 584 echo '<input type="text" size="', $v[3], '" name="h', $k, '" value="', htmlspecialchars(isset($_POST['h'. $k][0]) ? $_POST['h'. $k] : $v[1]), '"', (isset($v['d']) ? ' disabled="disabled"' : ''), ' />';
585 } 585 }
586 else{} // text-area 586 else{} // text-area
587 echo ' <span class="help">', $v[2], '</span></li>'; 587 echo ' <span class="help">', $v[2], '</span></li>';
588} 588}
589echo '</ul></td></tr><tr><td><span style="vertical-align: top;" class="help" title="$spec argument: element-specific attribute rules">Spec:</span></td><td><textarea name="spec" id="spec" cols="70" rows="3" style="width:80%;">', htmlspecialchars((isset($_POST['spec']) ? $_POST['spec'] : '')), '</textarea></td></tr></table>'; 589echo '</ul></td></tr><tr><td><span style="vertical-align: top;" class="help" title="$spec argument: element-specific attribute rules">Spec:</span></td><td><textarea name="spec" id="spec" cols="70" rows="3" style="width:80%;">', htmlspecialchars((isset($_POST['spec']) ? $_POST['spec'] : '')), '</textarea></td></tr></table>';
590?> 590?>
591 591
592</div> 592</div>
593</form> 593</form>
594 594
595<?php 595<?php
596if($do){ 596if($do){
597 $cfg = array(); 597 $cfg = array();
598 foreach($_POST as $k=>$v){ 598 foreach($_POST as $k=>$v){
599 if($k[0] == 'h' && $v != 'nil'){ 599 if($k[0] == 'h' && $v != 'nil'){
600 $cfg[substr($k, 1)] = $v; 600 $cfg[substr($k, 1)] = $v;
601 } 601 }
602 } 602 }
603 603
604 if(isset($cfg['anti_link_spam']) && $cfg['anti_link_spam'] && (!empty($cfg['anti_link_spam11']) or !empty($cfg['anti_link_spam12']))){ 604 if(isset($cfg['anti_link_spam']) && $cfg['anti_link_spam'] && (!empty($cfg['anti_link_spam11']) or !empty($cfg['anti_link_spam12']))){
605 $cfg['anti_link_spam'] = array($cfg['anti_link_spam11'], $cfg['anti_link_spam12']); 605 $cfg['anti_link_spam'] = array($cfg['anti_link_spam11'], $cfg['anti_link_spam12']);
606 } 606 }
607 unset($cfg['anti_link_spam11'], $cfg['anti_link_spam12']); 607 unset($cfg['anti_link_spam11'], $cfg['anti_link_spam12']);
608 if(isset($cfg['anti_mail_spam']) && $cfg['anti_mail_spam'] == 1){ 608 if(isset($cfg['anti_mail_spam']) && $cfg['anti_mail_spam'] == 1){
609 $cfg['anti_mail_spam'] = isset($cfg['anti_mail_spam1'][0]) ? $cfg['anti_mail_spam1'] : 0; 609 $cfg['anti_mail_spam'] = isset($cfg['anti_mail_spam1'][0]) ? $cfg['anti_mail_spam1'] : 0;
610 } 610 }
611 unset($cfg['anti_mail_spam11']); 611 unset($cfg['anti_mail_spam11']);
612 if(isset($cfg['deny_attribute']) && $cfg['deny_attribute'] == 1){ 612 if(isset($cfg['deny_attribute']) && $cfg['deny_attribute'] == 1){
613 $cfg['deny_attribute'] = isset($cfg['deny_attribute1'][0]) ? $cfg['deny_attribute1'] : 0; 613 $cfg['deny_attribute'] = isset($cfg['deny_attribute1'][0]) ? $cfg['deny_attribute1'] : 0;
614 } 614 }
615 unset($cfg['deny_attribute1']); 615 unset($cfg['deny_attribute1']);
616 if(isset($cfg['tidy']) && $cfg['tidy'] == 2){ 616 if(isset($cfg['tidy']) && $cfg['tidy'] == 2){
617 $cfg['tidy'] = isset($cfg['tidy2'][0]) ? $cfg['tidy2'] : 0; 617 $cfg['tidy'] = isset($cfg['tidy2'][0]) ? $cfg['tidy2'] : 0;
618 } 618 }
619 unset($cfg['tidy2']); 619 unset($cfg['tidy2']);
620 if(isset($cfg['unique_ids']) && $cfg['unique_ids'] == 2){ 620 if(isset($cfg['unique_ids']) && $cfg['unique_ids'] == 2){
621 $cfg['unique_ids'] = isset($cfg['unique_ids2'][0]) ? $cfg['unique_ids2'] : 1; 621 $cfg['unique_ids'] = isset($cfg['unique_ids2'][0]) ? $cfg['unique_ids2'] : 1;
622 } 622 }
623 unset($cfg['unique_ids2']); 623 unset($cfg['unique_ids2']);
624 unset($cfg['and_mark']); // disabling and_mark 624 unset($cfg['and_mark']); // disabling and_mark
625 625
626 $cfg['show_setting'] = 'hlcfg'; 626 $cfg['show_setting'] = 'hlcfg';
627 $st = microtime(); 627 $st = microtime();
628 $out = htmLawed($_POST['text'], $cfg, $_POST['spec']); 628 $out = htmLawed($_POST['text'], $cfg, $_POST['spec']);
629 $et = microtime(); 629 $et = microtime();
630 echo '<br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'inputR\'); return false;"><span class="notice">Input code &raquo;</span></a> <span class="help" title="tags estimated as half of total &gt; and &lt; chars; values may be inaccurate for non-ASCII text"><small><big>', strlen($_POST['text']), '</big> chars, ~<big>', ($tag = round((substr_count($_POST['text'], '>') + substr_count($_POST['text'], '<'))/2)), '</big> tag', ($tag > 1 ? 's' : ''), '</small>&nbsp;</span><div id="inputR" style="display: none;">', format($_POST['text']), '</div><script type="text/javascript">hl(\'inputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? ' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'inputD\'); return false;"><span class="notice">Input binary &raquo;&nbsp;</span></a><div id="inputD" style="display: none;">'. hexdump($_POST['text']). 
'</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] finalized internal settings as interpreted by htmLawed; for developers" onclick="javascript:toggle(\'settingF\'); return false;"><span class="notice">Finalized internal settings &raquo;&nbsp;</span></a> <div id="settingF" style="display: none;">$config: ', str_replace(array(' ', "\t", ' '), array(' ', '&nbsp; ', '&nbsp; '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['config'], true)))), '<br />$spec: ', str_replace(array(' ', "\t", ' '), array(' ', '&nbsp; ', '&nbsp; '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['spec'], true)))), '</div><script type="text/javascript">hl(\'settingF\');</script>', '<br /><a href="htmLawedTest.php" title="[toggle visibility] suitable for copy-paste" onclick="javascript:toggle(\'outputF\'); return false;"><span class="notice">Output &raquo;</span></a> <span class="help" title="approx., server-specific value excluding the \'include()\' call"><small>htmLawed processing time <big>', number_format(((substr($et,0,9)) + (substr($et,-10)) - (substr($st,0,9)) - (substr($st,-10))),4), '</big> s</small></span>', (($mem = memory_get_peak_usage()) !== false ? '<span class="help"><small>, peak memory usage <big>'. round(($mem-$pre_mem)/1048576, 2). 
'</big> <small>MB</small>' : ''), '</small></span><div id="outputF" style="display: block;"><div><textarea id="text2" class="textarea" name="text2" rows="5" cols="100" style="width: 100%;">', htmlspecialchars($out), '</textarea></div><button type="button" title="Filtered input rendered as web-page without a doctype or charset declaration" style="float: right;" onclick="javascript: sndProc2(); return false;" onkeypress="javascript: sndProc2(); return false;">Render in webpage</button><button type="button" onclick="javascript:document.getElementById(\'text2\').focus();document.getElementById(\'text2\').select()" title="select all to copy" style="float:right;">Select all</button>'; 630 echo '<br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'inputR\'); return false;"><span class="notice">Input code &raquo;</span></a> <span class="help" title="tags estimated as half of total &gt; and &lt; chars; values may be inaccurate for non-ASCII text"><small><big>', strlen($_POST['text']), '</big> chars, ~<big>', ($tag = round((substr_count($_POST['text'], '>') + substr_count($_POST['text'], '<'))/2)), '</big> tag', ($tag > 1 ? 's' : ''), '</small>&nbsp;</span><div id="inputR" style="display: none;">', format($_POST['text']), '</div><script type="text/javascript">hl(\'inputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? ' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'inputD\'); return false;"><span class="notice">Input binary &raquo;&nbsp;</span></a><div id="inputD" style="display: none;">'. hexdump($_POST['text']). 
'</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] finalized internal settings as interpreted by htmLawed; for developers" onclick="javascript:toggle(\'settingF\'); return false;"><span class="notice">Finalized internal settings &raquo;&nbsp;</span></a> <div id="settingF" style="display: none;">$config: ', str_replace(array(' ', "\t", ' '), array(' ', '&nbsp; ', '&nbsp; '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['config'], true)))), '<br />$spec: ', str_replace(array(' ', "\t", ' '), array(' ', '&nbsp; ', '&nbsp; '), nl2br(htmlspecialchars(print_r($GLOBALS['hlcfg']['spec'], true)))), '</div><script type="text/javascript">hl(\'settingF\');</script>', '<br /><a href="htmLawedTest.php" title="[toggle visibility] suitable for copy-paste" onclick="javascript:toggle(\'outputF\'); return false;"><span class="notice">Output &raquo;</span></a> <span class="help" title="approx., server-specific value excluding the \'include()\' call"><small>htmLawed processing time <big>', number_format(((substr($et,0,9)) + (substr($et,-10)) - (substr($st,0,9)) - (substr($st,-10))),4), '</big> s</small></span>', (($mem = memory_get_peak_usage()) !== false ? '<span class="help"><small>, peak memory usage <big>'. round(($mem-$pre_mem)/1048576, 2). '</big> <small>MB</small>' : ''), '</small></span><div id="outputF" style="display: block;"><div><textarea id="text2" class="textarea" name="text2" rows="5" cols="100" style="width: 100%;">', htmlspecialchars($out), '</textarea></div><button type="button" title="Filtered input rendered as web-page without a doctype or charset declaration" style="float: right;" onclick="javascript: sndProc2(); return false;" onkeypress="javascript: sndProc2(); return false;">Render in webpage</button><button type="button" onclick="javascript:document.getElementById(\'text2\').focus();document.getElementById(\'text2\').select()" title="select all to copy" style="float:right;">Select all</button>';
631 if($_w3c_validate && $validation) 631 if($_w3c_validate && $validation)
632 { 632 {
633?> 633?>
634 634
635<button type="button" title="HTML 4.01 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text2', 'html401'); return false;" onkeypress="javascript: sndValidn('text2', 'html401'); return false;">Check HTML</button> 635<button type="button" title="HTML 4.01 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text2', 'html401'); return false;" onkeypress="javascript: sndValidn('text2', 'html401'); return false;">Check HTML</button>
636<button type="button" title="XHTML 1.1 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text2', 'xhtml110'); return false;" onkeypress="javascript: sndValidn('text2', 'xhtml110'); return false;">Check XHTML</button> 636<button type="button" title="XHTML 1.1 W3C online validation" style="float: right;" onclick="javascript: sndValidn('text2', 'xhtml110'); return false;" onkeypress="javascript: sndValidn('text2', 'xhtml110'); return false;">Check XHTML</button>
637 637
638<?php 638<?php
639 } 639 }
640 echo '</div><br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'outputR\'); return false;"><span class="notice">Output code &raquo;</span></a><div id="outputR" style="display: block;">', format($out), '</div><script type="text/javascript">hl(\'outputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? ' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'outputD\'); return false;"><span class="notice">Output binary &raquo;</span></a><div id="outputD" style="display: none;">'. hexdump($out). '</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] inline output-input diff; might not be perfectly accurate, semantically or otherwise " onclick="javascript:toggle(\'diff\'); diffLaunch(); return false;"><span class="notice">Diff &raquo;</span></a> <div id="diff" style="display: none;"></div><br /><a href="htmLawedTest.php" title="[toggle visibility] XHTML 1 Transitional doctype" onclick="javascript:toggle(\'outputH\'); return false;">'; 640 echo '</div><br /><a href="htmLawedTest.php" title="[toggle visibility] syntax-highlighted" onclick="javascript:toggle(\'outputR\'); return false;"><span class="notice">Output code &raquo;</span></a><div id="outputR" style="display: block;">', format($out), '</div><script type="text/javascript">hl(\'outputR\');</script>', (!isset($_POST['text'][$_hlimit]) ? ' <a href="htmLawedTest.php" title="[toggle visibility] hexdump; non-viewable characters like line-returns are shown as dots" onclick="javascript:toggle(\'outputD\'); return false;"><span class="notice">Output binary &raquo;</span></a><div id="outputD" style="display: none;">'. hexdump($out). 
'</div>' : ''), ' <a href="htmLawedTest.php" title="[toggle visibility] inline output-input diff; might not be perfectly accurate, semantically or otherwise " onclick="javascript:toggle(\'diff\'); diffLaunch(); return false;"><span class="notice">Diff &raquo;</span></a> <div id="diff" style="display: none;"></div><br /><a href="htmLawedTest.php" title="[toggle visibility] XHTML 1 Transitional doctype" onclick="javascript:toggle(\'outputH\'); return false;">';
641} 641}
642else{ 642else{
643?> 643?>
644 644
645<br /> 645<br />
646 646
647<div class="help">Use with a Javascript- and cookie-enabled, relatively new version of a common browser. 647<div class="help">Use with a Javascript- and cookie-enabled, relatively new version of a common browser.
648 648
649<?php echo (file_exists('./htmLawed_TESTCASE.txt') ? '<br /><br />You can use text from <a href="htmLawed_TESTCASE.txt"><span class="notice">this collection of test-cases</span></a> in the input. Set the character encoding of the browser to Unicode/utf-8 before copying.' : ''); ?> 649<?php echo (file_exists('./htmLawed_TESTCASE.txt') ? '<br /><br />You can use text from <a href="htmLawed_TESTCASE.txt"><span class="notice">this collection of test-cases</span></a> in the input. Set the character encoding of the browser to Unicode/utf-8 before copying.' : ''); ?>
650 650
651<br /><br />For anti-XSS tests, try the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawedSafeModeTest.php"><span class="notice">special test-page</span></a> or see <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm"><span class="notice">these results</span></a>. 651<br /><br />For anti-XSS tests, try the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/htmLawedSafeModeTest.php"><span class="notice">special test-page</span></a> or see <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm"><span class="notice">these results</span></a>.
652 652
653<br /><br /><small>Change <em>Encoding</em> to reflect the character encoding of the input text. Even then, it may not work or some characters may not display properly because of variable browser support and because of the form interface. Developers can write some PHP code to capture the filtered input to a file if this is important. 653<br /><br /><small>Change <em>Encoding</em> to reflect the character encoding of the input text. Even then, it may not work or some characters may not display properly because of variable browser support and because of the form interface. Developers can write some PHP code to capture the filtered input to a file if this is important.
654<br /><br />Refer to the htmLawed documentation (<a href="htmLawed_README.htm"><span class="notice">htm</span></a>/<a href="htmLawed_README.txt"><span class="notice">txt</span></a>) for details about <em>Settings</em>, and htmLawed's behavior and limitations. For <em>Settings</em>, incorrectly-specified values like regular expressions are silently ignored. One or more settings form-fields may have been disabled. Some characters are not allowed in the <em>Spec</em> field. 654<br /><br />Refer to the htmLawed documentation (<a href="htmLawed_README.htm"><span class="notice">htm</span></a>/<a href="htmLawed_README.txt"><span class="notice">txt</span></a>) for details about <em>Settings</em>, and htmLawed's behavior and limitations. For <em>Settings</em>, incorrectly-specified values like regular expressions are silently ignored. One or more settings form-fields may have been disabled. Some characters are not allowed in the <em>Spec</em> field.
655 655
656 656
657<br /><br />Hovering the mouse over some of the text can provide additional information in some browsers.</small> 657<br /><br />Hovering the mouse over some of the text can provide additional information in some browsers.</small>
658 658
659<?php 659<?php
660if($_w3c_validate){ 660if($_w3c_validate){
661?> 661?>
662 662
663<small><br /><br />Because of character-encoding issues, the W3C validator (anyway not perfect) may reject validation requests or invalidate otherwise-valid code, esp. if text was copy-pasted in the input box. Local applications like the <em>HTML Validator</em> Firefox browser add-on may be useful in such cases.</small> 663<small><br /><br />Because of character-encoding issues, the W3C validator (anyway not perfect) may reject validation requests or invalidate otherwise-valid code, esp. if text was copy-pasted in the input box. Local applications like the <em>HTML Validator</em> Firefox browser add-on may be useful in such cases.</small>
664 664
665<?php 665<?php
666} 666}
667?> 667?>
668 668
669</div> 669</div>
670 670
671<?php 671<?php
672} 672}
673?> 673?>
674 674
675</div> 675</div>
676</body> 676</body>
677</html> 677</html>
diff --git a/lib/htmlawed/htmLawed_README.htm b/lib/htmlawed/htmLawed_README.htm
index 42e4860..a63513b 100644
--- a/lib/htmlawed/htmLawed_README.htm
+++ b/lib/htmlawed/htmLawed_README.htm
@@ -1,2289 +1,2289 @@
1<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> 1<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
2<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en"> 2<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
3<head> 3<head>
4<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> 4<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
5<meta http-equiv="Content-Language" content="en" /> 5<meta http-equiv="Content-Language" content="en" />
6<meta name="description" content="htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter - htmLawed_README.txt - presented with rTxt2htm, a PHP Labware utility" /> 6<meta name="description" content="htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter - htmLawed_README.txt - presented with rTxt2htm, a PHP Labware utility" />
7<meta name="keywords" content="htmLawed, HTM, HTML, HTML5, HTML 5, XHTML, XHTML5, HTML Tidy, converter, filter, formatter, purifier, sanitizer, XSS, input, PHP, software, code, script, security, cross-site scripting, hack, sanitize, remove, standards, tags, attributes, elements, Aria, Ruby, data attributes, tidy, indent, auto-indent, prettify, pretty print, htmLawed_README.txt, rTxt2htm, PHP Labware" /> 7<meta name="keywords" content="htmLawed, HTM, HTML, HTML5, HTML 5, XHTML, XHTML5, HTML Tidy, converter, filter, formatter, purifier, sanitizer, XSS, input, PHP, software, code, script, security, cross-site scripting, hack, sanitize, remove, standards, tags, attributes, elements, Aria, Ruby, data attributes, tidy, indent, auto-indent, prettify, pretty print, htmLawed_README.txt, rTxt2htm, PHP Labware" />
8<style type="text/css" media="all"> 8<style type="text/css" media="all">
9<!--/*--><![CDATA[/*><!--*/ 9<!--/*--><![CDATA[/*><!--*/
10a {text-decoration:none; color: blue;} 10a {text-decoration:none; color: blue;}
11a:hover {color: red;} 11a:hover {color: red;}
12a:visited {color: blue;} 12a:visited {color: blue;}
13body {margin: 0; padding: 0;} 13body {margin: 0; padding: 0;}
14body, div, html, p {font-family: Georgia, 'Times new roman', Times;} 14body, div, html, p {font-family: Georgia, 'Times new roman', Times;}
15code.code {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;} 15code.code {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;}
16div.comment {padding: 5px; color: #999999; font-size: 80%;} 16div.comment {padding: 5px; color: #999999; font-size: 80%;}
17div.comment a {color: #6699cc;} 17div.comment a {color: #6699cc;}
18div#body {width: 70%; margin: 5px; padding: 5px;} /* holds non-toc content */ 18div#body {width: 70%; margin: 5px; padding: 5px;} /* holds non-toc content */
19div#toc {position: fixed; top: 5px; left: 73%; z-index: 2; margin-top: 5px; margin-left: 5px; border: 1px solid gray; padding: 5px; background-color: #ededed; width: 23%; overflow: auto; max-height:94%; font-size: 90%;} /* holds content table (toc) */ 19div#toc {position: fixed; top: 5px; left: 73%; z-index: 2; margin-top: 5px; margin-left: 5px; border: 1px solid gray; padding: 5px; background-color: #ededed; width: 23%; overflow: auto; max-height:94%; font-size: 90%;} /* holds content table (toc) */
20div#top {font-size: 14px; margin: 5px; padding: 5px;} /* holds all content */ 20div#top {font-size: 14px; margin: 5px; padding: 5px;} /* holds all content */
21div.monospace {overflow: auto; font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;} 21div.monospace {overflow: auto; font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;}
22div.sub-section {padding-left: 15px;} 22div.sub-section {padding-left: 15px;}
23div.sub-sub-section {padding-left: 30px;} 23div.sub-sub-section {padding-left: 30px;}
24h1 {font-size: 22px; margin-top: 5px; margin-bottom: 5px;} 24h1 {font-size: 22px; margin-top: 5px; margin-bottom: 5px;}
25h2 {font-size: 20px; float: left; margin-top: 15px; margin-bottom: 5px;} 25h2 {font-size: 20px; float: left; margin-top: 15px; margin-bottom: 5px;}
26h3 {font-size: 18px; float: left; margin-top: 15px; margin-bottom: 5px;} 26h3 {font-size: 18px; float: left; margin-top: 15px; margin-bottom: 5px;}
27h4 {font-size: 16px; float: left; margin-top: 15px; margin-bottom: 5px;} 27h4 {font-size: 16px; float: left; margin-top: 15px; margin-bottom: 5px;}
28hr {margin-top: 15px; margin-bottom: 5px;} 28hr {margin-top: 15px; margin-bottom: 5px;}
29input, textarea {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;} 29input, textarea {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;}
30p.subtle {color: gray; padding: 0; padding-top: 10px; margin: 0;} 30p.subtle {color: gray; padding: 0; padding-top: 10px; margin: 0;}
31p.subtle a, p.subtle a:visited {color: #6699cc;} 31p.subtle a, p.subtle a:visited {color: #6699cc;}
32span.item-no {color: black;} 32span.item-no {color: black;}
33span.subtle {color: gray; margin: 0; padding:0;} 33span.subtle {color: gray; margin: 0; padding:0;}
34span.subtle a, span.subtle a:visited {color: #6699cc;} 34span.subtle a, span.subtle a:visited {color: #6699cc;}
35span.term {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;} 35span.term {font-family: 'Bitstream vera sans mono', 'Courier New', 'Courier', monospace;}
36span.toc-item {color: black;} 36span.toc-item {color: black;}
37span.totop {float: right; margin-top: 15px; margin-bottom: 5px;} 37span.totop {float: right; margin-top: 15px; margin-bottom: 5px;}
38span.totop a, span.totop a:visited {color: #6699cc;} 38span.totop a, span.totop a:visited {color: #6699cc;}
39@media screen { /* fixes for old IE */ 39@media screen { /* fixes for old IE */
40 * html, * html body {overflow-y: auto!important; height: 100%; margin: 0; padding: 0;} 40 * html, * html body {overflow-y: auto!important; height: 100%; margin: 0; padding: 0;}
41 * html div#body {height: 100%; overflow-y: auto; position: relative;} 41 * html div#body {height: 100%; overflow-y: auto; position: relative;}
42 * html div#toc {position: absolute;} 42 * html div#toc {position: absolute;}
43} 43}
44/*]]>*/--> 44/*]]>*/-->
45</style> 45</style>
46<title>htmLawed documentation | htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter</title> 46<title>htmLawed documentation | htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter</title>
47</head> 47</head>
48<body> 48<body>
49<div id="top"> 49<div id="top">
50<h1><a id="peak" name="peak"></a>htmLawed documentation</h1> 50<h1><a id="peak" name="peak"></a>htmLawed documentation</h1>
51 51
52<div id="toc"><span class="toc-item"><a href="#s1"><span class="item-no">1</span>&#160; About htmLawed</a></span><br /> 52<div id="toc"><span class="toc-item"><a href="#s1"><span class="item-no">1</span>&#160; About htmLawed</a></span><br />
53&#160; <span class="toc-item"><a href="#s1.1"><span class="item-no">1.1</span>&#160; Example uses</a></span><br /> 53&#160; <span class="toc-item"><a href="#s1.1"><span class="item-no">1.1</span>&#160; Example uses</a></span><br />
54&#160; <span class="toc-item"><a href="#s1.2"><span class="item-no">1.2</span>&#160; Features</a></span><br /> 54&#160; <span class="toc-item"><a href="#s1.2"><span class="item-no">1.2</span>&#160; Features</a></span><br />
55&#160; <span class="toc-item"><a href="#s1.3"><span class="item-no">1.3</span>&#160; History</a></span><br /> 55&#160; <span class="toc-item"><a href="#s1.3"><span class="item-no">1.3</span>&#160; History</a></span><br />
56&#160; <span class="toc-item"><a href="#s1.4"><span class="item-no">1.4</span>&#160; License &amp; copyright</a></span><br /> 56&#160; <span class="toc-item"><a href="#s1.4"><span class="item-no">1.4</span>&#160; License &amp; copyright</a></span><br />
57&#160; <span class="toc-item"><a href="#s1.5"><span class="item-no">1.5</span>&#160; Terms used here</a></span><br /> 57&#160; <span class="toc-item"><a href="#s1.5"><span class="item-no">1.5</span>&#160; Terms used here</a></span><br />
58&#160; <span class="toc-item"><a href="#s1.6"><span class="item-no">1.6</span>&#160; Availability</a></span><br /> 58&#160; <span class="toc-item"><a href="#s1.6"><span class="item-no">1.6</span>&#160; Availability</a></span><br />
59<span class="toc-item"><a href="#s2"><span class="item-no">2</span>&#160; Usage</a></span><br /> 59<span class="toc-item"><a href="#s2"><span class="item-no">2</span>&#160; Usage</a></span><br />
60&#160; <span class="toc-item"><a href="#s2.1"><span class="item-no">2.1</span>&#160; Simple</a></span><br /> 60&#160; <span class="toc-item"><a href="#s2.1"><span class="item-no">2.1</span>&#160; Simple</a></span><br />
61&#160; <span class="toc-item"><a href="#s2.2"><span class="item-no">2.2</span>&#160; Configuring htmLawed using the <span class="term">$config</span>&#160;argument</a></span><br /> 61&#160; <span class="toc-item"><a href="#s2.2"><span class="item-no">2.2</span>&#160; Configuring htmLawed using the <span class="term">$config</span>&#160;argument</a></span><br />
62&#160; <span class="toc-item"><a href="#s2.3"><span class="item-no">2.3</span>&#160; Extra HTML specifications using the <span class="term">$spec</span>&#160;argument</a></span><br /> 62&#160; <span class="toc-item"><a href="#s2.3"><span class="item-no">2.3</span>&#160; Extra HTML specifications using the <span class="term">$spec</span>&#160;argument</a></span><br />
63&#160; <span class="toc-item"><a href="#s2.4"><span class="item-no">2.4</span>&#160; Performance time &amp; memory usage</a></span><br /> 63&#160; <span class="toc-item"><a href="#s2.4"><span class="item-no">2.4</span>&#160; Performance time &amp; memory usage</a></span><br />
64&#160; <span class="toc-item"><a href="#s2.5"><span class="item-no">2.5</span>&#160; Some security risks to keep in mind</a></span><br /> 64&#160; <span class="toc-item"><a href="#s2.5"><span class="item-no">2.5</span>&#160; Some security risks to keep in mind</a></span><br />
65&#160; <span class="toc-item"><a href="#s2.6"><span class="item-no">2.6</span>&#160; Use with <span class="term">kses()</span>&#160;code</a></span><br /> 65&#160; <span class="toc-item"><a href="#s2.6"><span class="item-no">2.6</span>&#160; Use with <span class="term">kses()</span>&#160;code</a></span><br />
66&#160; <span class="toc-item"><a href="#s2.7"><span class="item-no">2.7</span>&#160; Tolerance for ill-written HTML</a></span><br /> 66&#160; <span class="toc-item"><a href="#s2.7"><span class="item-no">2.7</span>&#160; Tolerance for ill-written HTML</a></span><br />
67&#160; <span class="toc-item"><a href="#s2.8"><span class="item-no">2.8</span>&#160; Limitations &amp; work-arounds</a></span><br /> 67&#160; <span class="toc-item"><a href="#s2.8"><span class="item-no">2.8</span>&#160; Limitations &amp; work-arounds</a></span><br />
68&#160; <span class="toc-item"><a href="#s2.9"><span class="item-no">2.9</span>&#160; Examples of usage</a></span><br /> 68&#160; <span class="toc-item"><a href="#s2.9"><span class="item-no">2.9</span>&#160; Examples of usage</a></span><br />
69<span class="toc-item"><a href="#s3"><span class="item-no">3</span>&#160; Details</a></span><br /> 69<span class="toc-item"><a href="#s3"><span class="item-no">3</span>&#160; Details</a></span><br />
70&#160; <span class="toc-item"><a href="#s3.1"><span class="item-no">3.1</span>&#160; Invalid/dangerous characters</a></span><br /> 70&#160; <span class="toc-item"><a href="#s3.1"><span class="item-no">3.1</span>&#160; Invalid/dangerous characters</a></span><br />
71&#160; <span class="toc-item"><a href="#s3.2"><span class="item-no">3.2</span>&#160; Character references/entities</a></span><br /> 71&#160; <span class="toc-item"><a href="#s3.2"><span class="item-no">3.2</span>&#160; Character references/entities</a></span><br />
72&#160; <span class="toc-item"><a href="#s3.3"><span class="item-no">3.3</span>&#160; HTML elements</a></span><br /> 72&#160; <span class="toc-item"><a href="#s3.3"><span class="item-no">3.3</span>&#160; HTML elements</a></span><br />
73&#160; &#160; <span class="toc-item"><a href="#s3.3.1"><span class="item-no">3.3.1</span>&#160; HTML comments &amp; <span class="term">CDATA</span>&#160;sections</a></span><br /> 73&#160; &#160; <span class="toc-item"><a href="#s3.3.1"><span class="item-no">3.3.1</span>&#160; HTML comments &amp; <span class="term">CDATA</span>&#160;sections</a></span><br />
74&#160; &#160; <span class="toc-item"><a href="#s3.3.2"><span class="item-no">3.3.2</span>&#160; Tag-transformation for better compliance with standards</a></span><br /> 74&#160; &#160; <span class="toc-item"><a href="#s3.3.2"><span class="item-no">3.3.2</span>&#160; Tag-transformation for better compliance with standards</a></span><br />
75&#160; &#160; <span class="toc-item"><a href="#s3.3.3"><span class="item-no">3.3.3</span>&#160; Tag balancing &amp; proper nesting</a></span><br /> 75&#160; &#160; <span class="toc-item"><a href="#s3.3.3"><span class="item-no">3.3.3</span>&#160; Tag balancing &amp; proper nesting</a></span><br />
76&#160; &#160; <span class="toc-item"><a href="#s3.3.4"><span class="item-no">3.3.4</span>&#160; Elements requiring child elements</a></span><br /> 76&#160; &#160; <span class="toc-item"><a href="#s3.3.4"><span class="item-no">3.3.4</span>&#160; Elements requiring child elements</a></span><br />
77&#160; &#160; <span class="toc-item"><a href="#s3.3.5"><span class="item-no">3.3.5</span>&#160; Beautify or compact HTML</a></span><br /> 77&#160; &#160; <span class="toc-item"><a href="#s3.3.5"><span class="item-no">3.3.5</span>&#160; Beautify or compact HTML</a></span><br />
78&#160; <span class="toc-item"><a href="#s3.4"><span class="item-no">3.4</span>&#160; Attributes</a></span><br /> 78&#160; <span class="toc-item"><a href="#s3.4"><span class="item-no">3.4</span>&#160; Attributes</a></span><br />
79&#160; &#160; <span class="toc-item"><a href="#s3.4.1"><span class="item-no">3.4.1</span>&#160; Auto-addition of XHTML-required attributes</a></span><br /> 79&#160; &#160; <span class="toc-item"><a href="#s3.4.1"><span class="item-no">3.4.1</span>&#160; Auto-addition of XHTML-required attributes</a></span><br />
80&#160; &#160; <span class="toc-item"><a href="#s3.4.2"><span class="item-no">3.4.2</span>&#160; Duplicate/invalid <span class="term">id</span>&#160;values</a></span><br /> 80&#160; &#160; <span class="toc-item"><a href="#s3.4.2"><span class="item-no">3.4.2</span>&#160; Duplicate/invalid <span class="term">id</span>&#160;values</a></span><br />
81&#160; &#160; <span class="toc-item"><a href="#s3.4.3"><span class="item-no">3.4.3</span>&#160; URL schemes &amp; scripts in attribute values</a></span><br /> 81&#160; &#160; <span class="toc-item"><a href="#s3.4.3"><span class="item-no">3.4.3</span>&#160; URL schemes &amp; scripts in attribute values</a></span><br />
82&#160; &#160; <span class="toc-item"><a href="#s3.4.4"><span class="item-no">3.4.4</span>&#160; Absolute &amp; relative URLs</a></span><br /> 82&#160; &#160; <span class="toc-item"><a href="#s3.4.4"><span class="item-no">3.4.4</span>&#160; Absolute &amp; relative URLs</a></span><br />
83&#160; &#160; <span class="toc-item"><a href="#s3.4.5"><span class="item-no">3.4.5</span>&#160; Lower-cased, standard attribute values</a></span><br /> 83&#160; &#160; <span class="toc-item"><a href="#s3.4.5"><span class="item-no">3.4.5</span>&#160; Lower-cased, standard attribute values</a></span><br />
84&#160; &#160; <span class="toc-item"><a href="#s3.4.6"><span class="item-no">3.4.6</span>&#160; Transformation of deprecated attributes</a></span><br /> 84&#160; &#160; <span class="toc-item"><a href="#s3.4.6"><span class="item-no">3.4.6</span>&#160; Transformation of deprecated attributes</a></span><br />
85&#160; &#160; <span class="toc-item"><a href="#s3.4.7"><span class="item-no">3.4.7</span>&#160; Anti-spam &amp; <span class="term">href</span></a></span><br /> 85&#160; &#160; <span class="toc-item"><a href="#s3.4.7"><span class="item-no">3.4.7</span>&#160; Anti-spam &amp; <span class="term">href</span></a></span><br />
86&#160; &#160; <span class="toc-item"><a href="#s3.4.8"><span class="item-no">3.4.8</span>&#160; Inline style properties</a></span><br /> 86&#160; &#160; <span class="toc-item"><a href="#s3.4.8"><span class="item-no">3.4.8</span>&#160; Inline style properties</a></span><br />
87&#160; &#160; <span class="toc-item"><a href="#s3.4.9"><span class="item-no">3.4.9</span>&#160; Hook function for tag content</a></span><br /> 87&#160; &#160; <span class="toc-item"><a href="#s3.4.9"><span class="item-no">3.4.9</span>&#160; Hook function for tag content</a></span><br />
88&#160; <span class="toc-item"><a href="#s3.5"><span class="item-no">3.5</span>&#160; Simple configuration directive for most valid XHTML</a></span><br /> 88&#160; <span class="toc-item"><a href="#s3.5"><span class="item-no">3.5</span>&#160; Simple configuration directive for most valid XHTML</a></span><br />
89&#160; <span class="toc-item"><a href="#s3.6"><span class="item-no">3.6</span>&#160; Simple configuration directive for most <em>safe</em>&#160;HTML</a></span><br /> 89&#160; <span class="toc-item"><a href="#s3.6"><span class="item-no">3.6</span>&#160; Simple configuration directive for most <em>safe</em>&#160;HTML</a></span><br />
90&#160; <span class="toc-item"><a href="#s3.7"><span class="item-no">3.7</span>&#160; Using a hook function</a></span><br /> 90&#160; <span class="toc-item"><a href="#s3.7"><span class="item-no">3.7</span>&#160; Using a hook function</a></span><br />
91&#160; <span class="toc-item"><a href="#s3.8"><span class="item-no">3.8</span>&#160; Obtaining <em>finalized</em>&#160;parameter values</a></span><br /> 91&#160; <span class="toc-item"><a href="#s3.8"><span class="item-no">3.8</span>&#160; Obtaining <em>finalized</em>&#160;parameter values</a></span><br />
92&#160; <span class="toc-item"><a href="#s3.9"><span class="item-no">3.9</span>&#160; Retaining non-HTML tags in input with mixed markup</a></span><br /> 92&#160; <span class="toc-item"><a href="#s3.9"><span class="item-no">3.9</span>&#160; Retaining non-HTML tags in input with mixed markup</a></span><br />
93<span class="toc-item"><a href="#s4"><span class="item-no">4</span>&#160; Other</a></span><br /> 93<span class="toc-item"><a href="#s4"><span class="item-no">4</span>&#160; Other</a></span><br />
94&#160; <span class="toc-item"><a href="#s4.1"><span class="item-no">4.1</span>&#160; Support</a></span><br /> 94&#160; <span class="toc-item"><a href="#s4.1"><span class="item-no">4.1</span>&#160; Support</a></span><br />
95&#160; <span class="toc-item"><a href="#s4.2"><span class="item-no">4.2</span>&#160; Known issues</a></span><br /> 95&#160; <span class="toc-item"><a href="#s4.2"><span class="item-no">4.2</span>&#160; Known issues</a></span><br />
96&#160; <span class="toc-item"><a href="#s4.3"><span class="item-no">4.3</span>&#160; Change-log</a></span><br /> 96&#160; <span class="toc-item"><a href="#s4.3"><span class="item-no">4.3</span>&#160; Change-log</a></span><br />
97&#160; <span class="toc-item"><a href="#s4.4"><span class="item-no">4.4</span>&#160; Testing</a></span><br /> 97&#160; <span class="toc-item"><a href="#s4.4"><span class="item-no">4.4</span>&#160; Testing</a></span><br />
98&#160; <span class="toc-item"><a href="#s4.5"><span class="item-no">4.5</span>&#160; Upgrade, &amp; old versions</a></span><br /> 98&#160; <span class="toc-item"><a href="#s4.5"><span class="item-no">4.5</span>&#160; Upgrade, &amp; old versions</a></span><br />
99&#160; <span class="toc-item"><a href="#s4.6"><span class="item-no">4.6</span>&#160; Comparison with <span class="term">HTMLPurifier</span></a></span><br /> 99&#160; <span class="toc-item"><a href="#s4.6"><span class="item-no">4.6</span>&#160; Comparison with <span class="term">HTMLPurifier</span></a></span><br />
100&#160; <span class="toc-item"><a href="#s4.7"><span class="item-no">4.7</span>&#160; Use through application plug-ins/modules</a></span><br /> 100&#160; <span class="toc-item"><a href="#s4.7"><span class="item-no">4.7</span>&#160; Use through application plug-ins/modules</a></span><br />
101&#160; <span class="toc-item"><a href="#s4.8"><span class="item-no">4.8</span>&#160; Use in non-PHP applications</a></span><br /> 101&#160; <span class="toc-item"><a href="#s4.8"><span class="item-no">4.8</span>&#160; Use in non-PHP applications</a></span><br />
102&#160; <span class="toc-item"><a href="#s4.9"><span class="item-no">4.9</span>&#160; Donate</a></span><br /> 102&#160; <span class="toc-item"><a href="#s4.9"><span class="item-no">4.9</span>&#160; Donate</a></span><br />
103&#160; <span class="toc-item"><a href="#s4.10"><span class="item-no">4.10</span>&#160; Acknowledgements</a></span><br /> 103&#160; <span class="toc-item"><a href="#s4.10"><span class="item-no">4.10</span>&#160; Acknowledgements</a></span><br />
104<span class="toc-item"><a href="#s5"><span class="item-no">5</span>&#160; Appendices</a></span><br /> 104<span class="toc-item"><a href="#s5"><span class="item-no">5</span>&#160; Appendices</a></span><br />
105&#160; <span class="toc-item"><a href="#s5.1"><span class="item-no">5.1</span>&#160; Characters discouraged in HTML</a></span><br /> 105&#160; <span class="toc-item"><a href="#s5.1"><span class="item-no">5.1</span>&#160; Characters discouraged in HTML</a></span><br />
106&#160; <span class="toc-item"><a href="#s5.2"><span class="item-no">5.2</span>&#160; Valid attribute-element combinations</a></span><br /> 106&#160; <span class="toc-item"><a href="#s5.2"><span class="item-no">5.2</span>&#160; Valid attribute-element combinations</a></span><br />
107&#160; <span class="toc-item"><a href="#s5.3"><span class="item-no">5.3</span>&#160; CSS 2.1 properties accepting URLs</a></span><br /> 107&#160; <span class="toc-item"><a href="#s5.3"><span class="item-no">5.3</span>&#160; CSS 2.1 properties accepting URLs</a></span><br />
108&#160; <span class="toc-item"><a href="#s5.4"><span class="item-no">5.4</span>&#160; Microsoft Windows 1252 character replacements</a></span><br /> 108&#160; <span class="toc-item"><a href="#s5.4"><span class="item-no">5.4</span>&#160; Microsoft Windows 1252 character replacements</a></span><br />
109&#160; <span class="toc-item"><a href="#s5.5"><span class="item-no">5.5</span>&#160; URL format</a></span><br /> 109&#160; <span class="toc-item"><a href="#s5.5"><span class="item-no">5.5</span>&#160; URL format</a></span><br />
110&#160; <span class="toc-item"><a href="#s5.6"><span class="item-no">5.6</span>&#160; Brief on htmLawed code</a></span></div><!-- ended div toc --> 110&#160; <span class="toc-item"><a href="#s5.6"><span class="item-no">5.6</span>&#160; Brief on htmLawed code</a></span></div><!-- ended div toc -->
111 111
112<div id="body"> 112<div id="body">
113<br /> 113<br />
114<div class="comment">htmLawed_README.txt, 24 September 2019<br /> 114<div class="comment">htmLawed_README.txt, 24 September 2019<br />
115htmLawed 1.2.5, 24 September 2019<br /> 115htmLawed 1.2.5, 24 September 2019<br />
116Copyright Santosh Patnaik<br /> 116Copyright Santosh Patnaik<br />
117Dual licensed with LGPL 3 and GPL 2+<br /> 117Dual licensed with LGPL 3 and GPL 2+<br />
118A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a>&#160;</div> 118A PHP Labware internal utility &#45; <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a>&#160;</div>
119<br /> 119<br />
120 120
121<div class="section"><h2> 121<div class="section"><h2>
122<a name="s1" id="s1"></a><span class="item-no">1</span>&#160; About htmLawed 122<a name="s1" id="s1"></a><span class="item-no">1</span>&#160; About htmLawed
123</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 123</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
124<br /> 124<br />
125&#160; htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and with administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that introduces a security vulnerability or is used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such <em>lawing in</em>&#160;of HTML code ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.<br /> 125&#160; htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and with administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that introduces a security vulnerability or is used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such <em>lawing in</em>&#160;of HTML code ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.<br />
126<br /> 126<br />
127&#160; htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file. It does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML <a href="http://tidy.sourceforge.net">Tidy</a>&#160;application.<br /> 127&#160; htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file. It does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML <a href="http://tidy.sourceforge.net">Tidy</a>&#160;application.<br />
128 128
129<div class="sub-section"><h3> 129<div class="sub-section"><h3>
130<a name="s1.1" id="s1.1"></a><span class="item-no">1.1</span>&#160; Example uses 130<a name="s1.1" id="s1.1"></a><span class="item-no">1.1</span>&#160; Example uses
131</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 131</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
132<br /> 132<br />
133&#160; * &#160;Filtering of text submitted as comments on blogs to allow only certain HTML elements<br /> 133&#160; * &#160;Filtering of text submitted as comments on blogs to allow only certain HTML elements<br />
134<br /> 134<br />
135&#160; * &#160;Making RSS newsfeed items standard-compliant: often one uses an excerpt from an HTML document for the content, and with unbalanced tags, non-numerical entities, etc., such excerpts may not be XML-compliant<br /> 135&#160; * &#160;Making RSS newsfeed items standard-compliant: often one uses an excerpt from an HTML document for the content, and with unbalanced tags, non-numerical entities, etc., such excerpts may not be XML-compliant<br />
136<br /> 136<br />
137&#160; * &#160;Beautifying or pretty-printing HTML code<br /> 137&#160; * &#160;Beautifying or pretty-printing HTML code<br />
138<br /> 138<br />
139&#160; * &#160;Text processing for stricter XML standard-compliance: e.g., to have lowercased <span class="term">x</span>&#160;in hexadecimal numeric entities becomes necessary if an HTML document with MathML content needs to be served as <span class="term">application/xml</span><br /> 139&#160; * &#160;Text processing for stricter XML standard-compliance: e.g., to have lowercased <span class="term">x</span>&#160;in hexadecimal numeric entities becomes necessary if an HTML document with MathML content needs to be served as <span class="term">application/xml</span><br />
140<br /> 140<br />
141&#160; * &#160;Scraping text from web-pages<br /> 141&#160; * &#160;Scraping text from web-pages<br />
142<br /> 142<br />
143&#160; * &#160;Transforming an HTML element to another<br /> 143&#160; * &#160;Transforming an HTML element to another<br />
144 144
145</div> 145</div>
146<div class="sub-section"><h3> 146<div class="sub-section"><h3>
147<a name="s1.2" id="s1.2"></a><span class="item-no">1.2</span>&#160; Features 147<a name="s1.2" id="s1.2"></a><span class="item-no">1.2</span>&#160; Features
148</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 148</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
149<br /> 149<br />
150&#160; Key: <span class="term">&#42;</span>&#160;security feature, <span class="term">^</span>&#160;standard compliance, <span class="term">~</span>&#160;requires setting right options<br /> 150&#160; Key: <span class="term">&#42;</span>&#160;security feature, <span class="term">^</span>&#160;standard compliance, <span class="term">~</span>&#160;requires setting right options<br />
151<br /> 151<br />
152&#160; htmLawed:<br /> 152&#160; htmLawed:<br />
153<br /> 153<br />
154&#160; * &#160;makes input more <strong>secure</strong>&#160;and <strong>standard-compliant</strong>&#160;for HTML as well as generic <strong>XML</strong>&#160;documents &#160;^<br /> 154&#160; * &#160;makes input more <strong>secure</strong>&#160;and <strong>standard-compliant</strong>&#160;for HTML as well as generic <strong>XML</strong>&#160;documents &#160;^<br />
155&#160; * &#160;supports markup for <strong>HTML 5</strong>&#160;and <strong>microdata, ARIA, Ruby, custom attributes</strong>, etc. &#160;^<br /> 155&#160; * &#160;supports markup for <strong>HTML 5</strong>&#160;and <strong>microdata, ARIA, Ruby, custom attributes</strong>, etc. &#160;^<br />
156&#160; * &#160;can <strong>beautify</strong>&#160;or <strong>compact</strong>&#160;HTML &#160;~<br /> 156&#160; * &#160;can <strong>beautify</strong>&#160;or <strong>compact</strong>&#160;HTML &#160;~<br />
157&#160; * &#160;works with input of almost any <strong>character encoding</strong>&#160;and does not affect it<br /> 157&#160; * &#160;works with input of almost any <strong>character encoding</strong>&#160;and does not affect it<br />
158&#160; * &#160;has good <strong>tolerance for ill-written HTML</strong><br /> 158&#160; * &#160;has good <strong>tolerance for ill-written HTML</strong><br />
159<br /> 159<br />
160&#160; * &#160;can enforce <strong>restricted use of elements</strong>&#160; *~<br /> 160&#160; * &#160;can enforce <strong>restricted use of elements</strong>&#160; *~<br />
161&#160; * &#160;ensures proper closure of empty elements like <span class="term">img</span>&#160; ^<br /> 161&#160; * &#160;ensures proper closure of empty elements like <span class="term">img</span>&#160; ^<br />
162&#160; * &#160;<strong>transforms deprecated elements</strong>&#160;like <span class="term">font</span>&#160; ^~<br /> 162&#160; * &#160;<strong>transforms deprecated elements</strong>&#160;like <span class="term">font</span>&#160; ^~<br />
163&#160; * &#160;can permit HTML <strong>comments</strong>&#160;and <strong>CDATA</strong>&#160;sections &#160;^~<br /> 163&#160; * &#160;can permit HTML <strong>comments</strong>&#160;and <strong>CDATA</strong>&#160;sections &#160;^~<br />
164&#160; * &#160;can permit all elements, including <span class="term">script</span>, <span class="term">object</span>&#160;and <span class="term">form</span>&#160; ~<br /> 164&#160; * &#160;can permit all elements, including <span class="term">script</span>, <span class="term">object</span>&#160;and <span class="term">form</span>&#160; ~<br />
165<br /> 165<br />
166&#160; * &#160;can <strong>restrict attributes by element</strong>&#160; ^~<br /> 166&#160; * &#160;can <strong>restrict attributes by element</strong>&#160; ^~<br />
167&#160; * &#160;removes <strong>invalid attributes</strong>&#160; ^<br /> 167&#160; * &#160;removes <strong>invalid attributes</strong>&#160; ^<br />
168&#160; * &#160;lower-cases element and attribute names &#160;^<br /> 168&#160; * &#160;lower-cases element and attribute names &#160;^<br />
169&#160; * &#160;provides <strong>required attributes</strong>, like <span class="term">alt</span>&#160;for <span class="term">img</span>&#160; ^<br /> 169&#160; * &#160;provides <strong>required attributes</strong>, like <span class="term">alt</span>&#160;for <span class="term">img</span>&#160; ^<br />
170&#160; * &#160;<strong>transforms deprecated attributes</strong>&#160; ^~<br /> 170&#160; * &#160;<strong>transforms deprecated attributes</strong>&#160; ^~<br />
171&#160; * &#160;ensures attributes are <strong>declared only once</strong>&#160; ^<br /> 171&#160; * &#160;ensures attributes are <strong>declared only once</strong>&#160; ^<br />
172&#160; * &#160;permits <strong>custom</strong>, non-standard attributes as well as custom rules for standard attributes &#160;~<br /> 172&#160; * &#160;permits <strong>custom</strong>, non-standard attributes as well as custom rules for standard attributes &#160;~<br />
173<br /> 173<br />
174&#160; * &#160;declares value for <em>empty</em>&#160;(<em>minimized</em>&#160;or <em>boolean</em>) attributes like <span class="term">checked</span>&#160; ^<br /> 174&#160; * &#160;declares value for <em>empty</em>&#160;(<em>minimized</em>&#160;or <em>boolean</em>) attributes like <span class="term">checked</span>&#160; ^<br />
175&#160; * &#160;checks for potentially dangerous attribute values &#160;*~<br /> 175&#160; * &#160;checks for potentially dangerous attribute values &#160;*~<br />
176&#160; * &#160;ensures <strong>unique</strong>&#160;<span class="term">id</span>&#160;attribute values &#160;^~<br /> 176&#160; * &#160;ensures <strong>unique</strong>&#160;<span class="term">id</span>&#160;attribute values &#160;^~<br />
177&#160; * &#160;<strong>double-quotes</strong>&#160;attribute values &#160;^<br /> 177&#160; * &#160;<strong>double-quotes</strong>&#160;attribute values &#160;^<br />
178&#160; * &#160;lower-cases <strong>standard attribute values</strong>&#160;like <span class="term">password</span>&#160; ^<br /> 178&#160; * &#160;lower-cases <strong>standard attribute values</strong>&#160;like <span class="term">password</span>&#160; ^<br />
179<br /> 179<br />
180&#160; * &#160;can restrict <strong>URL protocol/scheme by attribute</strong>&#160; *~<br /> 180&#160; * &#160;can restrict <strong>URL protocol/scheme by attribute</strong>&#160; *~<br />
181&#160; * &#160;can disable <strong>dynamic expressions</strong>&#160;in <span class="term">style</span>&#160;values &#160;*~<br /> 181&#160; * &#160;can disable <strong>dynamic expressions</strong>&#160;in <span class="term">style</span>&#160;values &#160;*~<br />
182<br /> 182<br />
183&#160; * &#160;neutralizes invalid named <strong>character entities</strong>&#160; ^<br /> 183&#160; * &#160;neutralizes invalid named <strong>character entities</strong>&#160; ^<br />
184&#160; * &#160;converts hexadecimal numeric entities to decimal ones, or vice versa &#160;^~<br /> 184&#160; * &#160;converts hexadecimal numeric entities to decimal ones, or vice versa &#160;^~<br />
185&#160; * &#160;converts named entities to numeric ones for generic XML use &#160;^~<br /> 185&#160; * &#160;converts named entities to numeric ones for generic XML use &#160;^~<br />
186<br /> 186<br />
187&#160; * &#160;removes <strong>null</strong>&#160;characters &#160;*<br /> 187&#160; * &#160;removes <strong>null</strong>&#160;characters &#160;*<br />
188&#160; * &#160;neutralizes potentially dangerous proprietary Netscape <strong>Javascript entities</strong>&#160; *<br /> 188&#160; * &#160;neutralizes potentially dangerous proprietary Netscape <strong>Javascript entities</strong>&#160; *<br />
189&#160; * &#160;replaces potentially dangerous <strong>soft-hyphen</strong>&#160;character in URL-accepting attribute values with spaces &#160;*<br /> 189&#160; * &#160;replaces potentially dangerous <strong>soft-hyphen</strong>&#160;character in URL-accepting attribute values with spaces &#160;*<br />
190<br /> 190<br />
191&#160; * &#160;removes common <strong>invalid characters</strong>&#160;not allowed in HTML or XML &#160;^<br /> 191&#160; * &#160;removes common <strong>invalid characters</strong>&#160;not allowed in HTML or XML &#160;^<br />
192&#160; * &#160;replaces <strong>characters from Microsoft applications</strong>&#160;like <span class="term">Word</span>&#160;that are discouraged in HTML or XML &#160;^~<br /> 192&#160; * &#160;replaces <strong>characters from Microsoft applications</strong>&#160;like <span class="term">Word</span>&#160;that are discouraged in HTML or XML &#160;^~<br />
193&#160; * &#160;neutralizes entities for characters invalid or discouraged in HTML or XML &#160;^<br /> 193&#160; * &#160;neutralizes entities for characters invalid or discouraged in HTML or XML &#160;^<br />
194&#160; * &#160;appropriately neutralizes <span class="term">&lt;</span>, <span class="term">&amp;</span>, <span class="term">"</span>, and <span class="term">&gt;</span>&#160;characters &#160;^*<br /> 194&#160; * &#160;appropriately neutralizes <span class="term">&lt;</span>, <span class="term">&amp;</span>, <span class="term">"</span>, and <span class="term">&gt;</span>&#160;characters &#160;^*<br />
195<br /> 195<br />
196&#160; * &#160;understands improperly spaced tag content (e.g., spread over more than a line) and properly spaces them<br /> 196&#160; * &#160;understands improperly spaced tag content (e.g., spread over more than a line) and properly spaces them<br />
197&#160; * &#160;attempts to <strong>balance tags</strong>&#160;for well-formedness &#160;^~<br /> 197&#160; * &#160;attempts to <strong>balance tags</strong>&#160;for well-formedness &#160;^~<br />
198&#160; * &#160;understands when <strong>omittable closing tags</strong>&#160;like <span class="term">&lt;/p&gt;</span>&#160;are missing &#160;^~<br /> 198&#160; * &#160;understands when <strong>omittable closing tags</strong>&#160;like <span class="term">&lt;/p&gt;</span>&#160;are missing &#160;^~<br />
199&#160; * &#160;attempts to permit only <strong>validly nested tags</strong>&#160; ^~<br /> 199&#160; * &#160;attempts to permit only <strong>validly nested tags</strong>&#160; ^~<br />
200&#160; * &#160;can <strong>either remove or neutralize bad content</strong>&#160;^~<br /> 200&#160; * &#160;can <strong>either remove or neutralize bad content</strong>&#160;^~<br />
201&#160; * &#160;attempts to <strong>rectify common errors of plain-text misplacement</strong>&#160;(e.g., directly inside <span class="term">blockquote</span>) ^~<br /> 201&#160; * &#160;attempts to <strong>rectify common errors of plain-text misplacement</strong>&#160;(e.g., directly inside <span class="term">blockquote</span>) ^~<br />
202<br /> 202<br />
203&#160; * &#160;has optional <strong>anti-spam</strong>&#160;measures such as addition of <span class="term">rel="nofollow"</span>&#160;and link-disabling &#160;~<br /> 203&#160; * &#160;has optional <strong>anti-spam</strong>&#160;measures such as addition of <span class="term">rel="nofollow"</span>&#160;and link-disabling &#160;~<br />
204&#160; * &#160;optionally makes <strong>relative URLs absolute</strong>, and vice versa &#160;~<br /> 204&#160; * &#160;optionally makes <strong>relative URLs absolute</strong>, and vice versa &#160;~<br />
205<br /> 205<br />
206&#160; * &#160;optionally marks <span class="term">&amp;</span>&#160;to identify the entities for <span class="term">&amp;</span>, <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;introduced by it &#160;~<br /> 206&#160; * &#160;optionally marks <span class="term">&amp;</span>&#160;to identify the entities for <span class="term">&amp;</span>, <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;introduced by it &#160;~<br />
207<br /> 207<br />
208&#160; * &#160;allows deployment of powerful <strong>hook functions</strong>&#160;to <strong>inject</strong>&#160;HTML, <strong>consolidate</strong>&#160;<span class="term">style</span>&#160;attributes to <span class="term">class</span>, finely check attribute values, etc. &#160;~<br /> 208&#160; * &#160;allows deployment of powerful <strong>hook functions</strong>&#160;to <strong>inject</strong>&#160;HTML, <strong>consolidate</strong>&#160;<span class="term">style</span>&#160;attributes to <span class="term">class</span>, finely check attribute values, etc. &#160;~<br />
209 209
210</div> 210</div>
211<div class="sub-section"><h3> 211<div class="sub-section"><h3>
212<a name="s1.3" id="s1.3"></a><span class="item-no">1.3</span>&#160; History 212<a name="s1.3" id="s1.3"></a><span class="item-no">1.3</span>&#160; History
213</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 213</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
214<br /> 214<br />
215&#160; htmLawed was created in 2007 for use with <span class="term">LabWiki</span>, wiki software developed at PHP Labware, as suitable software could not be found. Existing PHP software like <span class="term">Kses</span>&#160;and <span class="term">HTMLPurifier</span>&#160;were deemed inadequate, slow, resource-intensive, or dependent on an extension or external application like <span class="term">HTML Tidy</span>. The core logic of htmLawed, that of identifying HTML elements and attributes, was based on the <span class="term">Kses</span>&#160;(version 0.2.2) HTML filter software of Ulf Harnhammar (it can still be used with code that uses <span class="term">Kses</span>; see <a href="#s2.6">section 2.6</a>). Support for HTML version 5 was added in May 2013 in a beta and in February 2017 in a production release.<br /> 215&#160; htmLawed was created in 2007 for use with <span class="term">LabWiki</span>, wiki software developed at PHP Labware, as suitable software could not be found. Existing PHP software like <span class="term">Kses</span>&#160;and <span class="term">HTMLPurifier</span>&#160;were deemed inadequate, slow, resource-intensive, or dependent on an extension or external application like <span class="term">HTML Tidy</span>. The core logic of htmLawed, that of identifying HTML elements and attributes, was based on the <span class="term">Kses</span>&#160;(version 0.2.2) HTML filter software of Ulf Harnhammar (it can still be used with code that uses <span class="term">Kses</span>; see <a href="#s2.6">section 2.6</a>). Support for HTML version 5 was added in May 2013 in a beta and in February 2017 in a production release.<br />
216<br /> 216<br />
217&#160; See <a href="#s4.3">section 4.3</a>&#160;for a detailed log of changes in htmLawed over the years, and <a href="#s4.10">section 4.10</a>&#160;for acknowledgements.<br /> 217&#160; See <a href="#s4.3">section 4.3</a>&#160;for a detailed log of changes in htmLawed over the years, and <a href="#s4.10">section 4.10</a>&#160;for acknowledgements.<br />
218 218
219</div> 219</div>
220<div class="sub-section"><h3> 220<div class="sub-section"><h3>
221<a name="s1.4" id="s1.4"></a><span class="item-no">1.4</span>&#160; License &amp; copyright 221<a name="s1.4" id="s1.4"></a><span class="item-no">1.4</span>&#160; License &amp; copyright
222</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 222</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
223<br /> 223<br />
224&#160; htmLawed is free and open-source software, copyrighted by Santosh Patnaik, MD, PhD, and dual-licensed with LGPL version <a href="http://www.gnu.org/licenses/lgpl-3.0.txt">3</a>, and GPL version <a href="http://www.gnu.org/licenses/gpl-2.0.txt">2</a>&#160;(or later) licenses.<br /> 224&#160; htmLawed is free and open-source software, copyrighted by Santosh Patnaik, MD, PhD, and dual-licensed with LGPL version <a href="http://www.gnu.org/licenses/lgpl-3.0.txt">3</a>, and GPL version <a href="http://www.gnu.org/licenses/gpl-2.0.txt">2</a>&#160;(or later) licenses.<br />
225 225
226</div> 226</div>
227<div class="sub-section"><h3> 227<div class="sub-section"><h3>
228<a name="s1.5" id="s1.5"></a><span class="item-no">1.5</span>&#160; Terms used here 228<a name="s1.5" id="s1.5"></a><span class="item-no">1.5</span>&#160; Terms used here
229</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 229</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
230<br /> 230<br />
231&#160; In this document, only HTML body-level elements are considered. htmLawed does not have support for head-level elements, <span class="term">body</span>, and the frame-level elements, <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>, and these elements are ignored here.<br /> 231&#160; In this document, only HTML body-level elements are considered. htmLawed does not have support for head-level elements, <span class="term">body</span>, and the frame-level elements, <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>, and these elements are ignored here.<br />
232<br /> 232<br />
233&#160; * &#160;<em>administrator</em>&#160;- or admin; person setting up the code that utilizes htmLawed; also, <em>user</em><br /> 233&#160; * &#160;<em>administrator</em>&#160;- or admin; person setting up the code that utilizes htmLawed; also, <em>user</em><br />
234&#160; * &#160;<em>attributes</em>&#160;- name-value pairs like <span class="term">href="http&#58;//x.com"</span>&#160;in opening tags<br /> 234&#160; * &#160;<em>attributes</em>&#160;- name-value pairs like <span class="term">href="http&#58;//x.com"</span>&#160;in opening tags<br />
235&#160; * &#160;<em>author</em>&#160;- see <em>writer</em><br /> 235&#160; * &#160;<em>author</em>&#160;- see <em>writer</em><br />
236&#160; * &#160;<em>character</em>&#160;- atomic unit of text; internally represented by a numeric <em>code-point</em>&#160;as specified by the <em>encoding</em>&#160;or <em>charset</em>&#160;in use<br /> 236&#160; * &#160;<em>character</em>&#160;- atomic unit of text; internally represented by a numeric <em>code-point</em>&#160;as specified by the <em>encoding</em>&#160;or <em>charset</em>&#160;in use<br />
237&#160; * &#160;<em>entity</em>&#160;- markup like <span class="term">&amp;gt;</span>&#160;and <span class="term">&amp;#160;</span>&#160;used to refer to a character<br /> 237&#160; * &#160;<em>entity</em>&#160;- markup like <span class="term">&amp;gt;</span>&#160;and <span class="term">&amp;#160;</span>&#160;used to refer to a character<br />
238&#160; * &#160;<em>element</em>&#160;- HTML element like <span class="term">a</span>&#160;and <span class="term">img</span><br /> 238&#160; * &#160;<em>element</em>&#160;- HTML element like <span class="term">a</span>&#160;and <span class="term">img</span><br />
239&#160; * &#160;<em>element content</em>&#160;- &#160;content between the opening and closing tags of an element, like <span class="term">click</span>&#160;of the <span class="term">&lt;a href="x"&gt;click&lt;/a&gt;</span>&#160;element<br /> 239&#160; * &#160;<em>element content</em>&#160;- &#160;content between the opening and closing tags of an element, like <span class="term">click</span>&#160;of the <span class="term">&lt;a href="x"&gt;click&lt;/a&gt;</span>&#160;element<br />
240&#160; * &#160;<em>HTML</em>&#160;- implies XHTML unless specified otherwise<br /> 240&#160; * &#160;<em>HTML</em>&#160;- implies XHTML unless specified otherwise<br />
241&#160; * &#160;<em>HTML body</em>&#160;- content in the <em>body</em>&#160;container of an HTML document<br /> 241&#160; * &#160;<em>HTML body</em>&#160;- content in the <em>body</em>&#160;container of an HTML document<br />
242&#160; * &#160;<em>input</em>&#160;- text given to htmLawed to process<br /> 242&#160; * &#160;<em>input</em>&#160;- text given to htmLawed to process<br />
243&#160; * &#160;<em>legal</em>&#160;- standard-compliant; also, <em>valid</em><br /> 243&#160; * &#160;<em>legal</em>&#160;- standard-compliant; also, <em>valid</em><br />
244&#160; * &#160;<em>processing</em>&#160;- involves filtering, correction, etc., of input<br /> 244&#160; * &#160;<em>processing</em>&#160;- involves filtering, correction, etc., of input<br />
245&#160; * &#160;<em>safe</em>&#160;- absence or reduction of certain characters and HTML elements and attributes in HTML of text that can otherwise potentially, and circumstantially, expose text readers to security vulnerabilities like cross-site scripting attacks (XSS)<br /> 245&#160; * &#160;<em>safe</em>&#160;- absence or reduction of certain characters and HTML elements and attributes in HTML of text that can otherwise potentially, and circumstantially, expose text readers to security vulnerabilities like cross-site scripting attacks (XSS)<br />
246&#160; * &#160;<em>scheme</em>&#160;- a URL protocol like <span class="term">http</span>&#160;and <span class="term">ftp</span><br /> 246&#160; * &#160;<em>scheme</em>&#160;- a URL protocol like <span class="term">http</span>&#160;and <span class="term">ftp</span><br />
247&#160; * &#160;<em>specification</em>&#160;- detailed description including rules that define HTML<br /> 247&#160; * &#160;<em>specification</em>&#160;- detailed description including rules that define HTML<br />
248&#160; * &#160;<em>standard</em>&#160;- widely accepted specification<br /> 248&#160; * &#160;<em>standard</em>&#160;- widely accepted specification<br />
249&#160; * &#160;<em>style property</em>&#160;- terms like <span class="term">border</span>&#160;and <span class="term">height</span>&#160;for which declarations are made in values for the <span class="term">style</span>&#160;attribute of elements<br /> 249&#160; * &#160;<em>style property</em>&#160;- terms like <span class="term">border</span>&#160;and <span class="term">height</span>&#160;for which declarations are made in values for the <span class="term">style</span>&#160;attribute of elements<br />
250&#160; * &#160;<em>tag</em>&#160;- markers like <span class="term">&lt;a href="x"&gt;</span>&#160;and <span class="term">&lt;/a&gt;</span>&#160;delineating element content; the opening tag can contain attributes<br /> 250&#160; * &#160;<em>tag</em>&#160;- markers like <span class="term">&lt;a href="x"&gt;</span>&#160;and <span class="term">&lt;/a&gt;</span>&#160;delineating element content; the opening tag can contain attributes<br />
251&#160; * &#160;<em>tag content</em>&#160;- consists of tag markers <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>, element names like <span class="term">div</span>, and possibly attributes<br /> 251&#160; * &#160;<em>tag content</em>&#160;- consists of tag markers <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>, element names like <span class="term">div</span>, and possibly attributes<br />
252&#160; * &#160;<em>user</em>&#160;- administrator<br /> 252&#160; * &#160;<em>user</em>&#160;- administrator<br />
253&#160; * &#160;<em>valid</em>&#160;- see <em>legal</em><br /> 253&#160; * &#160;<em>valid</em>&#160;- see <em>legal</em><br />
254&#160; * &#160;<em>writer</em>&#160;- end-user like a blog commenter providing the input that is to be processed; also, <em>author</em><br /> 254&#160; * &#160;<em>writer</em>&#160;- end-user like a blog commenter providing the input that is to be processed; also, <em>author</em><br />
255&#160; * &#160;<em>XHTML</em>&#160;- XML-compliant HTML; parsing rules for XHTML are more strict than for regular HTML<br /> 255&#160; * &#160;<em>XHTML</em>&#160;- XML-compliant HTML; parsing rules for XHTML are more strict than for regular HTML<br />
256 256
257</div> 257</div>
258<div class="sub-section"><h3> 258<div class="sub-section"><h3>
259<a name="s1.6" id="s1.6"></a><span class="item-no">1.6</span>&#160; Availability 259<a name="s1.6" id="s1.6"></a><span class="item-no">1.6</span>&#160; Availability
260</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 260</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
261<br /> 261<br />
262&#160; htmLawed can be downloaded for free at its <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">website</a>. Besides the <span class="term">htmLawed.php</span>&#160;file, the download has the htmLawed documentation (this document) in plain <a href="htmLawed_README.txt">text</a>&#160;and <a href="htmLawed_README.htm">HTML</a>&#160;formats, a script for <a href="htmLawedTest.php">testing</a>, and a text file for <a href="htmLawed_TESTCASE.txt">test-cases</a>. htmLawed is also available as a PHP class (OOP code) at its website.<br /> 262&#160; htmLawed can be downloaded for free at its <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">website</a>. Besides the <span class="term">htmLawed.php</span>&#160;file, the download has the htmLawed documentation (this document) in plain <a href="htmLawed_README.txt">text</a>&#160;and <a href="htmLawed_README.htm">HTML</a>&#160;formats, a script for <a href="htmLawedTest.php">testing</a>, and a text file for <a href="htmLawed_TESTCASE.txt">test-cases</a>. htmLawed is also available as a PHP class (OOP code) at its website.<br />
263 263
264</div> 264</div>
265</div> 265</div>
266<div class="section"><h2> 266<div class="section"><h2>
267<a name="s2" id="s2"></a><span class="item-no">2</span>&#160; Usage 267<a name="s2" id="s2"></a><span class="item-no">2</span>&#160; Usage
268</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 268</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
269<br /> 269<br />
270&#160; htmLawed works in PHP version 4.4 or higher. Either <span class="term">include()</span>&#160;the <span class="term">htmLawed.php</span>&#160;file, or copy-paste the entire code.<br /> 270&#160; htmLawed works in PHP version 4.4 or higher. Either <span class="term">include()</span>&#160;the <span class="term">htmLawed.php</span>&#160;file, or copy-paste the entire code.<br />
271<br /> 271<br />
272&#160; To use with PHP 4.3, have the following code included:<br /> 272&#160; To use with PHP 4.3, have the following code included:<br />
273<br /> 273<br />
274 274
275<code class="code">&#160; &#160; if(!function_exists(&#39;ctype_digit&#39;)){</code> 275<code class="code">&#160; &#160; if(!function_exists(&#39;ctype_digit&#39;)){</code>
276<br /> 276<br />
277 277
278<code class="code">&#160; &#160; &#160;function ctype_digit($var){</code> 278<code class="code">&#160; &#160; &#160;function ctype_digit($var){</code>
279<br /> 279<br />
280 280
281<code class="code">&#160; &#160; &#160; return ((int) $var == $var);</code> 281<code class="code">&#160; &#160; &#160; return ((int) $var == $var);</code>
282<br /> 282<br />
283 283
284<code class="code">&#160; &#160; &#160;}</code> 284<code class="code">&#160; &#160; &#160;}</code>
285<br /> 285<br />
286 286
287<code class="code">&#160; &#160; }</code> 287<code class="code">&#160; &#160; }</code>
288<br /> 288<br />
289 289
290<div class="sub-section"><h3> 290<div class="sub-section"><h3>
291<a name="s2.1" id="s2.1"></a><span class="item-no">2.1</span>&#160; Simple 291<a name="s2.1" id="s2.1"></a><span class="item-no">2.1</span>&#160; Simple
292</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 292</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
293<br /> 293<br />
294&#160; The input text to be processed, <span class="term">$text</span>, is passed as an argument of type string; <span class="term">htmLawed()</span>&#160;returns the processed string:<br /> 294&#160; The input text to be processed, <span class="term">$text</span>, is passed as an argument of type string; <span class="term">htmLawed()</span>&#160;returns the processed string:<br />
295<br /> 295<br />
296 296
297<code class="code">&#160; &#160; $processed = htmLawed($text);</code> 297<code class="code">&#160; &#160; $processed = htmLawed($text);</code>
298<br /> 298<br />
299<br /> 299<br />
300&#160; With the <span class="term">htmLawed class</span>&#160;(<a href="#s1.6">section 1.6</a>), usage is:<br /> 300&#160; With the <span class="term">htmLawed class</span>&#160;(<a href="#s1.6">section 1.6</a>), usage is:<br />
301<br /> 301<br />
302 302
303<code class="code">&#160; &#160; $processed = htmLawed&#58;&#58;hl($text);</code> 303<code class="code">&#160; &#160; $processed = htmLawed&#58;&#58;hl($text);</code>
304<br /> 304<br />
305<br /> 305<br />
306&#160; <strong>Notes</strong>: (1) If input is from a <span class="term">$_GET</span>&#160;or <span class="term">$_POST</span>&#160;value, and <span class="term">magic quotes</span>&#160;are enabled on the PHP setup, run <span class="term">stripslashes()</span>&#160;on the input before passing to htmLawed. (2) htmLawed does not have support for head-level elements, <span class="term">body</span>, and the frame-level elements, <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>.<br /> 306&#160; <strong>Notes</strong>: (1) If input is from a <span class="term">$_GET</span>&#160;or <span class="term">$_POST</span>&#160;value, and <span class="term">magic quotes</span>&#160;are enabled on the PHP setup, run <span class="term">stripslashes()</span>&#160;on the input before passing to htmLawed. (2) htmLawed does not have support for head-level elements, <span class="term">body</span>, and the frame-level elements, <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>.<br />
307<br /> 307<br />
308&#160; By default, htmLawed will process the text allowing all valid HTML elements/tags and commonly used URL schemes and CSS style properties. It will allow Javascript code, <span class="term">CDATA</span>&#160;sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- <span class="term">$config</span>&#160;and <span class="term">$spec</span>:<br /> 308&#160; By default, htmLawed will process the text allowing all valid HTML elements/tags and commonly used URL schemes and CSS style properties. It will allow Javascript code, <span class="term">CDATA</span>&#160;sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- <span class="term">$config</span>&#160;and <span class="term">$spec</span>:<br />
309<br /> 309<br />
310 310
311<code class="code">&#160; &#160; $processed = htmLawed($text, $config, $spec);</code> 311<code class="code">&#160; &#160; $processed = htmLawed($text, $config, $spec);</code>
312<br /> 312<br />
313<br /> 313<br />
314&#160; The <span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;arguments are detailed below. Some examples are shown in <a href="#s2.9">section 2.9</a>. For maximum protection against <span class="term">XSS</span>&#160;and other security vulnerabilities, consider using the <span class="term">safe</span>&#160;parameter; see <a href="#s3.6">section 3.6</a>.<br /> 314&#160; The <span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;arguments are detailed below. Some examples are shown in <a href="#s2.9">section 2.9</a>. For maximum protection against <span class="term">XSS</span>&#160;and other security vulnerabilities, consider using the <span class="term">safe</span>&#160;parameter; see <a href="#s3.6">section 3.6</a>.<br />
315 315
316</div> 316</div>
317<div class="sub-section"><h3> 317<div class="sub-section"><h3>
318<a name="s2.2" id="s2.2"></a><span class="item-no">2.2</span>&#160; Configuring htmLawed using the <span class="term">$config</span>&#160;argument 318<a name="s2.2" id="s2.2"></a><span class="item-no">2.2</span>&#160; Configuring htmLawed using the <span class="term">$config</span>&#160;argument
319</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 319</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
320<br /> 320<br />
321&#160; <span class="term">$config</span>&#160;instructs htmLawed on how to tackle certain tasks. When <span class="term">$config</span>&#160;is not specified, or not set as an array (e.g., <span class="term">$config = 1</span>), htmLawed will take default actions. One or many of the task-action or parameter-value pairs can be specified in <span class="term">$config</span>&#160;as array key-value pairs. If a parameter is not specified, htmLawed will use the default value for it, indicated further below. In PHP code, parameter values that are integers should not be quoted and should be used as numeric types (unless meant as string/text). Thus, for instance:<br /> 321&#160; <span class="term">$config</span>&#160;instructs htmLawed on how to tackle certain tasks. When <span class="term">$config</span>&#160;is not specified, or not set as an array (e.g., <span class="term">$config = 1</span>), htmLawed will take default actions. One or many of the task-action or parameter-value pairs can be specified in <span class="term">$config</span>&#160;as array key-value pairs. If a parameter is not specified, htmLawed will use the default value for it, indicated further below. In PHP code, parameter values that are integers should not be quoted and should be used as numeric types (unless meant as string/text). Thus, for instance:<br />
322<br /> 322<br />
323 323
324<code class="code">&#160; &#160; $config = array(&#39;comment&#39;=&gt;0, &#39;cdata&#39;=&gt;1, &#39;elements&#39;=&gt;&#39;a, b, strong&#39;);</code> 324<code class="code">&#160; &#160; $config = array(&#39;comment&#39;=&gt;0, &#39;cdata&#39;=&gt;1, &#39;elements&#39;=&gt;&#39;a, b, strong&#39;);</code>
325<br /> 325<br />
326 326
327<code class="code">&#160; &#160; $processed = htmLawed($text, $config);</code> 327<code class="code">&#160; &#160; $processed = htmLawed($text, $config);</code>
328<br /> 328<br />
329<br /> 329<br />
330&#160; Below are the various parameters that can be specified in <span class="term">$config</span>.<br /> 330&#160; Below are the various parameters that can be specified in <span class="term">$config</span>.<br />
331<br /> 331<br />
332&#160; Key: <span class="term">&#42;</span>&#160;default, <span class="term">^</span>&#160;different from htmLawed versions below 1.2, <span class="term">~</span>&#160;different default when <span class="term">valid_xhtml</span>&#160;is set to <span class="term">1</span>&#160;(see <a href="#s3.5">section 3.5</a>), <span class="term">"</span>&#160;different default when <span class="term">safe</span>&#160;is set to <span class="term">1</span>&#160;(see <a href="#s3.6">section 3.6</a>)<br /> 332&#160; Key: <span class="term">&#42;</span>&#160;default, <span class="term">^</span>&#160;different from htmLawed versions below 1.2, <span class="term">~</span>&#160;different default when <span class="term">valid_xhtml</span>&#160;is set to <span class="term">1</span>&#160;(see <a href="#s3.5">section 3.5</a>), <span class="term">"</span>&#160;different default when <span class="term">safe</span>&#160;is set to <span class="term">1</span>&#160;(see <a href="#s3.6">section 3.6</a>)<br />
333<br /> 333<br />
334&#160; <strong>abs_url</strong><br /> 334&#160; <strong>abs_url</strong><br />
335&#160; Make URLs absolute or relative; <span class="term">$config["base_url"]</span>&#160;needs to be set; see <a href="#s3.4.4">section 3.4.4</a><br /> 335&#160; Make URLs absolute or relative; <span class="term">$config["base_url"]</span>&#160;needs to be set; see <a href="#s3.4.4">section 3.4.4</a><br />
336<br /> 336<br />
337&#160; <span class="term">-1</span>&#160;- make relative<br /> 337&#160; <span class="term">-1</span>&#160;- make relative<br />
338&#160; <span class="term">0</span>&#160;- no action &#160;*<br /> 338&#160; <span class="term">0</span>&#160;- no action &#160;*<br />
339&#160; <span class="term">1</span>&#160;- make absolute<br /> 339&#160; <span class="term">1</span>&#160;- make absolute<br />
340<br /> 340<br />
341&#160; <strong>and_mark</strong><br /> 341&#160; <strong>and_mark</strong><br />
342&#160; Mark <span class="term">&amp;</span>&#160;characters in the original input; see <a href="#s3.2">section 3.2</a><br /> 342&#160; Mark <span class="term">&amp;</span>&#160;characters in the original input; see <a href="#s3.2">section 3.2</a><br />
343<br /> 343<br />
344&#160; <strong>anti_link_spam</strong><br /> 344&#160; <strong>anti_link_spam</strong><br />
345&#160; Anti-link-spam measure; see <a href="#s3.4.7">section 3.4.7</a><br /> 345&#160; Anti-link-spam measure; see <a href="#s3.4.7">section 3.4.7</a><br />
346<br /> 346<br />
347&#160; <span class="term">0</span>&#160;- no measure taken &#160;*<br /> 347&#160; <span class="term">0</span>&#160;- no measure taken &#160;*<br />
348&#160; <em>array("regex1", "regex2")</em>&#160;- will ensure a <span class="term">rel</span>&#160;attribute with <span class="term">nofollow</span>&#160;in its value in case the <span class="term">href</span>&#160;attribute value matches the regular expression pattern <span class="term">regex1</span>, and/or will remove <span class="term">href</span>&#160;if its value matches the regular expression pattern <span class="term">regex2</span>. E.g., <span class="term">array("/./", "/&#58;//\W&#42;(?!(abc\.com|xyz\.org))/")</span>; see <a href="#s3.4.7">section 3.4.7</a>&#160;for more.<br /> 348&#160; <em>array("regex1", "regex2")</em>&#160;- will ensure a <span class="term">rel</span>&#160;attribute with <span class="term">nofollow</span>&#160;in its value in case the <span class="term">href</span>&#160;attribute value matches the regular expression pattern <span class="term">regex1</span>, and/or will remove <span class="term">href</span>&#160;if its value matches the regular expression pattern <span class="term">regex2</span>. E.g., <span class="term">array("/./", "/&#58;//\W&#42;(?!(abc\.com|xyz\.org))/")</span>; see <a href="#s3.4.7">section 3.4.7</a>&#160;for more.<br />
349<br /> 349<br />
350&#160; <strong>anti_mail_spam</strong><br /> 350&#160; <strong>anti_mail_spam</strong><br />
351&#160; Anti-mail-spam measure; see <a href="#s3.4.7">section 3.4.7</a><br /> 351&#160; Anti-mail-spam measure; see <a href="#s3.4.7">section 3.4.7</a><br />
352<br /> 352<br />
353&#160; <span class="term">0</span>&#160;- no measure taken &#160;*<br /> 353&#160; <span class="term">0</span>&#160;- no measure taken &#160;*<br />
354&#160; <em>word</em>&#160;- <span class="term">@</span>&#160;in mail address in <span class="term">href</span>&#160;attribute value is replaced with specified <em>word</em><br /> 354&#160; <em>word</em>&#160;- <span class="term">@</span>&#160;in mail address in <span class="term">href</span>&#160;attribute value is replaced with specified <em>word</em><br />
355<br /> 355<br />
356&#160; <strong>balance</strong><br /> 356&#160; <strong>balance</strong><br />
357&#160; Balance tags for well-formedness and proper nesting; see <a href="#s3.3.3">section 3.3.3</a><br /> 357&#160; Balance tags for well-formedness and proper nesting; see <a href="#s3.3.3">section 3.3.3</a><br />
358<br /> 358<br />
359&#160; <span class="term">0</span>&#160;- no<br /> 359&#160; <span class="term">0</span>&#160;- no<br />
360&#160; <span class="term">1</span>&#160;- yes &#160;*<br /> 360&#160; <span class="term">1</span>&#160;- yes &#160;*<br />
361<br /> 361<br />
362&#160; <strong>base_url</strong><br /> 362&#160; <strong>base_url</strong><br />
363&#160; Base URL value that needs to be set if <span class="term">$config["abs_url"]</span>&#160;is not <span class="term">0</span>; see <a href="#s3.4.4">section 3.4.4</a><br /> 363&#160; Base URL value that needs to be set if <span class="term">$config["abs_url"]</span>&#160;is not <span class="term">0</span>; see <a href="#s3.4.4">section 3.4.4</a><br />
364<br /> 364<br />
365&#160; <strong>cdata</strong><br /> 365&#160; <strong>cdata</strong><br />
366&#160; Handling of <span class="term">CDATA</span>&#160;sections; see <a href="#s3.3.1">section 3.3.1</a><br /> 366&#160; Handling of <span class="term">CDATA</span>&#160;sections; see <a href="#s3.3.1">section 3.3.1</a><br />
367<br /> 367<br />
368&#160; <span class="term">0</span>&#160;- don't consider <span class="term">CDATA</span>&#160;sections as markup and proceed as if plain text &#160;"<br /> 368&#160; <span class="term">0</span>&#160;- don't consider <span class="term">CDATA</span>&#160;sections as markup and proceed as if plain text &#160;"<br />
369&#160; <span class="term">1</span>&#160;- remove<br /> 369&#160; <span class="term">1</span>&#160;- remove<br />
370&#160; <span class="term">2</span>&#160;- allow, but neutralize any <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;inside by converting them to named entities<br /> 370&#160; <span class="term">2</span>&#160;- allow, but neutralize any <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;inside by converting them to named entities<br />
371&#160; <span class="term">3</span>&#160;- allow &#160;*<br /> 371&#160; <span class="term">3</span>&#160;- allow &#160;*<br />
372<br /> 372<br />
373&#160; <strong>clean_ms_char</strong><br /> 373&#160; <strong>clean_ms_char</strong><br />
374&#160; Replace <em>discouraged</em>&#160;characters introduced by Microsoft Word, etc.; see <a href="#s3.1">section 3.1</a><br /> 374&#160; Replace <em>discouraged</em>&#160;characters introduced by Microsoft Word, etc.; see <a href="#s3.1">section 3.1</a><br />
375<br /> 375<br />
376&#160; <span class="term">0</span>&#160;- no &#160;*<br /> 376&#160; <span class="term">0</span>&#160;- no &#160;*<br />
377&#160; <span class="term">1</span>&#160;- yes<br /> 377&#160; <span class="term">1</span>&#160;- yes<br />
378&#160; <span class="term">2</span>&#160;- yes, but replace special single &amp; double quotes with ordinary ones<br /> 378&#160; <span class="term">2</span>&#160;- yes, but replace special single &amp; double quotes with ordinary ones<br />
379<br /> 379<br />
380&#160; <strong>comment</strong><br /> 380&#160; <strong>comment</strong><br />
381&#160; Handling of HTML comments; see <a href="#s3.3.1">section 3.3.1</a><br /> 381&#160; Handling of HTML comments; see <a href="#s3.3.1">section 3.3.1</a><br />
382<br /> 382<br />
383&#160; <span class="term">0</span>&#160;- don't consider comments as markup and proceed as if plain text &#160;"<br /> 383&#160; <span class="term">0</span>&#160;- don't consider comments as markup and proceed as if plain text &#160;"<br />
384&#160; <span class="term">1</span>&#160;- remove<br /> 384&#160; <span class="term">1</span>&#160;- remove<br />
385&#160; <span class="term">2</span>&#160;- allow, but neutralize any <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;inside by converting to named entities<br /> 385&#160; <span class="term">2</span>&#160;- allow, but neutralize any <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;inside by converting to named entities<br />
386&#160; <span class="term">3</span>&#160;- allow &#160;*<br /> 386&#160; <span class="term">3</span>&#160;- allow &#160;*<br />
387<br /> 387<br />
388&#160; <strong>css_expression</strong><br /> 388&#160; <strong>css_expression</strong><br />
389&#160; Allow dynamic CSS expression by not removing the expression from CSS property values in <span class="term">style</span>&#160;attributes; see <a href="#s3.4.8">section 3.4.8</a><br /> 389&#160; Allow dynamic CSS expression by not removing the expression from CSS property values in <span class="term">style</span>&#160;attributes; see <a href="#s3.4.8">section 3.4.8</a><br />
390<br /> 390<br />
391&#160; <span class="term">0</span>&#160;- remove &#160;*<br /> 391&#160; <span class="term">0</span>&#160;- remove &#160;*<br />
392&#160; <span class="term">1</span>&#160;- allow<br /> 392&#160; <span class="term">1</span>&#160;- allow<br />
393<br /> 393<br />
394&#160; <strong>deny_attribute</strong><br /> 394&#160; <strong>deny_attribute</strong><br />
395&#160; Denied HTML attributes; see <a href="#s3.4">section 3.4</a><br /> 395&#160; Denied HTML attributes; see <a href="#s3.4">section 3.4</a><br />
396<br /> 396<br />
397&#160; <span class="term">0</span>&#160;- none &#160;*<br /> 397&#160; <span class="term">0</span>&#160;- none &#160;*<br />
398&#160; <em>string</em>&#160;- dictated by values in <em>string</em><br /> 398&#160; <em>string</em>&#160;- dictated by values in <em>string</em><br />
399&#160; <span class="term">on&#42;</span>&#160;- on* event attributes like <span class="term">onfocus</span>&#160;not allowed &#160;"<br /> 399&#160; <span class="term">on&#42;</span>&#160;- on* event attributes like <span class="term">onfocus</span>&#160;not allowed &#160;"<br />
400<br /> 400<br />
401&#160; <strong>direct_nest_list</strong><br /> 401&#160; <strong>direct_nest_list</strong><br />
402&#160; Allow direct nesting of a list within another without requiring it to be a list item; see <a href="#s3.3.4">section 3.3.4</a><br /> 402&#160; Allow direct nesting of a list within another without requiring it to be a list item; see <a href="#s3.3.4">section 3.3.4</a><br />
403<br /> 403<br />
404&#160; <span class="term">0</span>&#160;- no &#160;*<br /> 404&#160; <span class="term">0</span>&#160;- no &#160;*<br />
405&#160; <span class="term">1</span>&#160;- yes<br /> 405&#160; <span class="term">1</span>&#160;- yes<br />
406<br /> 406<br />
407&#160; <strong>elements</strong><br /> 407&#160; <strong>elements</strong><br />
408&#160; Allowed HTML elements; see <a href="#s3.3">section 3.3</a><br /> 408&#160; Allowed HTML elements; see <a href="#s3.3">section 3.3</a><br />
409<br /> 409<br />
410&#160; <em>all</em>&#160;- *^<br /> 410&#160; <em>all</em>&#160;- *^<br />
411&#160; <span class="term">&#42; -acronym -big -center -dir -font -isindex -s -strike -tt</span>&#160;- &#160;~^<br /> 411&#160; <span class="term">&#42; -acronym -big -center -dir -font -isindex -s -strike -tt</span>&#160;- &#160;~^<br />
412&#160; <em>applet, audio, canvas, embed, iframe, object, script, and video elements not allowed</em>&#160;- &#160;"^<br /> 412&#160; <em>applet, audio, canvas, embed, iframe, object, script, and video elements not allowed</em>&#160;- &#160;"^<br />
413<br /> 413<br />
414&#160; <strong>hexdec_entity</strong><br /> 414&#160; <strong>hexdec_entity</strong><br />
415&#160; Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see <a href="#s3.2">section 3.2</a><br /> 415&#160; Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see <a href="#s3.2">section 3.2</a><br />
416<br /> 416<br />
417&#160; <span class="term">0</span>&#160;- no<br /> 417&#160; <span class="term">0</span>&#160;- no<br />
418&#160; <span class="term">1</span>&#160;- yes &#160;*<br /> 418&#160; <span class="term">1</span>&#160;- yes &#160;*<br />
419&#160; <span class="term">2</span>&#160;- convert decimal to hexadecimal ones<br /> 419&#160; <span class="term">2</span>&#160;- convert decimal to hexadecimal ones<br />
420<br /> 420<br />
421&#160; <strong>hook</strong><br /> 421&#160; <strong>hook</strong><br />
422&#160; Name of an optional hook function to alter the input string, <span class="term">$config</span>&#160;or <span class="term">$spec</span>&#160;before htmLawed enters the main phase of its work; see <a href="#s3.7">section 3.7</a><br /> 422&#160; Name of an optional hook function to alter the input string, <span class="term">$config</span>&#160;or <span class="term">$spec</span>&#160;before htmLawed enters the main phase of its work; see <a href="#s3.7">section 3.7</a><br />
423<br /> 423<br />
424&#160; <span class="term">0</span>&#160;- no hook function &#160;*<br /> 424&#160; <span class="term">0</span>&#160;- no hook function &#160;*<br />
425&#160; <em>name</em>&#160;- <em>name</em>&#160;is name of the hook function<br /> 425&#160; <em>name</em>&#160;- <em>name</em>&#160;is name of the hook function<br />
426<br /> 426<br />
427&#160; <strong>hook_tag</strong><br /> 427&#160; <strong>hook_tag</strong><br />
428&#160; Name of an optional hook function to alter tag content finalized by htmLawed; see <a href="#s3.4.9">section 3.4.9</a><br /> 428&#160; Name of an optional hook function to alter tag content finalized by htmLawed; see <a href="#s3.4.9">section 3.4.9</a><br />
429<br /> 429<br />
430&#160; <span class="term">0</span>&#160;- no hook function &#160;*<br /> 430&#160; <span class="term">0</span>&#160;- no hook function &#160;*<br />
431&#160; <em>name</em>&#160;- <em>name</em>&#160;is name of the hook function<br /> 431&#160; <em>name</em>&#160;- <em>name</em>&#160;is name of the hook function<br />
432<br /> 432<br />
433&#160; <strong>keep_bad</strong><br /> 433&#160; <strong>keep_bad</strong><br />
434&#160; Neutralize <em>bad</em>&#160;tags by converting their <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;characters to entities, or remove them; see <a href="#s3.3.3">section 3.3.3</a><br /> 434&#160; Neutralize <em>bad</em>&#160;tags by converting their <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;characters to entities, or remove them; see <a href="#s3.3.3">section 3.3.3</a><br />
435<br /> 435<br />
436&#160; <span class="term">0</span>&#160;- remove<br /> 436&#160; <span class="term">0</span>&#160;- remove<br />
437&#160; <span class="term">1</span>&#160;- neutralize both tags and element content<br /> 437&#160; <span class="term">1</span>&#160;- neutralize both tags and element content<br />
438&#160; <span class="term">2</span>&#160;- remove tags but neutralize element content<br /> 438&#160; <span class="term">2</span>&#160;- remove tags but neutralize element content<br />
439&#160; <span class="term">3</span>&#160;and <span class="term">4</span>&#160;- like <span class="term">1</span>&#160;and <span class="term">2</span>&#160;but remove if text (<span class="term">pcdata</span>) is invalid in parent element<br /> 439&#160; <span class="term">3</span>&#160;and <span class="term">4</span>&#160;- like <span class="term">1</span>&#160;and <span class="term">2</span>&#160;but remove if text (<span class="term">pcdata</span>) is invalid in parent element<br />
440&#160; <span class="term">5</span>&#160;and <span class="term">6</span>&#160;* - &#160;like <span class="term">3</span>&#160;and <span class="term">4</span>&#160;but line-breaks, tabs and spaces are left<br /> 440&#160; <span class="term">5</span>&#160;and <span class="term">6</span>&#160;* - &#160;like <span class="term">3</span>&#160;and <span class="term">4</span>&#160;but line-breaks, tabs and spaces are left<br />
441<br /> 441<br />
442&#160; <strong>lc_std_val</strong><br /> 442&#160; <strong>lc_std_val</strong><br />
443&#160; For XHTML compliance, predefined, standard attribute values, like <span class="term">get</span>&#160;for the <span class="term">method</span>&#160;attribute of <span class="term">form</span>, must be lowercased; see <a href="#s3.4.5">section 3.4.5</a><br /> 443&#160; For XHTML compliance, predefined, standard attribute values, like <span class="term">get</span>&#160;for the <span class="term">method</span>&#160;attribute of <span class="term">form</span>, must be lowercased; see <a href="#s3.4.5">section 3.4.5</a><br />
444<br /> 444<br />
445&#160; <span class="term">0</span>&#160;- no<br /> 445&#160; <span class="term">0</span>&#160;- no<br />
446&#160; <span class="term">1</span>&#160;- yes &#160;*<br /> 446&#160; <span class="term">1</span>&#160;- yes &#160;*<br />
447<br /> 447<br />
448&#160; <strong>make_tag_strict</strong><br /> 448&#160; <strong>make_tag_strict</strong><br />
449&#160; Transform or remove these deprecated HTML elements, even if they are allowed by the admin: acronym, applet, big, center, dir, font, isindex, s, strike, tt; see <a href="#s3.3.2">section 3.3.2</a><br /> 449&#160; Transform or remove these deprecated HTML elements, even if they are allowed by the admin: acronym, applet, big, center, dir, font, isindex, s, strike, tt; see <a href="#s3.3.2">section 3.3.2</a><br />
450<br /> 450<br />
451&#160; <span class="term">0</span>&#160;- no<br /> 451&#160; <span class="term">0</span>&#160;- no<br />
452&#160; <span class="term">1</span>&#160;- yes, but leave <span class="term">applet</span>&#160;and <span class="term">isindex</span>&#160;that currently cannot be transformed &#160;*^<br /> 452&#160; <span class="term">1</span>&#160;- yes, but leave <span class="term">applet</span>&#160;and <span class="term">isindex</span>&#160;that currently cannot be transformed &#160;*^<br />
453&#160; <span class="term">2</span>&#160;- yes, removing <span class="term">applet</span>&#160;and <span class="term">isindex</span>&#160;elements and their contents (nested elements remain) &#160;~^<br /> 453&#160; <span class="term">2</span>&#160;- yes, removing <span class="term">applet</span>&#160;and <span class="term">isindex</span>&#160;elements and their contents (nested elements remain) &#160;~^<br />
454<br /> 454<br />
455&#160; <strong>named_entity</strong><br /> 455&#160; <strong>named_entity</strong><br />
456&#160; Allow non-universal named HTML entities, or convert to numeric ones; see <a href="#s3.2">section 3.2</a><br /> 456&#160; Allow non-universal named HTML entities, or convert to numeric ones; see <a href="#s3.2">section 3.2</a><br />
457<br /> 457<br />
458&#160; <span class="term">0</span>&#160;- convert<br /> 458&#160; <span class="term">0</span>&#160;- convert<br />
459&#160; <span class="term">1</span>&#160;- allow &#160;*<br /> 459&#160; <span class="term">1</span>&#160;- allow &#160;*<br />
460<br /> 460<br />
461&#160; <strong>no_deprecated_attr</strong><br /> 461&#160; <strong>no_deprecated_attr</strong><br />
462&#160; Allow deprecated attributes or transform them; see <a href="#s3.4.6">section 3.4.6</a><br /> 462&#160; Allow deprecated attributes or transform them; see <a href="#s3.4.6">section 3.4.6</a><br />
463<br /> 463<br />
464&#160; <span class="term">0</span>&#160;- allow<br /> 464&#160; <span class="term">0</span>&#160;- allow<br />
465&#160; <span class="term">1</span>&#160;- transform, but <span class="term">name</span>&#160;attributes for <span class="term">a</span>&#160;and <span class="term">map</span>&#160;are retained &#160;*<br /> 465&#160; <span class="term">1</span>&#160;- transform, but <span class="term">name</span>&#160;attributes for <span class="term">a</span>&#160;and <span class="term">map</span>&#160;are retained &#160;*<br />
466&#160; <span class="term">2</span>&#160;- transform<br /> 466&#160; <span class="term">2</span>&#160;- transform<br />
467<br /> 467<br />
468&#160; <strong>parent</strong><br /> 468&#160; <strong>parent</strong><br />
469&#160; Name of the parent element, possibly imagined, that will hold the input; see <a href="#s3.3">section 3.3</a><br /> 469&#160; Name of the parent element, possibly imagined, that will hold the input; see <a href="#s3.3">section 3.3</a><br />
470<br /> 470<br />
471&#160; <strong>safe</strong><br /> 471&#160; <strong>safe</strong><br />
472&#160; Magic parameter to make input the most secure against vulnerabilities like XSS without needing to specify other relevant <span class="term">$config</span>&#160;parameters; see <a href="#s3.6">section 3.6</a><br /> 472&#160; Magic parameter to make input the most secure against vulnerabilities like XSS without needing to specify other relevant <span class="term">$config</span>&#160;parameters; see <a href="#s3.6">section 3.6</a><br />
473<br /> 473<br />
474&#160; <span class="term">0</span>&#160;- no &#160;*<br /> 474&#160; <span class="term">0</span>&#160;- no &#160;*<br />
475&#160; <span class="term">1</span>&#160;- will auto-adjust other relevant <span class="term">$config</span>&#160;parameters (indicated by <span class="term">"</span>&#160;in this list) &#160;^<br /> 475&#160; <span class="term">1</span>&#160;- will auto-adjust other relevant <span class="term">$config</span>&#160;parameters (indicated by <span class="term">"</span>&#160;in this list) &#160;^<br />
476<br /> 476<br />
477&#160; <strong>schemes</strong><br /> 477&#160; <strong>schemes</strong><br />
478&#160; Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs (or <span class="term">!</span>&#160;to <em>deny</em>&#160;any URL); <span class="term">&#42;</span>&#160;covers all unspecified attributes; see <a href="#s3.4.3">section 3.4.3</a><br /> 478&#160; Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs (or <span class="term">!</span>&#160;to <em>deny</em>&#160;any URL); <span class="term">&#42;</span>&#160;covers all unspecified attributes; see <a href="#s3.4.3">section 3.4.3</a><br />
479<br /> 479<br />
480&#160; <span class="term">href&#58; aim, app, feed, file, ftp, gopher, http, https, javascript, irc, mailto, news, nntp, sftp, ssh, tel, telnet; &#42;&#58;data, file, http, https, javascript</span>&#160; *^<br /> 480&#160; <span class="term">href&#58; aim, app, feed, file, ftp, gopher, http, https, javascript, irc, mailto, news, nntp, sftp, ssh, tel, telnet; &#42;&#58;data, file, http, https, javascript</span>&#160; *^<br />
481&#160; <span class="term">href&#58; aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style&#58; !; &#42;&#58;file, http, https</span>&#160; "<br /> 481&#160; <span class="term">href&#58; aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style&#58; !; &#42;&#58;file, http, https</span>&#160; "<br />
482<br /> 482<br />
483&#160; <strong>show_setting</strong><br /> 483&#160; <strong>show_setting</strong><br />
484&#160; Name of a PHP variable to assign the <em>finalized</em>&#160;<span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;values; see <a href="#s3.8">section 3.8</a><br /> 484&#160; Name of a PHP variable to assign the <em>finalized</em>&#160;<span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;values; see <a href="#s3.8">section 3.8</a><br />
485<br /> 485<br />
486&#160; <strong>style_pass</strong><br /> 486&#160; <strong>style_pass</strong><br />
487&#160; Ignore <span class="term">style</span>&#160;attribute values, letting them through without any alteration<br /> 487&#160; Ignore <span class="term">style</span>&#160;attribute values, letting them through without any alteration<br />
488<br /> 488<br />
489&#160; <span class="term">0</span>&#160;- no *<br /> 489&#160; <span class="term">0</span>&#160;- no *<br />
490&#160; <span class="term">1</span>&#160;- htmLawed will let through any <span class="term">style</span>&#160;value; see <a href="#s3.4.8">section 3.4.8</a><br /> 490&#160; <span class="term">1</span>&#160;- htmLawed will let through any <span class="term">style</span>&#160;value; see <a href="#s3.4.8">section 3.4.8</a><br />
491<br /> 491<br />
492&#160; <strong>tidy</strong><br /> 492&#160; <strong>tidy</strong><br />
493&#160; Beautify or compact HTML code; see <a href="#s3.3.5">section 3.3.5</a><br /> 493&#160; Beautify or compact HTML code; see <a href="#s3.3.5">section 3.3.5</a><br />
494<br /> 494<br />
495&#160; <span class="term">-1</span>&#160;- compact<br /> 495&#160; <span class="term">-1</span>&#160;- compact<br />
496&#160; <span class="term">0</span>&#160;- no &#160;*<br /> 496&#160; <span class="term">0</span>&#160;- no &#160;*<br />
497&#160; <span class="term">1</span>&#160;or <em>string</em>&#160;- beautify (custom format specified by <span class="term">string</span>)<br /> 497&#160; <span class="term">1</span>&#160;or <em>string</em>&#160;- beautify (custom format specified by <span class="term">string</span>)<br />
498<br /> 498<br />
499&#160; <strong>unique_ids</strong><br /> 499&#160; <strong>unique_ids</strong><br />
500&#160; <span class="term">id</span>&#160;attribute value checks; see <a href="#s3.4.2">section 3.4.2</a><br /> 500&#160; <span class="term">id</span>&#160;attribute value checks; see <a href="#s3.4.2">section 3.4.2</a><br />
501<br /> 501<br />
502&#160; <span class="term">0</span>&#160;- no<br /> 502&#160; <span class="term">0</span>&#160;- no<br />
503&#160; <span class="term">1</span>&#160;- remove duplicate and/or invalid ones &#160;*<br /> 503&#160; <span class="term">1</span>&#160;- remove duplicate and/or invalid ones &#160;*<br />
504&#160; <em>word</em>&#160;- remove invalid ones and replace duplicate ones with new and unique ones based on the <em>word</em>; the admin-specified <em>word</em>&#160;cannot contain a space character<br /> 504&#160; <em>word</em>&#160;- remove invalid ones and replace duplicate ones with new and unique ones based on the <em>word</em>; the admin-specified <em>word</em>&#160;cannot contain a space character<br />
505<br /> 505<br />
506&#160; <strong>valid_xhtml</strong><br /> 506&#160; <strong>valid_xhtml</strong><br />
507&#160; Magic parameter to make input the most valid XHTML without needing to specify other relevant <span class="term">$config</span>&#160;parameters; see <a href="#s3.5">section 3.5</a><br /> 507&#160; Magic parameter to make input the most valid XHTML without needing to specify other relevant <span class="term">$config</span>&#160;parameters; see <a href="#s3.5">section 3.5</a><br />
508<br /> 508<br />
509&#160; <span class="term">0</span>&#160;- no &#160;*<br /> 509&#160; <span class="term">0</span>&#160;- no &#160;*<br />
510&#160; <span class="term">1</span>&#160;- will auto-adjust other relevant <span class="term">$config</span>&#160;parameters (indicated by <span class="term">~</span>&#160;in this list)<br /> 510&#160; <span class="term">1</span>&#160;- will auto-adjust other relevant <span class="term">$config</span>&#160;parameters (indicated by <span class="term">~</span>&#160;in this list)<br />
511<br /> 511<br />
512&#160; <strong>xml:lang</strong><br /> 512&#160; <strong>xml:lang</strong><br />
513&#160; Auto-add <span class="term">xml&#58;lang</span>&#160;attribute; see <a href="#s3.4.1">section 3.4.1</a><br /> 513&#160; Auto-add <span class="term">xml&#58;lang</span>&#160;attribute; see <a href="#s3.4.1">section 3.4.1</a><br />
514<br /> 514<br />
515&#160; <span class="term">0</span>&#160;- no &#160;*<br /> 515&#160; <span class="term">0</span>&#160;- no &#160;*<br />
516&#160; <span class="term">1</span>&#160;- add if <span class="term">lang</span>&#160;attribute is present<br /> 516&#160; <span class="term">1</span>&#160;- add if <span class="term">lang</span>&#160;attribute is present<br />
517&#160; <span class="term">2</span>&#160;- add if <span class="term">lang</span>&#160;attribute is present, and remove <span class="term">lang</span>&#160; ~<br /> 517&#160; <span class="term">2</span>&#160;- add if <span class="term">lang</span>&#160;attribute is present, and remove <span class="term">lang</span>&#160; ~<br />
518 518
519</div> 519</div>
520<div class="sub-section"><h3> 520<div class="sub-section"><h3>
521<a name="s2.3" id="s2.3"></a><span class="item-no">2.3</span>&#160; Extra HTML specifications using the $spec parameter 521<a name="s2.3" id="s2.3"></a><span class="item-no">2.3</span>&#160; Extra HTML specifications using the $spec parameter
522</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 522</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
523<br /> 523<br />
524&#160; The <span class="term">$spec</span>&#160;argument of htmLawed can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policies. <span class="term">$spec</span>&#160;is specified as a string of text containing one or more <em>rules</em>, with multiple rules separated from each other by a semi-colon (<span class="term">;</span>). E.g.,<br /> 524&#160; The <span class="term">$spec</span>&#160;argument of htmLawed can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policies. <span class="term">$spec</span>&#160;is specified as a string of text containing one or more <em>rules</em>, with multiple rules separated from each other by a semi-colon (<span class="term">;</span>). E.g.,<br />
525<br /> 525<br />
526 526
527<code class="code">&#160; &#160; $spec = &#39;i=-&#42;; td, tr=style, id, -&#42;; a=id(match="/[a-z][a-z\d.&#58;\-&#96;"]&#42;/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt&#39;;</code> 527<code class="code">&#160; &#160; $spec = &#39;i=-&#42;; td, tr=style, id, -&#42;; a=id(match="/[a-z][a-z\d.&#58;\-&#96;"]&#42;/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt&#39;;</code>
528<br /> 528<br />
529 529
530<code class="code">&#160; &#160; $processed = htmLawed($text, $config, $spec);</code> 530<code class="code">&#160; &#160; $processed = htmLawed($text, $config, $spec);</code>
531<br /> 531<br />
532<br /> 532<br />
533&#160; Or,<br /> 533&#160; Or,<br />
534<br /> 534<br />
535 535
536<code class="code">&#160; &#160; $processed = htmLawed($text, $config, &#39;i=-&#42;; td, tr=style, id, -&#42;; a=id(match="/[a-z][a-z\d.&#58;\-&#96;"]&#42;/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt&#39;);</code> 536<code class="code">&#160; &#160; $processed = htmLawed($text, $config, &#39;i=-&#42;; td, tr=style, id, -&#42;; a=id(match="/[a-z][a-z\d.&#58;\-&#96;"]&#42;/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt&#39;);</code>
537<br /> 537<br />
538<br /> 538<br />
539&#160; A rule begins with one or more HTML <strong>element</strong>&#160;names (the <em>rule-element</em>) to which the rule applies, followed by an equals (=) sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., <span class="term">th,td,tr=</span>.<br /> 539&#160; A rule begins with one or more HTML <strong>element</strong>&#160;names (the <em>rule-element</em>) to which the rule applies, followed by an equals (=) sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., <span class="term">th,td,tr=</span>.<br />
540<br /> 540<br />
541&#160; The rest of the rule consists of comma-separated HTML <strong>attribute names</strong>. A minus (-) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., <span class="term">-width</span>. To deny all attributes, <span class="term">-&#42;</span>&#160;can be used.<br /> 541&#160; The rest of the rule consists of comma-separated HTML <strong>attribute names</strong>. A minus (-) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., <span class="term">-width</span>. To deny all attributes, <span class="term">-&#42;</span>&#160;can be used.<br />
542<br /> 542<br />
543&#160; The following are examples of rule excerpts with rule-element <span class="term">a</span>&#160;and the attributes that are being permitted:<br /> 543&#160; The following are examples of rule excerpts with rule-element <span class="term">a</span>&#160;and the attributes that are being permitted:<br />
544<br /> 544<br />
545&#160; * &#160;<span class="term">a=</span>&#160;- all<br /> 545&#160; * &#160;<span class="term">a=</span>&#160;- all<br />
546&#160; * &#160;<span class="term">a=id</span>&#160;- all<br /> 546&#160; * &#160;<span class="term">a=id</span>&#160;- all<br />
547&#160; * &#160;<span class="term">a=href, title, -id, -onclick</span>&#160;- all except <span class="term">id</span>&#160;and <span class="term">onclick</span><br /> 547&#160; * &#160;<span class="term">a=href, title, -id, -onclick</span>&#160;- all except <span class="term">id</span>&#160;and <span class="term">onclick</span><br />
548&#160; * &#160;<span class="term">a=&#42;, id, -id</span>&#160;- all except <span class="term">id</span><br /> 548&#160; * &#160;<span class="term">a=&#42;, id, -id</span>&#160;- all except <span class="term">id</span><br />
549&#160; * &#160;<span class="term">a=-&#42;</span>&#160;- none<br /> 549&#160; * &#160;<span class="term">a=-&#42;</span>&#160;- none<br />
550&#160; * &#160;<span class="term">a=-&#42;, href, title</span>&#160;- none except <span class="term">href</span>&#160;and <span class="term">title</span><br /> 550&#160; * &#160;<span class="term">a=-&#42;, href, title</span>&#160;- none except <span class="term">href</span>&#160;and <span class="term">title</span><br />
551&#160; * &#160;<span class="term">a=-&#42;, -id, href, title</span>&#160;- none except <span class="term">href</span>&#160;and <span class="term">title</span><br /> 551&#160; * &#160;<span class="term">a=-&#42;, -id, href, title</span>&#160;- none except <span class="term">href</span>&#160;and <span class="term">title</span><br />
552<br /> 552<br />
553&#160; Rules regarding <strong>attribute values</strong>&#160;are optionally specified inside round brackets after attribute names in solidus (/)-separated <em>parameter = value</em>&#160;pairs. E.g., <span class="term">title(maxlen=30/minlen=5)</span>. None or one or more of the following parameters may be specified:<br /> 553&#160; Rules regarding <strong>attribute values</strong>&#160;are optionally specified inside round brackets after attribute names in solidus (/)-separated <em>parameter = value</em>&#160;pairs. E.g., <span class="term">title(maxlen=30/minlen=5)</span>. None or one or more of the following parameters may be specified:<br />
554<br /> 554<br />
555&#160; * &#160;<span class="term">oneof</span>&#160;- one or more choices separated by <span class="term">|</span>&#160;that the value should match; if only one choice is provided, then the value must match that choice; matching is case-sensitive<br /> 555&#160; * &#160;<span class="term">oneof</span>&#160;- one or more choices separated by <span class="term">|</span>&#160;that the value should match; if only one choice is provided, then the value must match that choice; matching is case-sensitive<br />
556<br /> 556<br />
557&#160; * &#160;<span class="term">noneof</span>&#160;- one or more choices separated by <span class="term">|</span>&#160;that the value should not match; matching is case-sensitive<br /> 557&#160; * &#160;<span class="term">noneof</span>&#160;- one or more choices separated by <span class="term">|</span>&#160;that the value should not match; matching is case-sensitive<br />
558<br /> 558<br />
559&#160; * &#160;<span class="term">maxlen</span>&#160;and <span class="term">minlen</span>&#160;- upper and lower limits for the number of characters in the attribute value; specified in numbers<br /> 559&#160; * &#160;<span class="term">maxlen</span>&#160;and <span class="term">minlen</span>&#160;- upper and lower limits for the number of characters in the attribute value; specified in numbers<br />
560<br /> 560<br />
561&#160; * &#160;<span class="term">maxval</span>&#160;and <span class="term">minval</span>&#160;- upper and lower limits for the numerical value specified in the attribute value; specified in numbers<br /> 561&#160; * &#160;<span class="term">maxval</span>&#160;and <span class="term">minval</span>&#160;- upper and lower limits for the numerical value specified in the attribute value; specified in numbers<br />
562<br /> 562<br />
563&#160; * &#160;<span class="term">match</span>&#160;and <span class="term">nomatch</span>&#160;- pattern that the attribute value should or should not match; specified as PHP/PCRE-compatible regular expressions with delimiters and possibly modifiers (e.g., to specify case-sensitivity for matching)<br /> 563&#160; * &#160;<span class="term">match</span>&#160;and <span class="term">nomatch</span>&#160;- pattern that the attribute value should or should not match; specified as PHP/PCRE-compatible regular expressions with delimiters and possibly modifiers (e.g., to specify case-sensitivity for matching)<br />
564<br /> 564<br />
565&#160; * &#160;<span class="term">default</span>&#160;- a value to force on the attribute if the value provided by the writer does not fit any of the specified parameters<br /> 565&#160; * &#160;<span class="term">default</span>&#160;- a value to force on the attribute if the value provided by the writer does not fit any of the specified parameters<br />
566<br /> 566<br />
567&#160; If <span class="term">default</span>&#160;is not set and the attribute value does not satisfy any of the specified parameters, then the attribute is removed. The <span class="term">default</span>&#160;value can also be used to force all attribute declarations to take the same value (by getting the values declared illegal by setting, e.g., <span class="term">maxlen</span>&#160;to <span class="term">-1</span>).<br /> 567&#160; If <span class="term">default</span>&#160;is not set and the attribute value does not satisfy any of the specified parameters, then the attribute is removed. The <span class="term">default</span>&#160;value can also be used to force all attribute declarations to take the same value (by getting the values declared illegal by setting, e.g., <span class="term">maxlen</span>&#160;to <span class="term">-1</span>).<br />
568<br /> 568<br />
569&#160; Examples with <em>input</em>&#160;<span class="term">&lt;input title="WIDTH" value="10em" /&gt;&lt;input title="length" value="5" class="ic1 ic2" /&gt;</span>&#160;are shown below.<br /> 569&#160; Examples with <em>input</em>&#160;<span class="term">&lt;input title="WIDTH" value="10em" /&gt;&lt;input title="length" value="5" class="ic1 ic2" /&gt;</span>&#160;are shown below.<br />
570<br /> 570<br />
571&#160; <em>Rule</em>: <span class="term">input=title(maxlen=60/minlen=6), value</span><br /> 571&#160; <em>Rule</em>: <span class="term">input=title(maxlen=60/minlen=6), value</span><br />
572&#160; <em>Output</em>: <span class="term">&lt;input value="10em" /&gt;&lt;input title="length" value="5" class="ic1 ic2" /&gt;</span><br /> 572&#160; <em>Output</em>: <span class="term">&lt;input value="10em" /&gt;&lt;input title="length" value="5" class="ic1 ic2" /&gt;</span><br />
573<br /> 573<br />
574&#160; <em>Rule</em>: <span class="term">input=title(), value(maxval=8/default=6)</span><br /> 574&#160; <em>Rule</em>: <span class="term">input=title(), value(maxval=8/default=6)</span><br />
575&#160; <em>Output</em>: <span class="term">&lt;input title="WIDTH" value="6" /&gt;&lt;input title="length" value="5" class="ic1 ic2" /&gt;</span><br /> 575&#160; <em>Output</em>: <span class="term">&lt;input title="WIDTH" value="6" /&gt;&lt;input title="length" value="5" class="ic1 ic2" /&gt;</span><br />
576<br /> 576<br />
577&#160; <em>Rule</em>: <span class="term">input=title(nomatch=%w.d%i), value(match=%em%/default=6em)</span><br /> 577&#160; <em>Rule</em>: <span class="term">input=title(nomatch=%w.d%i), value(match=%em%/default=6em)</span><br />
578&#160; <em>Output</em>: <span class="term">&lt;input value="10em" /&gt;&lt;input title="length" value="6em" class="ic1 ic2" /&gt;</span><br /> 578&#160; <em>Output</em>: <span class="term">&lt;input value="10em" /&gt;&lt;input title="length" value="6em" class="ic1 ic2" /&gt;</span><br />
579<br /> 579<br />
580&#160; <em>Rule</em>: <span class="term">input=class(noneof=ic2|ic3/oneof=ic1|ic4), title(oneof=height|depth/default=depth), value(noneof=5|6)</span><br /> 580&#160; <em>Rule</em>: <span class="term">input=class(noneof=ic2|ic3/oneof=ic1|ic4), title(oneof=height|depth/default=depth), value(noneof=5|6)</span><br />
581&#160; <em>Output</em>: <span class="term">&lt;input title="depth" value="10em" /&gt;&lt;input title="depth" class="ic1" /&gt;</span><br /> 581&#160; <em>Output</em>: <span class="term">&lt;input title="depth" value="10em" /&gt;&lt;input title="depth" class="ic1" /&gt;</span><br />
582<br /> 582<br />
583&#160; <strong>Special characters</strong>: The characters <span class="term">;</span>, <span class="term">,</span>, <span class="term">/</span>, <span class="term">(</span>, <span class="term">)</span>, <span class="term">|</span>, <span class="term">~</span>&#160;and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be <em>escaped</em>&#160;by enclosing in pairs of double-quotes (<span class="term">"</span>). A back-tick (<span class="term">&#96;</span>) can be used to escape a literal <span class="term">"</span>. An example rule illustrating this is <span class="term">input=value(maxlen=30/match="/^\w/"/default="your &#96;"ID&#96;"")</span>.<br /> 583&#160; <strong>Special characters</strong>: The characters <span class="term">;</span>, <span class="term">,</span>, <span class="term">/</span>, <span class="term">(</span>, <span class="term">)</span>, <span class="term">|</span>, <span class="term">~</span>&#160;and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be <em>escaped</em>&#160;by enclosing in pairs of double-quotes (<span class="term">"</span>). A back-tick (<span class="term">&#96;</span>) can be used to escape a literal <span class="term">"</span>. An example rule illustrating this is <span class="term">input=value(maxlen=30/match="/^\w/"/default="your &#96;"ID&#96;"")</span>.<br />
584<br /> 584<br />
585&#160; <strong>Attributes that accept multiple values</strong>: If an attribute is <span class="term">accesskey</span>, <span class="term">class</span>, <span class="term">itemtype</span>&#160;or <span class="term">rel</span>, which can have multiple, space-separated values, or <span class="term">srcset</span>, which can have multiple, comma-separated values, htmLawed will parse the attribute value for such multiple values and will individually test each of them.<br /> 585&#160; <strong>Attributes that accept multiple values</strong>: If an attribute is <span class="term">accesskey</span>, <span class="term">class</span>, <span class="term">itemtype</span>&#160;or <span class="term">rel</span>, which can have multiple, space-separated values, or <span class="term">srcset</span>, which can have multiple, comma-separated values, htmLawed will parse the attribute value for such multiple values and will individually test each of them.<br />
586<br /> 586<br />
587&#160; <strong>Note</strong>: To deny an attribute for all elements for which it is legal, <span class="term">$config["deny_attribute"]</span>&#160;(see <a href="#s3.4">section 3.4</a>) can be used instead of <span class="term">$spec</span>. Also, attributes can be allowed element-specifically through <span class="term">$spec</span>&#160;while being denied globally through <span class="term">$config["deny_attribute"]</span>. The <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) can also be possibly used to implement a functionality like that achieved using <span class="term">$spec</span>&#160;functionality.<br /> 587&#160; <strong>Note</strong>: To deny an attribute for all elements for which it is legal, <span class="term">$config["deny_attribute"]</span>&#160;(see <a href="#s3.4">section 3.4</a>) can be used instead of <span class="term">$spec</span>. Also, attributes can be allowed element-specifically through <span class="term">$spec</span>&#160;while being denied globally through <span class="term">$config["deny_attribute"]</span>. The <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) can also be possibly used to implement a functionality like that achieved using <span class="term">$spec</span>&#160;functionality.<br />
588<br /> 588<br />
589&#160; <strong>Note</strong>: Attributes' specifications for an element may be set through multiple rules. In case of conflict, the attribute specification in the first rule will get precedence.<br /> 589&#160; <strong>Note</strong>: Attributes' specifications for an element may be set through multiple rules. In case of conflict, the attribute specification in the first rule will get precedence.<br />
590<br /> 590<br />
591&#160; <span class="term">$spec</span>&#160;can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. Thus, the following value of <span class="term">$spec</span>&#160;will permit the custom uses of the standard <span class="term">rel</span>&#160;attribute in <span class="term">input</span>&#160;(not permitted as per standards) and of a non-standard attribute, <span class="term">vFlag</span>, in <span class="term">img</span>.<br /> 591&#160; <span class="term">$spec</span>&#160;can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. Thus, the following value of <span class="term">$spec</span>&#160;will permit the custom uses of the standard <span class="term">rel</span>&#160;attribute in <span class="term">input</span>&#160;(not permitted as per standards) and of a non-standard attribute, <span class="term">vFlag</span>, in <span class="term">img</span>.<br />
592<br /> 592<br />
593 593
594<code class="code">&#160; &#160; $spec = &#39;img=vFlag; input=rel&#39;</code> 594<code class="code">&#160; &#160; $spec = &#39;img=vFlag; input=rel&#39;</code>
595<br /> 595<br />
596<br /> 596<br />
597&#160; The attribute names must begin with a letter and cannot contain space, equal-to (=) or solidus (/) characters.<br /> 597&#160; The attribute names must begin with a letter and cannot contain space, equal-to (=) or solidus (/) characters.<br />
598 598
599</div> 599</div>
600<div class="sub-section"><h3> 600<div class="sub-section"><h3>
601<a name="s2.4" id="s2.4"></a><span class="item-no">2.4</span>&#160; Performance time &amp; memory usage 601<a name="s2.4" id="s2.4"></a><span class="item-no">2.4</span>&#160; Performance time &amp; memory usage
602</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 602</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
603<br /> 603<br />
604&#160; The time and memory consumed during text processing by htmLawed depend on its configuration, the size of the input, and the amount, nestedness and well-formedness of the HTML markup within the input. In particular, tag balancing and beautification each can increase the processing time by about a quarter.<br /> 604&#160; The time and memory consumed during text processing by htmLawed depend on its configuration, the size of the input, and the amount, nestedness and well-formedness of the HTML markup within the input. In particular, tag balancing and beautification each can increase the processing time by about a quarter.<br />
605<br /> 605<br />
606&#160; The htmLawed <a href="htmLawedTest.php">demo</a>&#160;can be used to evaluate the performance and effects of different types of input and <span class="term">$config</span>.<br /> 606&#160; The htmLawed <a href="htmLawedTest.php">demo</a>&#160;can be used to evaluate the performance and effects of different types of input and <span class="term">$config</span>.<br />
607 607
608</div> 608</div>
609<div class="sub-section"><h3> 609<div class="sub-section"><h3>
610<a name="s2.5" id="s2.5"></a><span class="item-no">2.5</span>&#160; Some security risks to keep in mind 610<a name="s2.5" id="s2.5"></a><span class="item-no">2.5</span>&#160; Some security risks to keep in mind
611</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 611</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
612<br /> 612<br />
613&#160; When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially <em>dangerous</em>&#160;HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. For example, the following increase security risks:<br /> 613&#160; When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially <em>dangerous</em>&#160;HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. For example, the following increase security risks:<br />
614<br /> 614<br />
615&#160; * &#160;Allowing <span class="term">script</span>, <span class="term">applet</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">canvas</span>, <span class="term">audio</span>, <span class="term">video</span>&#160;or <span class="term">object</span>&#160;elements, or certain of their attributes like <span class="term">allowscriptaccess</span><br /> 615&#160; * &#160;Allowing <span class="term">script</span>, <span class="term">applet</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">canvas</span>, <span class="term">audio</span>, <span class="term">video</span>&#160;or <span class="term">object</span>&#160;elements, or certain of their attributes like <span class="term">allowscriptaccess</span><br />
616<br /> 616<br />
617&#160; * &#160;Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., <span class="term">&lt;!--[if gte IE 4]&gt;&lt;script&gt;alert("xss");&lt;/script&gt;&lt;![endif]--&gt;</span>)<br /> 617&#160; * &#160;Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., <span class="term">&lt;!--[if gte IE 4]&gt;&lt;script&gt;alert("xss");&lt;/script&gt;&lt;![endif]--&gt;</span>)<br />
618<br /> 618<br />
619&#160; * &#160;Allowing dynamic CSS expressions (some Internet Explorer versions are vulnerable)<br /> 619&#160; * &#160;Allowing dynamic CSS expressions (some Internet Explorer versions are vulnerable)<br />
620<br /> 620<br />
621&#160; * &#160;Allowing the <span class="term">style</span>&#160;attribute<br /> 621&#160; * &#160;Allowing the <span class="term">style</span>&#160;attribute<br />
622<br /> 622<br />
623&#160; To remove <em>unsecure</em>&#160;HTML, code-developers using htmLawed must set <span class="term">$config</span>&#160;appropriately. E.g., <span class="term">$config["elements"] = "&#42; -script"</span>&#160;to deny the <span class="term">script</span>&#160;element (<a href="#s3.3">section 3.3</a>), <span class="term">$config["safe"] = 1</span>&#160;to auto-configure certain htmLawed parameters for maximizing security (<a href="#s3.6">section 3.6</a>), etc.<br /> 623&#160; To remove <em>unsecure</em>&#160;HTML, code-developers using htmLawed must set <span class="term">$config</span>&#160;appropriately. E.g., <span class="term">$config["elements"] = "&#42; -script"</span>&#160;to deny the <span class="term">script</span>&#160;element (<a href="#s3.3">section 3.3</a>), <span class="term">$config["safe"] = 1</span>&#160;to auto-configure certain htmLawed parameters for maximizing security (<a href="#s3.6">section 3.6</a>), etc.<br />
624<br /> 624<br />
625&#160; Permitting the <span class="term">style</span>&#160;attribute brings in risks of <em>click-jacking</em>, <em>phishing</em>, web-page overlays, etc., <em>even</em>&#160;when the <span class="term">safe</span>&#160;parameter is enabled (see <a href="#s3.6">section 3.6</a>). Except for URLs and a few other things like CSS dynamic expressions, htmLawed currently does not check every CSS style property. It does provide ways for the code-developer implementing htmLawed to do such checks through htmLawed's <span class="term">$spec</span>&#160;argument, and through the <span class="term">hook_tag</span>&#160;parameter (see <a href="#s3.4.8">section 3.4.8</a>&#160;for more). Disallowing <span class="term">style</span>&#160;completely and relying on CSS classes and stylesheet files is recommended.<br /> 625&#160; Permitting the <span class="term">style</span>&#160;attribute brings in risks of <em>click-jacking</em>, <em>phishing</em>, web-page overlays, etc., <em>even</em>&#160;when the <span class="term">safe</span>&#160;parameter is enabled (see <a href="#s3.6">section 3.6</a>). Except for URLs and a few other things like CSS dynamic expressions, htmLawed currently does not check every CSS style property. It does provide ways for the code-developer implementing htmLawed to do such checks through htmLawed's <span class="term">$spec</span>&#160;argument, and through the <span class="term">hook_tag</span>&#160;parameter (see <a href="#s3.4.8">section 3.4.8</a>&#160;for more). Disallowing <span class="term">style</span>&#160;completely and relying on CSS classes and stylesheet files is recommended.<br />
626<br /> 626<br />
627&#160; htmLawed does not check or correct the character <strong>encoding</strong>&#160;of the input it receives. In conjunction with permissive circumstances, such as when the character encoding is left undefined through HTTP headers or HTML <span class="term">meta</span>&#160;tags, this can allow for an exploit (like Google's <em>UTF-7/XSS</em>&#160;vulnerability of the past).<br /> 627&#160; htmLawed does not check or correct the character <strong>encoding</strong>&#160;of the input it receives. In conjunction with permissive circumstances, such as when the character encoding is left undefined through HTTP headers or HTML <span class="term">meta</span>&#160;tags, this can allow for an exploit (like Google's <em>UTF-7/XSS</em>&#160;vulnerability of the past).<br />
628<br /> 628<br />
629&#160; Occasionally, though very rarely, the default settings with which htmLawed runs may change between different versions of htmLawed. Admins should keep this in mind when upgrading htmLawed. Important changes in htmLawed's default behavior in new releases of the software are noted in <a href="#s4.5">section 4.5</a>&#160;on upgrades.<br /> 629&#160; Occasionally, though very rarely, the default settings with which htmLawed runs may change between different versions of htmLawed. Admins should keep this in mind when upgrading htmLawed. Important changes in htmLawed's default behavior in new releases of the software are noted in <a href="#s4.5">section 4.5</a>&#160;on upgrades.<br />
630 630
631</div> 631</div>
632<div class="sub-section"><h3> 632<div class="sub-section"><h3>
633<a name="s2.6" id="s2.6"></a><span class="item-no">2.6</span>&#160; Use with <span class="term">kses()</span>&#160;code 633<a name="s2.6" id="s2.6"></a><span class="item-no">2.6</span>&#160; Use with <span class="term">kses()</span>&#160;code
634</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 634</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
635<br /> 635<br />
636&#160; The <span class="term">Kses</span>&#160;PHP script for HTML filtering is used by many applications (like <span class="term">WordPress</span>, as in year 2012). It is possible to have such applications use htmLawed instead, since it is compatible with code that calls the <span class="term">kses()</span>&#160;function declared in the <span class="term">Kses</span>&#160;file (usually named <span class="term">kses.php</span>). E.g., application code like this will continue to work after replacing <span class="term">Kses</span>&#160;with htmLawed:<br /> 636&#160; The <span class="term">Kses</span>&#160;PHP script for HTML filtering is used by many applications (like <span class="term">WordPress</span>, as in year 2012). It is possible to have such applications use htmLawed instead, since it is compatible with code that calls the <span class="term">kses()</span>&#160;function declared in the <span class="term">Kses</span>&#160;file (usually named <span class="term">kses.php</span>). E.g., application code like this will continue to work after replacing <span class="term">Kses</span>&#160;with htmLawed:<br />
637<br /> 637<br />
638 638
639<code class="code">&#160; &#160; $comment_filtered = kses($comment_input, array(&#39;a&#39;=&gt;array(), &#39;b&#39;=&gt;array(), &#39;i&#39;=&gt;array()));</code> 639<code class="code">&#160; &#160; $comment_filtered = kses($comment_input, array(&#39;a&#39;=&gt;array(), &#39;b&#39;=&gt;array(), &#39;i&#39;=&gt;array()));</code>
640<br /> 640<br />
641<br /> 641<br />
642&#160; If the application uses a <span class="term">Kses</span>&#160;file that has the <span class="term">kses()</span>&#160;function declared, then, to have the application use htmLawed instead of <span class="term">Kses</span>, rename <span class="term">htmLawed.php</span>&#160;(to <span class="term">kses.php</span>, e.g.) and replace the <span class="term">Kses</span>&#160;file (or just replace the code in the <span class="term">Kses</span>&#160;file with the htmLawed code). If the <span class="term">kses()</span>&#160;function in the <span class="term">Kses</span>&#160;file had been renamed by the application developer (e.g., in <span class="term">WordPress</span>, it is named <span class="term">wp_kses()</span>), then appropriately rename the <span class="term">kses()</span>&#160;function in the htmLawed code. Then, add the following code (which was a part of htmLawed prior to version 1.2):<br /> 642&#160; If the application uses a <span class="term">Kses</span>&#160;file that has the <span class="term">kses()</span>&#160;function declared, then, to have the application use htmLawed instead of <span class="term">Kses</span>, rename <span class="term">htmLawed.php</span>&#160;(to <span class="term">kses.php</span>, e.g.) and replace the <span class="term">Kses</span>&#160;file (or just replace the code in the <span class="term">Kses</span>&#160;file with the htmLawed code). If the <span class="term">kses()</span>&#160;function in the <span class="term">Kses</span>&#160;file had been renamed by the application developer (e.g., in <span class="term">WordPress</span>, it is named <span class="term">wp_kses()</span>), then appropriately rename the <span class="term">kses()</span>&#160;function in the htmLawed code. Then, add the following code (which was a part of htmLawed prior to version 1.2):<br />
643<br /> 643<br />
644 644
645<code class="code">&#160; &#160; // kses compatibility</code> 645<code class="code">&#160; &#160; // kses compatibility</code>
646<br /> 646<br />
647 647
648<code class="code">&#160; &#160; function kses($t, $h, $p=array(&#39;http&#39;, &#39;https&#39;, &#39;ftp&#39;, &#39;news&#39;, &#39;nntp&#39;, &#39;telnet&#39;, &#39;gopher&#39;, &#39;mailto&#39;)){</code> 648<code class="code">&#160; &#160; function kses($t, $h, $p=array(&#39;http&#39;, &#39;https&#39;, &#39;ftp&#39;, &#39;news&#39;, &#39;nntp&#39;, &#39;telnet&#39;, &#39;gopher&#39;, &#39;mailto&#39;)){</code>
649<br /> 649<br />
650 650
651<code class="code">&#160; &#160; &#160;foreach($h as $k=&gt;$v){</code> 651<code class="code">&#160; &#160; &#160;foreach($h as $k=&gt;$v){</code>
652<br /> 652<br />
653 653
654<code class="code">&#160; &#160; &#160; $h[$k][&#39;n&#39;][&#39;&#42;&#39;] = 1;</code> 654<code class="code">&#160; &#160; &#160; $h[$k][&#39;n&#39;][&#39;&#42;&#39;] = 1;</code>
655<br /> 655<br />
656 656
657<code class="code">&#160; &#160; &#160;}</code> 657<code class="code">&#160; &#160; &#160;}</code>
658<br /> 658<br />
659 659
660<code class="code">&#160; &#160; &#160;$C[&#39;cdata&#39;] = $C[&#39;comment&#39;] = $C[&#39;make_tag_strict&#39;] = $C[&#39;no_deprecated_attr&#39;] = $C[&#39;unique_ids&#39;] = 0;</code> 660<code class="code">&#160; &#160; &#160;$C[&#39;cdata&#39;] = $C[&#39;comment&#39;] = $C[&#39;make_tag_strict&#39;] = $C[&#39;no_deprecated_attr&#39;] = $C[&#39;unique_ids&#39;] = 0;</code>
661<br /> 661<br />
662 662
663<code class="code">&#160; &#160; &#160;$C[&#39;keep_bad&#39;] = 1;</code> 663<code class="code">&#160; &#160; &#160;$C[&#39;keep_bad&#39;] = 1;</code>
664<br /> 664<br />
665 665
666<code class="code">&#160; &#160; &#160;$C[&#39;elements&#39;] = count($h) ? strtolower(implode(&#39;,&#39;, array_keys($h))) &#58; &#39;-&#42;&#39;;</code> 666<code class="code">&#160; &#160; &#160;$C[&#39;elements&#39;] = count($h) ? strtolower(implode(&#39;,&#39;, array_keys($h))) &#58; &#39;-&#42;&#39;;</code>
667<br /> 667<br />
668 668
669<code class="code">&#160; &#160; &#160;$C[&#39;hook&#39;] = &#39;kses_hook&#39;;</code> 669<code class="code">&#160; &#160; &#160;$C[&#39;hook&#39;] = &#39;kses_hook&#39;;</code>
670<br /> 670<br />
671 671
672<code class="code">&#160; &#160; &#160;$C[&#39;schemes&#39;] = &#39;&#42;&#58;&#39;. implode(&#39;,&#39;, $p);</code> 672<code class="code">&#160; &#160; &#160;$C[&#39;schemes&#39;] = &#39;&#42;&#58;&#39;. implode(&#39;,&#39;, $p);</code>
673<br /> 673<br />
674 674
675<code class="code">&#160; &#160; &#160;return htmLawed($t, $C, $h);</code> 675<code class="code">&#160; &#160; &#160;return htmLawed($t, $C, $h);</code>
676<br /> 676<br />
677 677
678<code class="code">&#160; &#160; &#160;}</code> 678<code class="code">&#160; &#160; &#160;}</code>
679<br /> 679<br />
680<br /> 680<br />
681 681
682<code class="code">&#160; &#160; function kses_hook($t, &amp;$C, &amp;$S){</code> 682<code class="code">&#160; &#160; function kses_hook($t, &amp;$C, &amp;$S){</code>
683<br /> 683<br />
684 684
685<code class="code">&#160; &#160; &#160;return $t;</code> 685<code class="code">&#160; &#160; &#160;return $t;</code>
686<br /> 686<br />
687 687
688<code class="code">&#160; &#160; }</code> 688<code class="code">&#160; &#160; }</code>
689<br /> 689<br />
690<br /> 690<br />
691&#160; If the <span class="term">Kses</span>&#160;file used by the application has been significantly altered by the application developers, then one may need a different approach. E.g., with <span class="term">WordPress</span>&#160;(as in the year 2012), it is best to copy the htmLawed code, along with the above-mentioned additions, to <span class="term">wp_includes/kses.php</span>, rename the newly added function <span class="term">kses()</span>&#160;to <span class="term">wp_kses()</span>, and delete the code for the original <span class="term">wp_kses()</span>&#160;function.<br /> 691&#160; If the <span class="term">Kses</span>&#160;file used by the application has been significantly altered by the application developers, then one may need a different approach. E.g., with <span class="term">WordPress</span>&#160;(as in the year 2012), it is best to copy the htmLawed code, along with the above-mentioned additions, to <span class="term">wp_includes/kses.php</span>, rename the newly added function <span class="term">kses()</span>&#160;to <span class="term">wp_kses()</span>, and delete the code for the original <span class="term">wp_kses()</span>&#160;function.<br />
692<br /> 692<br />
693&#160; If the <span class="term">Kses</span>&#160;code has a non-empty hook function (e.g., <span class="term">wp_kses_hook()</span>&#160;in case of <span class="term">WordPress</span>), then the code for htmLawed's <span class="term">kses_hook()</span>&#160;function should be appropriately edited. However, the requirement of the hook function should be re-evaluated considering that htmLawed has extra capabilities. With <span class="term">WordPress</span>, the hook function is an essential one. The following code is suggested for the htmLawed <span class="term">kses_hook()</span>&#160;in case of <span class="term">WordPress</span>:<br /> 693&#160; If the <span class="term">Kses</span>&#160;code has a non-empty hook function (e.g., <span class="term">wp_kses_hook()</span>&#160;in case of <span class="term">WordPress</span>), then the code for htmLawed's <span class="term">kses_hook()</span>&#160;function should be appropriately edited. However, the requirement of the hook function should be re-evaluated considering that htmLawed has extra capabilities. With <span class="term">WordPress</span>, the hook function is an essential one. The following code is suggested for the htmLawed <span class="term">kses_hook()</span>&#160;in case of <span class="term">WordPress</span>:<br />
694<br /> 694<br />
695 695
696<code class="code">&#160; &#160; // kses compatibility</code> 696<code class="code">&#160; &#160; // kses compatibility</code>
697<br /> 697<br />
698 698
699<code class="code">&#160; &#160; function kses_hook($string, &amp;$cf, &amp;$spec){</code> 699<code class="code">&#160; &#160; function kses_hook($string, &amp;$cf, &amp;$spec){</code>
700<br /> 700<br />
701 701
702<code class="code">&#160; &#160; &#160;$allowed_html = $spec;</code> 702<code class="code">&#160; &#160; &#160;$allowed_html = $spec;</code>
703<br /> 703<br />
704 704
705<code class="code">&#160; &#160; &#160;$allowed_protocols = array();</code> 705<code class="code">&#160; &#160; &#160;$allowed_protocols = array();</code>
706<br /> 706<br />
707 707
708<code class="code">&#160; &#160; &#160;foreach($cf[&#39;schemes&#39;] as $v){</code> 708<code class="code">&#160; &#160; &#160;foreach($cf[&#39;schemes&#39;] as $v){</code>
709<br /> 709<br />
710 710
711<code class="code">&#160; &#160; &#160; foreach($v as $k2=&gt;$v2){</code> 711<code class="code">&#160; &#160; &#160; foreach($v as $k2=&gt;$v2){</code>
712<br /> 712<br />
713 713
714<code class="code">&#160; &#160; &#160; &#160;if(!in_array($k2, $allowed_protocols)){</code> 714<code class="code">&#160; &#160; &#160; &#160;if(!in_array($k2, $allowed_protocols)){</code>
715<br /> 715<br />
716 716
717<code class="code">&#160; &#160; &#160; &#160; $allowed_protocols[] = $k2;</code> 717<code class="code">&#160; &#160; &#160; &#160; $allowed_protocols[] = $k2;</code>
718<br /> 718<br />
719 719
720<code class="code">&#160; &#160; &#160; &#160;}</code> 720<code class="code">&#160; &#160; &#160; &#160;}</code>
721<br /> 721<br />
722 722
723<code class="code">&#160; &#160; &#160; }</code> 723<code class="code">&#160; &#160; &#160; }</code>
724<br /> 724<br />
725 725
726<code class="code">&#160; &#160; &#160;}</code> 726<code class="code">&#160; &#160; &#160;}</code>
727<br /> 727<br />
728 728
729<code class="code">&#160; &#160; &#160;return wp_kses_hook($string, $allowed_html, $allowed_protocols);</code> 729<code class="code">&#160; &#160; &#160;return wp_kses_hook($string, $allowed_html, $allowed_protocols);</code>
730<br /> 730<br />
731 731
732<code class="code">&#160; &#160; }</code> 732<code class="code">&#160; &#160; }</code>
733<br /> 733<br />
734 734
735</div> 735</div>
736<div class="sub-section"><h3> 736<div class="sub-section"><h3>
737<a name="s2.7" id="s2.7"></a><span class="item-no">2.7</span>&#160; Tolerance for ill-written HTML 737<a name="s2.7" id="s2.7"></a><span class="item-no">2.7</span>&#160; Tolerance for ill-written HTML
738</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 738</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
739<br /> 739<br />
740&#160; htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be <em>read</em>&#160;as HTML, and may therefore get identified as mere plain text. Following statements indicate the degree of <em>looseness</em>&#160;that htmLawed can work with, and can be provided in instructions to writers:<br /> 740&#160; htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be <em>read</em>&#160;as HTML, and may therefore get identified as mere plain text. Following statements indicate the degree of <em>looseness</em>&#160;that htmLawed can work with, and can be provided in instructions to writers:<br />
741<br /> 741<br />
742&#160; * &#160;Tags must be flanked by <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;with no <span class="term">&gt;</span>&#160;inside -- any needed <span class="term">&gt;</span>&#160;should be put in as <span class="term">&amp;gt;</span>. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and <span class="term">&gt;</span>, like <span class="term">&lt;div &gt;</span>&#160;and <span class="term">&lt;img / &gt;</span>, but not after the <span class="term">&lt;</span>.<br /> 742&#160; * &#160;Tags must be flanked by <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;with no <span class="term">&gt;</span>&#160;inside -- any needed <span class="term">&gt;</span>&#160;should be put in as <span class="term">&amp;gt;</span>. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and <span class="term">&gt;</span>, like <span class="term">&lt;div &gt;</span>&#160;and <span class="term">&lt;img / &gt;</span>, but not after the <span class="term">&lt;</span>.<br />
743<br /> 743<br />
744&#160; * &#160;Element and attribute names need not be lower-cased.<br /> 744&#160; * &#160;Element and attribute names need not be lower-cased.<br />
745<br /> 745<br />
746&#160; * &#160;Attribute string of elements may be liberally spaced with tabs, line-breaks, etc.<br /> 746&#160; * &#160;Attribute string of elements may be liberally spaced with tabs, line-breaks, etc.<br />
747<br /> 747<br />
748&#160; * &#160;Attribute values may be single- and not double-quoted.<br /> 748&#160; * &#160;Attribute values may be single- and not double-quoted.<br />
749<br /> 749<br />
750&#160; * &#160;Left-padding of numeric entities (like, <span class="term">&amp;#0160;</span>, <span class="term">&amp;#x07ff;</span>) with <span class="term">0</span>&#160;is okay as long as the number of characters between the <span class="term">&amp;</span>&#160;and the <span class="term">;</span>&#160;does not exceed 8. All entities must end with <span class="term">;</span>&#160;though.<br /> 750&#160; * &#160;Left-padding of numeric entities (like, <span class="term">&amp;#0160;</span>, <span class="term">&amp;#x07ff;</span>) with <span class="term">0</span>&#160;is okay as long as the number of characters between the <span class="term">&amp;</span>&#160;and the <span class="term">;</span>&#160;does not exceed 8. All entities must end with <span class="term">;</span>&#160;though.<br />
751<br /> 751<br />
752&#160; * &#160;Named character entities must be properly cased. Thus, <span class="term">&amp;Lt;</span>&#160;or <span class="term">&amp;TILDE;</span>&#160;will not be recognized as entities and will be <em>neutralized</em>.<br /> 752&#160; * &#160;Named character entities must be properly cased. Thus, <span class="term">&amp;Lt;</span>&#160;or <span class="term">&amp;TILDE;</span>&#160;will not be recognized as entities and will be <em>neutralized</em>.<br />
753<br /> 753<br />
754&#160; * &#160;HTML comments should not be inside element tags (they can be between tags), and should begin with <span class="term">&lt;!--</span>&#160;and end with <span class="term">--&gt;</span>. Characters like <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;may be allowed inside depending on <span class="term">$config</span>, but any <span class="term">--&gt;</span>&#160;inside should be put in as <span class="term">--&amp;gt;</span>. Any <span class="term">--</span>&#160;inside will be automatically converted to <span class="term">-</span>, and a space will be added before the <span class="term">--&gt;</span>&#160;comment-closing marker &#160;unless <span class="term">$config["comments"]</span>&#160;is set to <span class="term">4</span>&#160;(<a href="#s3.3.1">section 3.3.1</a>).<br /> 754&#160; * &#160;HTML comments should not be inside element tags (they can be between tags), and should begin with <span class="term">&lt;!--</span>&#160;and end with <span class="term">--&gt;</span>. Characters like <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;may be allowed inside depending on <span class="term">$config</span>, but any <span class="term">--&gt;</span>&#160;inside should be put in as <span class="term">--&amp;gt;</span>. Any <span class="term">--</span>&#160;inside will be automatically converted to <span class="term">-</span>, and a space will be added before the <span class="term">--&gt;</span>&#160;comment-closing marker &#160;unless <span class="term">$config["comments"]</span>&#160;is set to <span class="term">4</span>&#160;(<a href="#s3.3.1">section 3.3.1</a>).<br />
755<br /> 755<br />
756&#160; * &#160;<span class="term">CDATA</span>&#160;sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with <span class="term">&lt;![CDATA[</span>&#160;and end with <span class="term">]]&gt;</span>. Characters like <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;may be allowed inside depending on <span class="term">$config</span>, but any <span class="term">]]&gt;</span>&#160;inside should be put in as <span class="term">]]&amp;gt;</span>.<br /> 756&#160; * &#160;<span class="term">CDATA</span>&#160;sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with <span class="term">&lt;![CDATA[</span>&#160;and end with <span class="term">]]&gt;</span>. Characters like <span class="term">&lt;</span>, <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;may be allowed inside depending on <span class="term">$config</span>, but any <span class="term">]]&gt;</span>&#160;inside should be put in as <span class="term">]]&amp;gt;</span>.<br />
757<br /> 757<br />
758&#160; * &#160;For attribute values, character entities <span class="term">&amp;lt;</span>, <span class="term">&amp;gt;</span>&#160;and <span class="term">&amp;amp;</span>&#160;should be used instead of characters <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;(when <span class="term">&amp;</span>&#160;is not part of a character entity). This applies even for Javascript code in values of attributes like <span class="term">onclick</span>.<br /> 758&#160; * &#160;For attribute values, character entities <span class="term">&amp;lt;</span>, <span class="term">&amp;gt;</span>&#160;and <span class="term">&amp;amp;</span>&#160;should be used instead of characters <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>, and <span class="term">&amp;</span>&#160;(when <span class="term">&amp;</span>&#160;is not part of a character entity). This applies even for Javascript code in values of attributes like <span class="term">onclick</span>.<br />
759<br /> 759<br />
760&#160; * &#160;Characters <span class="term">&lt;</span>, <span class="term">&gt;</span>, <span class="term">&amp;</span>&#160;and <span class="term">"</span>&#160;that are part of actual Javascript, etc., code in <span class="term">script</span>&#160;elements should be used as such and not be put in as entities like <span class="term">&amp;gt;</span>. Otherwise, though the HTML will be valid, the code may fail to work. Further, if such characters have to be used, then they should be put inside <span class="term">CDATA</span>&#160;sections.<br /> 760&#160; * &#160;Characters <span class="term">&lt;</span>, <span class="term">&gt;</span>, <span class="term">&amp;</span>&#160;and <span class="term">"</span>&#160;that are part of actual Javascript, etc., code in <span class="term">script</span>&#160;elements should be used as such and not be put in as entities like <span class="term">&amp;gt;</span>. Otherwise, though the HTML will be valid, the code may fail to work. Further, if such characters have to be used, then they should be put inside <span class="term">CDATA</span>&#160;sections.<br />
761<br /> 761<br />
762&#160; * &#160;Simple instructions like "an opening tag cannot be present between two closing tags" and "nested elements should be closed in the reverse order of how they were opened" can help authors write balanced HTML. If tags are imbalanced, htmLawed will try to balance them, but in the process, depending on <span class="term">$config["keep_bad"]</span>, some code/text may be lost.<br /> 762&#160; * &#160;Simple instructions like "an opening tag cannot be present between two closing tags" and "nested elements should be closed in the reverse order of how they were opened" can help authors write balanced HTML. If tags are imbalanced, htmLawed will try to balance them, but in the process, depending on <span class="term">$config["keep_bad"]</span>, some code/text may be lost.<br />
763<br /> 763<br />
764&#160; * &#160;Input authors should be notified of admin-specified allowed elements, attributes, configuration values (like conversion of named entities to numeric ones), etc.<br /> 764&#160; * &#160;Input authors should be notified of admin-specified allowed elements, attributes, configuration values (like conversion of named entities to numeric ones), etc.<br />
765<br /> 765<br />
766&#160; * &#160;With <span class="term">$config["unique_ids"]</span>&#160;not <span class="term">0</span>&#160;and the <span class="term">id</span>&#160;attribute being permitted, writers should carefully avoid using duplicate or invalid <span class="term">id</span>&#160;values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., <span class="term">&lt;a id="home"&gt;&lt;/a&gt;&lt;input id="home" /&gt;&lt;label for="home"&gt;&lt;/label&gt;</span>&#160;is processed into<br /> 766&#160; * &#160;With <span class="term">$config["unique_ids"]</span>&#160;not <span class="term">0</span>&#160;and the <span class="term">id</span>&#160;attribute being permitted, writers should carefully avoid using duplicate or invalid <span class="term">id</span>&#160;values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., <span class="term">&lt;a id="home"&gt;&lt;/a&gt;&lt;input id="home" /&gt;&lt;label for="home"&gt;&lt;/label&gt;</span>&#160;is processed into<br />
767<span class="term">&lt;a id="home"&gt;&lt;/a&gt;&lt;input id="prefix_home" /&gt;&lt;label for="home"&gt;&lt;/label&gt;</span>.<br /> 767<span class="term">&lt;a id="home"&gt;&lt;/a&gt;&lt;input id="prefix_home" /&gt;&lt;label for="home"&gt;&lt;/label&gt;</span>.<br />
768<br /> 768<br />
769&#160; * &#160;Even if intended HTML is lost from an ill-written input, the processed output will be more secure and standard-compliant.<br /> 769&#160; * &#160;Even if intended HTML is lost from an ill-written input, the processed output will be more secure and standard-compliant.<br />
770<br /> 770<br />
771&#160; * &#160;For URLs, unless <span class="term">$config["scheme"]</span>&#160;is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., <span class="term">htt&amp;#112;</span>&#160;(which many browsers will read as the harmless <span class="term">http</span>) may be considered bad by htmLawed.<br /> 771&#160; * &#160;For URLs, unless <span class="term">$config["scheme"]</span>&#160;is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., <span class="term">htt&amp;#112;</span>&#160;(which many browsers will read as the harmless <span class="term">http</span>) may be considered bad by htmLawed.<br />
772<br /> 772<br />
773&#160; * &#160;htmLawed will attempt to put plain text present directly inside <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span>&#160;and <span class="term">noscript</span>&#160;elements (illegal as per the specifications) inside auto-generated <span class="term">div</span>&#160;elements during tag balancing (<a href="#s3.3.3">section 3.3.3</a>).<br /> 773&#160; * &#160;htmLawed will attempt to put plain text present directly inside <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span>&#160;and <span class="term">noscript</span>&#160;elements (illegal as per the specifications) inside auto-generated <span class="term">div</span>&#160;elements during tag balancing (<a href="#s3.3.3">section 3.3.3</a>).<br />
774 774
775</div> 775</div>
776<div class="sub-section"><h3> 776<div class="sub-section"><h3>
777<a name="s2.8" id="s2.8"></a><span class="item-no">2.8</span>&#160; Limitations &amp; work-arounds 777<a name="s2.8" id="s2.8"></a><span class="item-no">2.8</span>&#160; Limitations &amp; work-arounds
778</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 778</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
779<br /> 779<br />
780&#160; htmLawed's main objective is to make the input text <em>more</em>&#160;standard-compliant, secure for readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with possible work-arounds.<br /> 780&#160; htmLawed's main objective is to make the input text <em>more</em>&#160;standard-compliant, secure for readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with possible work-arounds.<br />
781<br /> 781<br />
782&#160; It should be borne in mind that no browser application is 100% standard-compliant, standard specifications continue to evolve, and many browsers accept commonly used non-standard HTML. Regarding security, note that <em>unsafe</em>&#160;HTML code is not legally invalid per se.<br /> 782&#160; It should be borne in mind that no browser application is 100% standard-compliant, standard specifications continue to evolve, and many browsers accept commonly used non-standard HTML. Regarding security, note that <em>unsafe</em>&#160;HTML code is not legally invalid per se.<br />
783<br /> 783<br />
784&#160; * &#160;By default, htmLawed will not strictly adhere to the <em>current</em>&#160;HTML standard. Admins can configure htmLawed to be more strict about standard compliance. Standard specification for HTML is continuously evolving. There are two bodies (<a href="http://www.w3c.org">W3C</a>&#160;and <a href="http://www.whatwg.org">WHATWG</a>) that specify the standard and their specifications are not identical. E.g., as in mid-2013, the <span class="term">border</span>&#160;attribute is valid in <span class="term">table</span>&#160;as per W3C but not WHATWG. Thus, htmLawed may not be fully compliant with the standard of a specific group. The HTML standards/rules that htmLawed uses in its logic are a mix of the W3C and WHATWG standards, and can be lax because of the laxity of HTML interpreters (browsers) regarding standards.<br /> 784&#160; * &#160;By default, htmLawed will not strictly adhere to the <em>current</em>&#160;HTML standard. Admins can configure htmLawed to be more strict about standard compliance. Standard specification for HTML is continuously evolving. There are two bodies (<a href="http://www.w3c.org">W3C</a>&#160;and <a href="http://www.whatwg.org">WHATWG</a>) that specify the standard and their specifications are not identical. E.g., as in mid-2013, the <span class="term">border</span>&#160;attribute is valid in <span class="term">table</span>&#160;as per W3C but not WHATWG. Thus, htmLawed may not be fully compliant with the standard of a specific group. The HTML standards/rules that htmLawed uses in its logic are a mix of the W3C and WHATWG standards, and can be lax because of the laxity of HTML interpreters (browsers) regarding standards.<br />
785<br /> 785<br />
786&#160; * &#160;In general, htmLawed processes input to generate output that is most likely to be standard-compatible in most users' browsers. Thus, for example, it does not enforce the required value of <span class="term">0</span>&#160;on <span class="term">border</span>&#160;attribute of <span class="term">img</span>&#160;(an HTML version 5 specification).<br /> 786&#160; * &#160;In general, htmLawed processes input to generate output that is most likely to be standard-compatible in most users' browsers. Thus, for example, it does not enforce the required value of <span class="term">0</span>&#160;on <span class="term">border</span>&#160;attribute of <span class="term">img</span>&#160;(an HTML version 5 specification).<br />
787<br /> 787<br />
788&#160; * &#160;htmLawed is meant for input that goes into the <span class="term">body</span>&#160;of HTML documents. HTML's head-level elements are not supported, nor are the frame-specific elements <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>. However, content of the latter elements can be individually filtered through htmLawed.<br /> 788&#160; * &#160;htmLawed is meant for input that goes into the <span class="term">body</span>&#160;of HTML documents. HTML's head-level elements are not supported, nor are the frame-specific elements <span class="term">frameset</span>, <span class="term">frame</span>&#160;and <span class="term">noframes</span>. However, content of the latter elements can be individually filtered through htmLawed.<br />
789<br /> 789<br />
790&#160; * &#160;It cannot handle input that has non-HTML code like <span class="term">SVG</span>&#160;and <span class="term">MathML</span>. One way around is to break the input into pieces and pass only those without non-HTML code to htmLawed. Another is described in <a href="#s3.9">section 3.9</a>. A third way may be to somehow take advantage of the <span class="term">$config["and_mark"]</span>&#160;parameter (see <a href="#s3.2">section 3.2</a>).<br /> 790&#160; * &#160;It cannot handle input that has non-HTML code like <span class="term">SVG</span>&#160;and <span class="term">MathML</span>. One way around is to break the input into pieces and pass only those without non-HTML code to htmLawed. Another is described in <a href="#s3.9">section 3.9</a>. A third way may be to somehow take advantage of the <span class="term">$config["and_mark"]</span>&#160;parameter (see <a href="#s3.2">section 3.2</a>).<br />
791<br /> 791<br />
792&#160; * &#160;By default, htmLawed won't check many attribute values for standard compliance. E.g., <span class="term">width="20m"</span>&#160;with the dimension in non-standard <span class="term">m</span>&#160;is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) or <span class="term">$spec</span>&#160;to enforce finer checks on attribute values.<br /> 792&#160; * &#160;By default, htmLawed won't check many attribute values for standard compliance. E.g., <span class="term">width="20m"</span>&#160;with the dimension in non-standard <span class="term">m</span>&#160;is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) or <span class="term">$spec</span>&#160;to enforce finer checks on attribute values.<br />
793<br /> 793<br />
794&#160; * &#160;By default, htmLawed considers all ARIA, data-*, event and microdata attributes as global attributes and permits them in all elements. This is not strictly standard-compliant. E.g., the <span class="term">itemtype</span>&#160;microdata attribute is permitted only in elements that also have the <span class="term">itemscope</span>&#160;attribute. Admins can configure htmLawed to be more strict about this (<a href="#s2.3">section 2.3</a>).<br /> 794&#160; * &#160;By default, htmLawed considers all ARIA, data-*, event and microdata attributes as global attributes and permits them in all elements. This is not strictly standard-compliant. E.g., the <span class="term">itemtype</span>&#160;microdata attribute is permitted only in elements that also have the <span class="term">itemscope</span>&#160;attribute. Admins can configure htmLawed to be more strict about this (<a href="#s2.3">section 2.3</a>).<br />
795<br /> 795<br />
796&#160; * &#160;The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported. However, <span class="term">$spec</span>&#160;can be used to allow custom attributes (<a href="#s2.3">section 2.3</a>).<br /> 796&#160; * &#160;The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported. However, <span class="term">$spec</span>&#160;can be used to allow custom attributes (<a href="#s2.3">section 2.3</a>).<br />
797<br /> 797<br />
798&#160; * &#160;Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) or <span class="term">$spec</span>&#160;for finer checks. Perhaps the best option is to disallow <span class="term">style</span>&#160;but allow <span class="term">class</span>&#160;attributes with the right <span class="term">oneof</span>&#160;or <span class="term">match</span>&#160;values for <span class="term">class</span>, and have the various class style properties in <span class="term">.css</span>&#160;CSS stylesheet files.<br /> 798&#160; * &#160;Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) or <span class="term">$spec</span>&#160;for finer checks. Perhaps the best option is to disallow <span class="term">style</span>&#160;but allow <span class="term">class</span>&#160;attributes with the right <span class="term">oneof</span>&#160;or <span class="term">match</span>&#160;values for <span class="term">class</span>, and have the various class style properties in <span class="term">.css</span>&#160;CSS stylesheet files.<br />
799<br /> 799<br />
800&#160; * &#160;htmLawed does not parse emoticons, decode <em>BBcode</em>, or <em>wikify</em>, auto-converting text to proper HTML. Similarly, it won't convert line-breaks to <span class="term">br</span>&#160;elements. Such functions are beyond its purview. Admins should use other code to pre- or post-process the input for such purposes.<br /> 800&#160; * &#160;htmLawed does not parse emoticons, decode <em>BBcode</em>, or <em>wikify</em>, auto-converting text to proper HTML. Similarly, it won't convert line-breaks to <span class="term">br</span>&#160;elements. Such functions are beyond its purview. Admins should use other code to pre- or post-process the input for such purposes.<br />
801<br /> 801<br />
802&#160; * &#160;htmLawed cannot be used to have links force-opened in new windows (by auto-adding appropriate <span class="term">target</span>&#160;and <span class="term">onclick</span>&#160;attributes to <span class="term">a</span>). Admins should look at Javascript-based DOM-modifying solutions for this. Admins may also be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> 802&#160; * &#160;htmLawed cannot be used to have links force-opened in new windows (by auto-adding appropriate <span class="term">target</span>&#160;and <span class="term">onclick</span>&#160;attributes to <span class="term">a</span>). Admins should look at Javascript-based DOM-modifying solutions for this. Admins may also be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
803<br /> 803<br />
804&#160; * &#160;Nesting-based checks are not possible. E.g., one cannot disallow <span class="term">p</span>&#160;elements specifically inside <span class="term">td</span>&#160;while permitting it elsewhere. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> 804&#160; * &#160;Nesting-based checks are not possible. E.g., one cannot disallow <span class="term">p</span>&#160;elements specifically inside <span class="term">td</span>&#160;while permitting it elsewhere. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
805<br /> 805<br />
806&#160; * &#160;Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert <span class="term">http</span>&#160;to <span class="term">https</span>). Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> 806&#160; * &#160;Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert <span class="term">http</span>&#160;to <span class="term">https</span>). Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
807<br /> 807<br />
808&#160; * &#160;Pairs of opening and closing tags that do not enclose any content (like <span class="term">&lt;em&gt;&lt;/em&gt;</span>) are not removed. This may be against the standard specification for certain elements (e.g., <span class="term">table</span>). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.<br /> 808&#160; * &#160;Pairs of opening and closing tags that do not enclose any content (like <span class="term">&lt;em&gt;&lt;/em&gt;</span>) are not removed. This may be against the standard specification for certain elements (e.g., <span class="term">table</span>). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.<br />
809<br /> 809<br />
810&#160; * &#160;htmLawed does not check for certain element orderings described in the standard specifications (e.g., in a <span class="term">table</span>, <span class="term">tbody</span>&#160;is allowed before <span class="term">tfoot</span>). Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> 810&#160; * &#160;htmLawed does not check for certain element orderings described in the standard specifications (e.g., in a <span class="term">table</span>, <span class="term">tbody</span>&#160;is allowed before <span class="term">tfoot</span>). Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
811<br /> 811<br />
812&#160; * &#160;htmLawed does not check the number of nested elements. E.g., it will allow two <span class="term">caption</span>&#160;elements in a <span class="term">table</span>&#160;element, illegal as per standard specifications. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br /> 812&#160; * &#160;htmLawed does not check the number of nested elements. E.g., it will allow two <span class="term">caption</span>&#160;elements in a <span class="term">table</span>&#160;element, illegal as per standard specifications. Admins may be able to use a custom hook function to enforce such checks (<span class="term">hook_tag</span>&#160;parameter; see <a href="#s3.4.9">section 3.4.9</a>).<br />
813<br /> 813<br />
814&#160; * &#160;There are multiple ways to interpret ill-written HTML. E.g., in <span class="term">&lt;small&gt;&lt;small&gt;text&lt;/small&gt;</span>, is it that the second closing tag for <span class="term">small</span>&#160;is missing or is it that the second opening tag for <span class="term">small</span>&#160;was put in by mistake? htmLawed corrects the HTML in the string assuming the former, while the user may have intended the string for the latter. This is an issue that is impossible to address perfectly.<br /> 814&#160; * &#160;There are multiple ways to interpret ill-written HTML. E.g., in <span class="term">&lt;small&gt;&lt;small&gt;text&lt;/small&gt;</span>, is it that the second closing tag for <span class="term">small</span>&#160;is missing or is it that the second opening tag for <span class="term">small</span>&#160;was put in by mistake? htmLawed corrects the HTML in the string assuming the former, while the user may have intended the string for the latter. This is an issue that is impossible to address perfectly.<br />
815<br /> 815<br />
816&#160; * &#160;htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers (<span class="term">/&#42;</span>) in <span class="term">style</span>&#160;attribute values in order to detect malicious HTML like crafted, Internet Explorer browser-specific dynamic expressions like <span class="term">&amp;#101;xpression...</span>. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) to more specifically identify CSS expressions in the <span class="term">style</span>&#160;attribute values. Also, using <span class="term">$config["style_pass"]</span>, it is possible to have htmLawed pass <span class="term">style</span>&#160;attribute values without even looking at them (<a href="#s3.4.8">section 3.4.8</a>).<br /> 816&#160; * &#160;htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers (<span class="term">/&#42;</span>) in <span class="term">style</span>&#160;attribute values in order to detect malicious HTML like crafted, Internet Explorer browser-specific dynamic expressions like <span class="term">&amp;#101;xpression...</span>. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>) to more specifically identify CSS expressions in the <span class="term">style</span>&#160;attribute values. Also, using <span class="term">$config["style_pass"]</span>, it is possible to have htmLawed pass <span class="term">style</span>&#160;attribute values without even looking at them (<a href="#s3.4.8">section 3.4.8</a>).<br />
817<br /> 817<br />
818&#160; * &#160;htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., <span class="term">&lt;a href="http&#58;//x%22+style=%22background-image&#58;xss"&gt;x&lt;/a&gt;</span>). These arise when browsers mis-identify markup in <em>escaped</em>&#160;text, defeating the very purpose of escaping text (a bad browser will read the given example as <span class="term">&lt;a href="http&#58;//x" style="background-image&#58;xss"&gt;x&lt;/a&gt;</span>).<br /> 818&#160; * &#160;htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., <span class="term">&lt;a href="http&#58;//x%22+style=%22background-image&#58;xss"&gt;x&lt;/a&gt;</span>). These arise when browsers mis-identify markup in <em>escaped</em>&#160;text, defeating the very purpose of escaping text (a bad browser will read the given example as <span class="term">&lt;a href="http&#58;//x" style="background-image&#58;xss"&gt;x&lt;/a&gt;</span>).<br />
819<br /> 819<br />
820&#160; * &#160;Because of poor Unicode support in PHP, htmLawed does not remove the <em>high value</em>&#160;HTML-invalid characters with multi-byte code-points. Such characters, however, are extremely unlikely to be in the input (see <a href="#s3.1">section 3.1</a>).<br /> 820&#160; * &#160;Because of poor Unicode support in PHP, htmLawed does not remove the <em>high value</em>&#160;HTML-invalid characters with multi-byte code-points. Such characters, however, are extremely unlikely to be in the input (see <a href="#s3.1">section 3.1</a>).<br />
821<br /> 821<br />
822&#160; * &#160;htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML <span class="term">meta</span>&#160;tags, this can permit an exploit (like Google's <em>UTF-7/XSS</em>&#160;vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect.<br /> 822&#160; * &#160;htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML <span class="term">meta</span>&#160;tags, this can permit an exploit (like Google's <em>UTF-7/XSS</em>&#160;vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect.<br />
823<br /> 823<br />
824&#160; * &#160;htmLawed is expected to work with input texts in ASCII standard-compatible single-byte encodings such as national variants of ASCII (like ISO-646-DE/German of the ISO 646 standard), extended ASCII variants (like ISO 8859-10/Turkish of the ISO 8859/ISO Latin standard), ISO 8859-based Windows variants (like Windows 1252), EBCDIC, Shift JIS (Japanese), GB-Roman (Chinese), and KS-Roman (Korean). It should also properly handle texts with variable-byte encodings like UTF-7 (Unicode) and UTF-8 (Unicode). However, htmLawed may mangle input texts with double-byte encodings like UTF-16 (Unicode), JIS X 0208:1997 (Japanese) and KS X 1001:1992 (Korean), or the UTF-32 (Unicode) quadruple-byte encoding. If an input text has such an encoding, administrators can use PHP's <a href="http://php.net/manual/en/book.iconv.php">iconv</a>&#160;functions, or some other means, to convert text to UTF-8 before passing it to htmLawed.<br /> 824&#160; * &#160;htmLawed is expected to work with input texts in ASCII standard-compatible single-byte encodings such as national variants of ASCII (like ISO-646-DE/German of the ISO 646 standard), extended ASCII variants (like ISO 8859-10/Turkish of the ISO 8859/ISO Latin standard), ISO 8859-based Windows variants (like Windows 1252), EBCDIC, Shift JIS (Japanese), GB-Roman (Chinese), and KS-Roman (Korean). It should also properly handle texts with variable-byte encodings like UTF-7 (Unicode) and UTF-8 (Unicode). However, htmLawed may mangle input texts with double-byte encodings like UTF-16 (Unicode), JIS X 0208:1997 (Japanese) and KS X 1001:1992 (Korean), or the UTF-32 (Unicode) quadruple-byte encoding. If an input text has such an encoding, administrators can use PHP's <a href="http://php.net/manual/en/book.iconv.php">iconv</a>&#160;functions, or some other means, to convert text to UTF-8 before passing it to htmLawed.<br />
825<br /> 825<br />
826&#160; * &#160;Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts.<br /> 826&#160; * &#160;Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts.<br />
827 827
828</div> 828</div>
829<div class="sub-section"><h3> 829<div class="sub-section"><h3>
830<a name="s2.9" id="s2.9"></a><span class="item-no">2.9</span>&#160; Examples of usage 830<a name="s2.9" id="s2.9"></a><span class="item-no">2.9</span>&#160; Examples of usage
831</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 831</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
832<br /> 832<br />
833&#160; Safest, allowing only <em>safe</em>&#160;HTML markup --<br /> 833&#160; Safest, allowing only <em>safe</em>&#160;HTML markup --<br />
834<br /> 834<br />
835 835
836<code class="code">&#160; &#160; $config = array(&#39;safe&#39;=&gt;1);</code> 836<code class="code">&#160; &#160; $config = array(&#39;safe&#39;=&gt;1);</code>
837<br /> 837<br />
838 838
839<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code> 839<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
840<br /> 840<br />
841<br /> 841<br />
842&#160; Simplest, allowing all valid HTML markup including Javascript --<br /> 842&#160; Simplest, allowing all valid HTML markup including Javascript --<br />
843<br /> 843<br />
844 844
845<code class="code">&#160; &#160; $out = htmLawed($in);</code> 845<code class="code">&#160; &#160; $out = htmLawed($in);</code>
846<br /> 846<br />
847<br /> 847<br />
848&#160; Allowing all valid HTML markup but restricting URL schemes in <span class="term">src</span>&#160;attribute values to <span class="term">http</span>&#160;and <span class="term">https</span>&#160;--<br /> 848&#160; Allowing all valid HTML markup but restricting URL schemes in <span class="term">src</span>&#160;attribute values to <span class="term">http</span>&#160;and <span class="term">https</span>&#160;--<br />
849<br /> 849<br />
850 850
851<code class="code">&#160; &#160; $config = array(&#39;schemes&#39;=&gt;&#39;&#42;&#58;&#42;; src&#58;http, https&#39;);</code> 851<code class="code">&#160; &#160; $config = array(&#39;schemes&#39;=&gt;&#39;&#42;&#58;&#42;; src&#58;http, https&#39;);</code>
852<br /> 852<br />
853 853
854<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code> 854<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
855<br /> 855<br />
856<br /> 856<br />
857&#160; Allowing only <span class="term">safe</span>&#160;HTML and the elements <span class="term">a</span>, <span class="term">em</span>, and <span class="term">strong</span>&#160;--<br /> 857&#160; Allowing only <span class="term">safe</span>&#160;HTML and the elements <span class="term">a</span>, <span class="term">em</span>, and <span class="term">strong</span>&#160;--<br />
858<br /> 858<br />
859 859
860<code class="code">&#160; &#160; $config = array(&#39;safe&#39;=&gt;1, &#39;elements&#39;=&gt;&#39;a, em, strong&#39;);</code> 860<code class="code">&#160; &#160; $config = array(&#39;safe&#39;=&gt;1, &#39;elements&#39;=&gt;&#39;a, em, strong&#39;);</code>
861<br /> 861<br />
862 862
863<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code> 863<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
864<br /> 864<br />
865<br /> 865<br />
866&#160; Not allowing elements <span class="term">script</span>&#160;and <span class="term">object</span>&#160;--<br /> 866&#160; Not allowing elements <span class="term">script</span>&#160;and <span class="term">object</span>&#160;--<br />
867<br /> 867<br />
868 868
869<code class="code">&#160; &#160; $config = array(&#39;elements&#39;=&gt;&#39;&#42; -script -object&#39;);</code> 869<code class="code">&#160; &#160; $config = array(&#39;elements&#39;=&gt;&#39;&#42; -script -object&#39;);</code>
870<br /> 870<br />
871 871
872<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code> 872<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
873<br /> 873<br />
874<br /> 874<br />
875&#160; Not allowing attributes <span class="term">id</span>&#160;and <span class="term">style</span>&#160;--<br /> 875&#160; Not allowing attributes <span class="term">id</span>&#160;and <span class="term">style</span>&#160;--<br />
876<br /> 876<br />
877 877
878<code class="code">&#160; &#160; $config = array(&#39;deny_attribute&#39;=&gt;&#39;id, style&#39;);</code> 878<code class="code">&#160; &#160; $config = array(&#39;deny_attribute&#39;=&gt;&#39;id, style&#39;);</code>
879<br /> 879<br />
880 880
881<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code> 881<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
882<br /> 882<br />
883<br /> 883<br />
884&#160; Permitting only attributes <span class="term">title</span>&#160;and <span class="term">href</span>&#160;--<br /> 884&#160; Permitting only attributes <span class="term">title</span>&#160;and <span class="term">href</span>&#160;--<br />
885<br /> 885<br />
886 886
887<code class="code">&#160; &#160; $config = array(&#39;deny_attribute&#39;=&gt;&#39;&#42; -title -href&#39;);</code> 887<code class="code">&#160; &#160; $config = array(&#39;deny_attribute&#39;=&gt;&#39;&#42; -title -href&#39;);</code>
888<br /> 888<br />
889 889
890<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code> 890<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
891<br /> 891<br />
892<br /> 892<br />
893&#160; Remove bad/disallowed tags altogether instead of converting them to entities --<br /> 893&#160; Remove bad/disallowed tags altogether instead of converting them to entities --<br />
894<br /> 894<br />
895 895
896<code class="code">&#160; &#160; $config = array(&#39;keep_bad&#39;=&gt;0);</code> 896<code class="code">&#160; &#160; $config = array(&#39;keep_bad&#39;=&gt;0);</code>
897<br /> 897<br />
898 898
899<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code> 899<code class="code">&#160; &#160; $out = htmLawed($in, $config);</code>
900<br /> 900<br />
901<br /> 901<br />
902&#160; Allowing attribute <span class="term">title</span>&#160;only in <span class="term">a</span>&#160;and not allowing attributes <span class="term">id</span>, <span class="term">style</span>, or scriptable <em>on*</em>&#160;attributes like <span class="term">onclick</span>&#160;--<br /> 902&#160; Allowing attribute <span class="term">title</span>&#160;only in <span class="term">a</span>&#160;and not allowing attributes <span class="term">id</span>, <span class="term">style</span>, or scriptable <em>on*</em>&#160;attributes like <span class="term">onclick</span>&#160;--<br />
903<br /> 903<br />
904 904
905<code class="code">&#160; &#160; $config = array(&#39;deny_attribute&#39;=&gt;&#39;title, id, style, on&#42;&#39;);</code> 905<code class="code">&#160; &#160; $config = array(&#39;deny_attribute&#39;=&gt;&#39;title, id, style, on&#42;&#39;);</code>
906<br /> 906<br />
907 907
908<code class="code">&#160; &#160; $spec = &#39;a=title&#39;;</code> 908<code class="code">&#160; &#160; $spec = &#39;a=title&#39;;</code>
909<br /> 909<br />
910 910
911<code class="code">&#160; &#160; $out = htmLawed($in, $config, $spec);</code> 911<code class="code">&#160; &#160; $out = htmLawed($in, $config, $spec);</code>
912<br /> 912<br />
913<br /> 913<br />
914&#160; Allowing a custom attribute, <span class="term">vFlag</span>, in <span class="term">img</span>&#160;and permitting custom use of the standard attribute, <span class="term">rel</span>, in <span class="term">input</span>&#160;--<br /> 914&#160; Allowing a custom attribute, <span class="term">vFlag</span>, in <span class="term">img</span>&#160;and permitting custom use of the standard attribute, <span class="term">rel</span>, in <span class="term">input</span>&#160;--<br />
915<br /> 915<br />
916 916
917<code class="code">&#160; &#160; $spec = &#39;img=vFlag; input=rel&#39;;</code> 917<code class="code">&#160; &#160; $spec = &#39;img=vFlag; input=rel&#39;;</code>
918<br /> 918<br />
919 919
920<code class="code">&#160; &#160; $out = htmLawed($in, $config, $spec);</code> 920<code class="code">&#160; &#160; $out = htmLawed($in, $config, $spec);</code>
921<br /> 921<br />
922<br /> 922<br />
923&#160; Some case-studies are presented below.<br /> 923&#160; Some case-studies are presented below.<br />
924<br /> 924<br />
925&#160; <strong>1.</strong>&#160;A blog administrator wants to allow only <span class="term">a</span>, <span class="term">em</span>, <span class="term">strike</span>, <span class="term">strong</span>&#160;and <span class="term">u</span>&#160;in comments, but needs <span class="term">strike</span>&#160;and <span class="term">u</span>&#160;transformed to <span class="term">span</span>&#160;for better XHTML 1-strict compliance, and, he wants the <span class="term">a</span>&#160;links to point only to <span class="term">http</span>&#160;or <span class="term">https</span>&#160;resources:<br /> 925&#160; <strong>1.</strong>&#160;A blog administrator wants to allow only <span class="term">a</span>, <span class="term">em</span>, <span class="term">strike</span>, <span class="term">strong</span>&#160;and <span class="term">u</span>&#160;in comments, but needs <span class="term">strike</span>&#160;and <span class="term">u</span>&#160;transformed to <span class="term">span</span>&#160;for better XHTML 1-strict compliance, and, he wants the <span class="term">a</span>&#160;links to point only to <span class="term">http</span>&#160;or <span class="term">https</span>&#160;resources:<br />
926<br /> 926<br />
927 927
928<code class="code">&#160; &#160; $processed = htmLawed($in, array(&#39;elements&#39;=&gt;&#39;a, em, strike, strong, u&#39;, &#39;make_tag_strict&#39;=&gt;1, &#39;safe&#39;=&gt;1, &#39;schemes&#39;=&gt;&#39;&#42;&#58;http, https&#39;), &#39;a=href&#39;);</code> 928<code class="code">&#160; &#160; $processed = htmLawed($in, array(&#39;elements&#39;=&gt;&#39;a, em, strike, strong, u&#39;, &#39;make_tag_strict&#39;=&gt;1, &#39;safe&#39;=&gt;1, &#39;schemes&#39;=&gt;&#39;&#42;&#58;http, https&#39;), &#39;a=href&#39;);</code>
929<br /> 929<br />
930<br /> 930<br />
931&#160; <strong>2.</strong>&#160;An author uses a custom-made web application to load content on his website. He is the only one using that application and the content he generates has all types of HTML, including scripts. The web application uses htmLawed primarily as a tool to correct errors that creep in while writing HTML and to take care of the occasional <em>bad</em>&#160;characters in copy-paste text introduced by Microsoft Office. The web application provides a preview before submitted input is added to the content. For the previewing process, htmLawed is set up as follows:<br /> 931&#160; <strong>2.</strong>&#160;An author uses a custom-made web application to load content on his website. He is the only one using that application and the content he generates has all types of HTML, including scripts. The web application uses htmLawed primarily as a tool to correct errors that creep in while writing HTML and to take care of the occasional <em>bad</em>&#160;characters in copy-paste text introduced by Microsoft Office. The web application provides a preview before submitted input is added to the content. For the previewing process, htmLawed is set up as follows:<br />
932<br /> 932<br />
933 933
934<code class="code">&#160; &#160; $processed = htmLawed($in, array(&#39;css_expression&#39;=&gt;1, &#39;keep_bad&#39;=&gt;1, &#39;make_tag_strict&#39;=&gt;1, &#39;schemes&#39;=&gt;&#39;&#42;&#58;&#42;&#39;, &#39;valid_xhtml&#39;=&gt;1));</code> 934<code class="code">&#160; &#160; $processed = htmLawed($in, array(&#39;css_expression&#39;=&gt;1, &#39;keep_bad&#39;=&gt;1, &#39;make_tag_strict&#39;=&gt;1, &#39;schemes&#39;=&gt;&#39;&#42;&#58;&#42;&#39;, &#39;valid_xhtml&#39;=&gt;1));</code>
935<br /> 935<br />
936<br /> 936<br />
937&#160; For the final submission process, <span class="term">keep_bad</span>&#160;is set to <span class="term">6</span>. A value of <span class="term">1</span>&#160;for the preview process allows the author to note and correct any HTML mistake without losing any of the typed text.<br /> 937&#160; For the final submission process, <span class="term">keep_bad</span>&#160;is set to <span class="term">6</span>. A value of <span class="term">1</span>&#160;for the preview process allows the author to note and correct any HTML mistake without losing any of the typed text.<br />
938<br /> 938<br />
939&#160; <strong>3.</strong>&#160;A data-miner is scraping information in a specific table of similar web-pages and is collating the data rows, and uses htmLawed to reduce unnecessary markup and white-spaces:<br /> 939&#160; <strong>3.</strong>&#160;A data-miner is scraping information in a specific table of similar web-pages and is collating the data rows, and uses htmLawed to reduce unnecessary markup and white-spaces:<br />
940<br /> 940<br />
941 941
942<code class="code">&#160; &#160; $processed = htmLawed($in, array(&#39;elements&#39;=&gt;&#39;tr, td&#39;, &#39;tidy&#39;=&gt;-1), &#39;tr, td =&#39;);</code> 942<code class="code">&#160; &#160; $processed = htmLawed($in, array(&#39;elements&#39;=&gt;&#39;tr, td&#39;, &#39;tidy&#39;=&gt;-1), &#39;tr, td =&#39;);</code>
943<br /> 943<br />
944 944
945</div> 945</div>
946</div> 946</div>
947<div class="section"><h2> 947<div class="section"><h2>
948<a name="s3" id="s3"></a><span class="item-no">3</span>&#160; Details 948<a name="s3" id="s3"></a><span class="item-no">3</span>&#160; Details
949</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 949</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
950<div class="sub-section"><h3> 950<div class="sub-section"><h3>
951<a name="s3.1" id="s3.1"></a><span class="item-no">3.1</span>&#160; Invalid/dangerous characters 951<a name="s3.1" id="s3.1"></a><span class="item-no">3.1</span>&#160; Invalid/dangerous characters
952</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 952</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
953<br /> 953<br />
954&#160; Valid characters (more correctly, their code-points) in HTML or XML are, hexadecimally, <span class="term">9</span>, <span class="term">a</span>, <span class="term">d</span>, <span class="term">20</span>&#160;to <span class="term">d7ff</span>, and <span class="term">e000</span>&#160;to <span class="term">10ffff</span>, except <span class="term">fffe</span>&#160;and <span class="term">ffff</span>&#160;(decimally, <span class="term">9</span>, <span class="term">10</span>, <span class="term">13</span>, <span class="term">32</span>&#160;to <span class="term">55295</span>, and <span class="term">57344</span>&#160;to <span class="term">1114111</span>, except <span class="term">65534</span>&#160;and <span class="term">65535</span>). htmLawed removes the invalid characters <span class="term">0</span>&#160;to <span class="term">8</span>, <span class="term">b</span>, <span class="term">c</span>, and <span class="term">e</span>&#160;to <span class="term">1f</span>.<br /> 954&#160; Valid characters (more correctly, their code-points) in HTML or XML are, hexadecimally, <span class="term">9</span>, <span class="term">a</span>, <span class="term">d</span>, <span class="term">20</span>&#160;to <span class="term">d7ff</span>, and <span class="term">e000</span>&#160;to <span class="term">10ffff</span>, except <span class="term">fffe</span>&#160;and <span class="term">ffff</span>&#160;(decimally, <span class="term">9</span>, <span class="term">10</span>, <span class="term">13</span>, <span class="term">32</span>&#160;to <span class="term">55295</span>, and <span class="term">57344</span>&#160;to <span class="term">1114111</span>, except <span class="term">65534</span>&#160;and <span class="term">65535</span>). htmLawed removes the invalid characters <span class="term">0</span>&#160;to <span class="term">8</span>, <span class="term">b</span>, <span class="term">c</span>, and <span class="term">e</span>&#160;to <span class="term">1f</span>.<br />
955<br /> 955<br />
956&#160; Because of PHP's poor native support for multi-byte characters, htmLawed cannot check for the remaining invalid code-points. However, for various reasons, it is very unlikely for any of those characters to be in the input.<br /> 956&#160; Because of PHP's poor native support for multi-byte characters, htmLawed cannot check for the remaining invalid code-points. However, for various reasons, it is very unlikely for any of those characters to be in the input.<br />
957<br /> 957<br />
958&#160; Characters that are discouraged (see <a href="#s5.1">section 5.1</a>) but not invalid are not removed by htmLawed.<br /> 958&#160; Characters that are discouraged (see <a href="#s5.1">section 5.1</a>) but not invalid are not removed by htmLawed.<br />
959<br /> 959<br />
960&#160; It (function <span class="term">hl_tag()</span>) also replaces the potentially dangerous (in some Mozilla [Firefox] and Opera browsers) soft-hyphen character (code-point, hexadecimally, <span class="term">ad</span>, or decimally, <span class="term">173</span>) in attribute values with spaces. Where required, the characters <span class="term">&lt;</span>, <span class="term">&gt;</span>, <span class="term">&amp;</span>, and <span class="term">"</span>&#160;are converted to entities.<br /> 960&#160; It (function <span class="term">hl_tag()</span>) also replaces the potentially dangerous (in some Mozilla [Firefox] and Opera browsers) soft-hyphen character (code-point, hexadecimally, <span class="term">ad</span>, or decimally, <span class="term">173</span>) in attribute values with spaces. Where required, the characters <span class="term">&lt;</span>, <span class="term">&gt;</span>, <span class="term">&amp;</span>, and <span class="term">"</span>&#160;are converted to entities.<br />
961<br /> 961<br />
962&#160; With <span class="term">$config["clean_ms_char"]</span>&#160;set as <span class="term">1</span>&#160;or <span class="term">2</span>, many of the discouraged characters (decimal code-points <span class="term">127</span>&#160;to <span class="term">159</span>&#160;except <span class="term">133</span>) that many Microsoft applications incorrectly use (as per the <span class="term">Windows 1252</span>&#160;[<span class="term">Cp-1252</span>] or a similar encoding system), and the character for decimal code-point <span class="term">133</span>, are converted to appropriate decimal numerical entities (or removed for a few cases)-- see appendix in <a href="#s5.4">section 5.4</a>. This can help avoid some display issues arising from copying-pasting of content.<br /> 962&#160; With <span class="term">$config["clean_ms_char"]</span>&#160;set as <span class="term">1</span>&#160;or <span class="term">2</span>, many of the discouraged characters (decimal code-points <span class="term">127</span>&#160;to <span class="term">159</span>&#160;except <span class="term">133</span>) that many Microsoft applications incorrectly use (as per the <span class="term">Windows 1252</span>&#160;[<span class="term">Cp-1252</span>] or a similar encoding system), and the character for decimal code-point <span class="term">133</span>, are converted to appropriate decimal numerical entities (or removed for a few cases)-- see appendix in <a href="#s5.4">section 5.4</a>. This can help avoid some display issues arising from copying-pasting of content.<br />
963<br /> 963<br />
964&#160; With <span class="term">$config["clean_ms_char"]</span>&#160;set as <span class="term">2</span>, characters for the hexadecimal code-points <span class="term">82</span>, <span class="term">91</span>, and <span class="term">92</span>&#160;(for special single-quotes), and <span class="term">84</span>, <span class="term">93</span>, and <span class="term">94</span>&#160;(for special double-quotes) are converted to ordinary single and double quotes respectively and not to entities.<br /> 964&#160; With <span class="term">$config["clean_ms_char"]</span>&#160;set as <span class="term">2</span>, characters for the hexadecimal code-points <span class="term">82</span>, <span class="term">91</span>, and <span class="term">92</span>&#160;(for special single-quotes), and <span class="term">84</span>, <span class="term">93</span>, and <span class="term">94</span>&#160;(for special double-quotes) are converted to ordinary single and double quotes respectively and not to entities.<br />
965<br /> 965<br />
966&#160; The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.<br /> 966&#160; The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.<br />
967<br /> 967<br />
968&#160; The <span class="term">$config["clean_ms_char"]</span>&#160;parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the <span class="term">Windows 1252</span>&#160;(<span class="term">Cp-1252</span>) or a similar encoding like <span class="term">Cp-1251</span>&#160;(otherwise, for example when UTF-8 encoding is in use, Japanese or Korean characters can get mangled). Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.<br /> 968&#160; The <span class="term">$config["clean_ms_char"]</span>&#160;parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the <span class="term">Windows 1252</span>&#160;(<span class="term">Cp-1252</span>) or a similar encoding like <span class="term">Cp-1251</span>&#160;(otherwise, for example when UTF-8 encoding is in use, Japanese or Korean characters can get mangled). Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.<br />
969 969
970</div> 970</div>
971<div class="sub-section"><h3> 971<div class="sub-section"><h3>
972<a name="s3.2" id="s3.2"></a><span class="item-no">3.2</span>&#160; Character references/entities 972<a name="s3.2" id="s3.2"></a><span class="item-no">3.2</span>&#160; Character references/entities
973</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 973</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
974<br /> 974<br />
975&#160; Valid character entities take the form <span class="term">&amp;&#42;;</span>&#160;where <span class="term">&#42;</span>&#160;is <span class="term">#x</span>&#160;followed by a hexadecimal number (hexadecimal numeric entity; like <span class="term">&amp;#xA0;</span>&#160;for non-breaking space), or alphanumeric like <span class="term">gt</span>&#160;(external or named entity; like <span class="term">&amp;nbsp;</span>&#160;for non-breaking space), or <span class="term">#</span>&#160;followed by a number (decimal numeric entity; like <span class="term">&amp;#160;</span>&#160;for non-breaking space). Character entities referring to the soft-hyphen character (the <span class="term">&amp;shy;</span>&#160;or <span class="term">\xad</span>&#160;character; hexadecimal code-point <span class="term">ad</span>&#160;[decimal <span class="term">173</span>]) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.<br /> 975&#160; Valid character entities take the form <span class="term">&amp;&#42;;</span>&#160;where <span class="term">&#42;</span>&#160;is <span class="term">#x</span>&#160;followed by a hexadecimal number (hexadecimal numeric entity; like <span class="term">&amp;#xA0;</span>&#160;for non-breaking space), or alphanumeric like <span class="term">gt</span>&#160;(external or named entity; like <span class="term">&amp;nbsp;</span>&#160;for non-breaking space), or <span class="term">#</span>&#160;followed by a number (decimal numeric entity; like <span class="term">&amp;#160;</span>&#160;for non-breaking space). 
Character entities referring to the soft-hyphen character (the <span class="term">&amp;shy;</span>&#160;or <span class="term">\xad</span>&#160;character; hexadecimal code-point <span class="term">ad</span>&#160;[decimal <span class="term">173</span>]) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.<br />
976<br /> 976<br />
977&#160; htmLawed (function <span class="term">hl_ent()</span>):<br /> 977&#160; htmLawed (function <span class="term">hl_ent()</span>):<br />
978<br /> 978<br />
979&#160; * &#160;Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous)<br /> 979&#160; * &#160;Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous)<br />
980<br /> 980<br />
981&#160; * &#160;Lowercases the <span class="term">X</span>&#160;(for XML-compliance) and <span class="term">A-F</span>&#160;of hexadecimal numeric entities<br /> 981&#160; * &#160;Lowercases the <span class="term">X</span>&#160;(for XML-compliance) and <span class="term">A-F</span>&#160;of hexadecimal numeric entities<br />
982<br /> 982<br />
983&#160; * &#160;Neutralizes entities referring to characters that are HTML-invalid (see <a href="#s3.1">section 3.1</a>)<br /> 983&#160; * &#160;Neutralizes entities referring to characters that are HTML-invalid (see <a href="#s3.1">section 3.1</a>)<br />
984<br /> 984<br />
985&#160; * &#160;Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, <span class="term">7f</span>&#160;to <span class="term">84</span>, <span class="term">86</span>&#160;to <span class="term">9f</span>, and <span class="term">fdd0</span>&#160;to <span class="term">fddf</span>, or decimally, <span class="term">127</span>&#160;to <span class="term">132</span>, <span class="term">134</span>&#160;to <span class="term">159</span>, and <span class="term">64976</span>&#160;to <span class="term">64991</span>). Entities referring to the remaining discouraged characters (see <a href="#s5.1">section 5.1</a>&#160;for a full list) are let through.<br /> 985&#160; * &#160;Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, <span class="term">7f</span>&#160;to <span class="term">84</span>, <span class="term">86</span>&#160;to <span class="term">9f</span>, and <span class="term">fdd0</span>&#160;to <span class="term">fddf</span>, or decimally, <span class="term">127</span>&#160;to <span class="term">132</span>, <span class="term">134</span>&#160;to <span class="term">159</span>, and <span class="term">64976</span>&#160;to <span class="term">64991</span>). Entities referring to the remaining discouraged characters (see <a href="#s5.1">section 5.1</a>&#160;for a full list) are let through.<br />
986<br /> 986<br />
987&#160; * &#160;Neutralizes named entities that are not in the specifications<br /> 987&#160; * &#160;Neutralizes named entities that are not in the specifications<br />
988<br /> 988<br />
989&#160; * &#160;Optionally converts valid HTML-specific named entities except <span class="term">&amp;gt;</span>, <span class="term">&amp;lt;</span>, <span class="term">&amp;quot;</span>, and <span class="term">&amp;amp;</span>&#160;to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is <span class="term">2</span>) for generic XML-compliance. For this, <span class="term">$config["named_entity"]</span>&#160;should be <span class="term">1</span>.<br /> 989&#160; * &#160;Optionally converts valid HTML-specific named entities except <span class="term">&amp;gt;</span>, <span class="term">&amp;lt;</span>, <span class="term">&amp;quot;</span>, and <span class="term">&amp;amp;</span>&#160;to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is <span class="term">2</span>) for generic XML-compliance. For this, <span class="term">$config["named_entity"]</span>&#160;should be <span class="term">1</span>.<br />
990<br /> 990<br />
991&#160; * &#160;Optionally converts hexadecimal numeric entities to the more widely supported decimal ones. For this, <span class="term">$config["hexdec_entity"]</span>&#160;should be <span class="term">0</span>.<br /> 991&#160; * &#160;Optionally converts hexadecimal numeric entities to the more widely supported decimal ones. For this, <span class="term">$config["hexdec_entity"]</span>&#160;should be <span class="term">0</span>.<br />
992<br /> 992<br />
993&#160; * &#160;Optionally converts decimal numeric entities to the hexadecimal ones. For this, <span class="term">$config["hexdec_entity"]</span>&#160;should be <span class="term">2</span>.<br /> 993&#160; * &#160;Optionally converts decimal numeric entities to the hexadecimal ones. For this, <span class="term">$config["hexdec_entity"]</span>&#160;should be <span class="term">2</span>.<br />
994<br /> 994<br />
995&#160; <em>Neutralization</em>&#160;refers to the <em>entitification</em>&#160;of <span class="term">&amp;</span>&#160;to <span class="term">&amp;amp;</span>.<br /> 995&#160; <em>Neutralization</em>&#160;refers to the <em>entitification</em>&#160;of <span class="term">&amp;</span>&#160;to <span class="term">&amp;amp;</span>.<br />
996<br /> 996<br />
997&#160; <strong>Note</strong>: htmLawed does not convert entities to the actual characters represented by them; one can pass the htmLawed output through PHP's <span class="term">html_entity_decode</span>&#160;<a href="http://www.php.net/html_entity_decode">function</a>&#160;for that.<br /> 997&#160; <strong>Note</strong>: htmLawed does not convert entities to the actual characters represented by them; one can pass the htmLawed output through PHP's <span class="term">html_entity_decode</span>&#160;<a href="http://www.php.net/html_entity_decode">function</a>&#160;for that.<br />
998<br /> 998<br />
999&#160; <strong>Note</strong>: If <span class="term">$config["and_mark"]</span>&#160;is set, and set to a value other than <span class="term">0</span>, then the <span class="term">&amp;</span>&#160;characters in the original input are replaced with the control character for the hexadecimal code-point <span class="term">6</span>&#160;(<span class="term">\x06</span>; <span class="term">&amp;</span>&#160;characters introduced by htmLawed, e.g., after converting <span class="term">&lt;</span>&#160;to <span class="term">&amp;lt;</span>, are not affected). This allows one to distinguish, say, an <span class="term">&amp;gt;</span>&#160;introduced by htmLawed and an <span class="term">&amp;gt;</span>&#160;put in by the input writer, and can be helpful in further processing of the htmLawed-processed text (e.g., to identify the character sequence <span class="term">o(&gt;&lt;)o</span>&#160;to generate an emoticon image). When this feature is active, admins should ensure that the htmLawed output is not directly used in web pages or XML documents as the presence of the <span class="term">\x06</span>&#160;can break documents. Before use in such documents, and preferably before any storage, any remaining <span class="term">\x06</span>&#160;should be changed back to <span class="term">&amp;</span>, e.g., with:<br /> 999&#160; <strong>Note</strong>: If <span class="term">$config["and_mark"]</span>&#160;is set, and set to a value other than <span class="term">0</span>, then the <span class="term">&amp;</span>&#160;characters in the original input are replaced with the control character for the hexadecimal code-point <span class="term">6</span>&#160;(<span class="term">\x06</span>; <span class="term">&amp;</span>&#160;characters introduced by htmLawed, e.g., after converting <span class="term">&lt;</span>&#160;to <span class="term">&amp;lt;</span>, are not affected). 
This allows one to distinguish, say, an <span class="term">&amp;gt;</span>&#160;introduced by htmLawed and an <span class="term">&amp;gt;</span>&#160;put in by the input writer, and can be helpful in further processing of the htmLawed-processed text (e.g., to identify the character sequence <span class="term">o(&gt;&lt;)o</span>&#160;to generate an emoticon image). When this feature is active, admins should ensure that the htmLawed output is not directly used in web pages or XML documents as the presence of the <span class="term">\x06</span>&#160;can break documents. Before use in such documents, and preferably before any storage, any remaining <span class="term">\x06</span>&#160;should be changed back to <span class="term">&amp;</span>, e.g., with:<br />
1000<br /> 1000<br />
1001 1001
1002<code class="code">&#160; &#160; $final = str_replace("\x06", &#39;&amp;&#39;, $prelim);</code> 1002<code class="code">&#160; &#160; $final = str_replace("\x06", &#39;&amp;&#39;, $prelim);</code>
1003<br /> 1003<br />
1004<br /> 1004<br />
1005&#160; Also, see <a href="#s3.9">section 3.9</a>.<br /> 1005&#160; Also, see <a href="#s3.9">section 3.9</a>.<br />
1006 1006
1007</div> 1007</div>
1008<div class="sub-section"><h3> 1008<div class="sub-section"><h3>
1009<a name="s3.3" id="s3.3"></a><span class="item-no">3.3</span>&#160; HTML elements 1009<a name="s3.3" id="s3.3"></a><span class="item-no">3.3</span>&#160; HTML elements
1010</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1010</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1011<br /> 1011<br />
1012&#160; htmLawed can be configured to allow only certain HTML elements (tags) in the input. Disallowed elements (just tag-content, and not element-content), based on <span class="term">$config["keep_bad"]</span>, are either <em>neutralized</em>&#160;(converted to plain text by entitification of <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>) or removed.<br /> 1012&#160; htmLawed can be configured to allow only certain HTML elements (tags) in the input. Disallowed elements (just tag-content, and not element-content), based on <span class="term">$config["keep_bad"]</span>, are either <em>neutralized</em>&#160;(converted to plain text by entitification of <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>) or removed.<br />
1013<br /> 1013<br />
1014&#160; E.g., with only <span class="term">em</span>&#160;permitted:<br /> 1014&#160; E.g., with only <span class="term">em</span>&#160;permitted:<br />
1015<br /> 1015<br />
1016&#160; Input:<br /> 1016&#160; Input:<br />
1017<br /> 1017<br />
1018 1018
1019<code class="code">&#160; &#160; &#160; &lt;em&gt;My&lt;/em&gt; website is &lt;a href="http&#58;//a.com"&gt;a.com&lt;/a&gt;.</code> 1019<code class="code">&#160; &#160; &#160; &lt;em&gt;My&lt;/em&gt; website is &lt;a href="http&#58;//a.com"&gt;a.com&lt;/a&gt;.</code>
1020<br /> 1020<br />
1021<br /> 1021<br />
1022&#160; Output, with <span class="term">$config["keep_bad"] = 0</span>:<br /> 1022&#160; Output, with <span class="term">$config["keep_bad"] = 0</span>:<br />
1023<br /> 1023<br />
1024 1024
1025<code class="code">&#160; &#160; &#160; &lt;em&gt;My&lt;/em&gt; website is a.com.</code> 1025<code class="code">&#160; &#160; &#160; &lt;em&gt;My&lt;/em&gt; website is a.com.</code>
1026<br /> 1026<br />
1027<br /> 1027<br />
1028&#160; Output, with <span class="term">$config["keep_bad"]</span>&#160;not <span class="term">0</span>:<br /> 1028&#160; Output, with <span class="term">$config["keep_bad"]</span>&#160;not <span class="term">0</span>:<br />
1029<br /> 1029<br />
1030 1030
1031<code class="code">&#160; &#160; &#160; &lt;em&gt;My&lt;/em&gt; website is &amp;lt;a href="http&#58;//a.com"&amp;gt;a.com&amp;lt;/a&amp;gt;.</code> 1031<code class="code">&#160; &#160; &#160; &lt;em&gt;My&lt;/em&gt; website is &amp;lt;a href="http&#58;//a.com"&amp;gt;a.com&amp;lt;/a&amp;gt;.</code>
1032<br /> 1032<br />
1033<br /> 1033<br />
1034&#160; See <a href="#s3.3.3">section 3.3.3</a>&#160;for differences between the various non-zero <span class="term">$config["keep_bad"]</span>&#160;values.<br /> 1034&#160; See <a href="#s3.3.3">section 3.3.3</a>&#160;for differences between the various non-zero <span class="term">$config["keep_bad"]</span>&#160;values.<br />
1035<br /> 1035<br />
1036&#160; htmLawed by default permits these 118 HTML elements:<br /> 1036&#160; htmLawed by default permits these 118 HTML elements:<br />
1037<br /> 1037<br />
1038 1038
1039<code class="code">&#160; &#160; a, abbr, acronym, address, applet, area, article, aside, audio, b, bdi, bdo, big, blockquote, br, button, canvas, caption, center, cite, code, col, colgroup, command, data, datalist, dd, del, details, dfn, dir, div, dl, dt, em, embed, fieldset, figcaption, figure, font, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, i, iframe, img, input, ins, isindex, kbd, keygen, label, legend, li, link, main, map, mark, menu, meta, meter, nav, noscript, object, ol, optgroup, option, output, p, param, pre, progress, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, section, select, small, source, span, strike, strong, style, sub, summary, sup, table, tbody, td, textarea, tfoot, th, thead, time, tr, track, tt, u, ul, var, video, wbr</code> 1039<code class="code">&#160; &#160; a, abbr, acronym, address, applet, area, article, aside, audio, b, bdi, bdo, big, blockquote, br, button, canvas, caption, center, cite, code, col, colgroup, command, data, datalist, dd, del, details, dfn, dir, div, dl, dt, em, embed, fieldset, figcaption, figure, font, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, i, iframe, img, input, ins, isindex, kbd, keygen, label, legend, li, link, main, map, mark, menu, meta, meter, nav, noscript, object, ol, optgroup, option, output, p, param, pre, progress, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, section, select, small, source, span, strike, strong, style, sub, summary, sup, table, tbody, td, textarea, tfoot, th, thead, time, tr, track, tt, u, ul, var, video, wbr</code>
1040<br /> 1040<br />
1041<br /> 1041<br />
1042&#160; The HTML version 4 elements <span class="term">acronym</span>, <span class="term">applet</span>, <span class="term">big</span>, <span class="term">center</span>, <span class="term">dir</span>, <span class="term">font</span>, <span class="term">strike</span>, and <span class="term">tt</span>&#160;are obsolete/deprecated in HTML version 5. On the other hand, the obsolete/deprecated HTML 4 elements <span class="term">embed</span>, <span class="term">menu</span>&#160;and <span class="term">u</span>&#160;are no longer so in HTML 5. Elements new to HTML 5 are <span class="term">article</span>, <span class="term">aside</span>, <span class="term">audio</span>, <span class="term">bdi</span>, <span class="term">canvas</span>, <span class="term">command</span>, <span class="term">data</span>, <span class="term">datalist</span>, <span class="term">details</span>, <span class="term">figure</span>, <span class="term">figcaption</span>, <span class="term">footer</span>, <span class="term">header</span>, <span class="term">hgroup</span>, <span class="term">keygen</span>, <span class="term">link</span>, <span class="term">main</span>, <span class="term">mark</span>, <span class="term">meta</span>, <span class="term">meter</span>, <span class="term">nav</span>, <span class="term">output</span>, <span class="term">progress</span>, <span class="term">section</span>, <span class="term">source</span>, <span class="term">style</span>, <span class="term">summary</span>, <span class="term">time</span>, <span class="term">track</span>, <span class="term">video</span>, and <span class="term">wbr</span>. The <span class="term">link</span>, <span class="term">meta</span>&#160;and <span class="term">style</span>&#160;elements exist in HTML 4 but are not allowed in the HTML body. 
These 16 elements are <em>empty</em>&#160;elements that have an opening tag with possible content but no element content (thus, no closing tag): <span class="term">area</span>, <span class="term">br</span>, <span class="term">col</span>, <span class="term">command</span>, <span class="term">embed</span>, <span class="term">hr</span>, <span class="term">img</span>, <span class="term">input</span>, <span class="term">isindex</span>, <span class="term">keygen</span>, <span class="term">link</span>, <span class="term">meta</span>, <span class="term">param</span>, <span class="term">source</span>, <span class="term">track</span>, and <span class="term">wbr</span>.<br /> 1042&#160; The HTML version 4 elements <span class="term">acronym</span>, <span class="term">applet</span>, <span class="term">big</span>, <span class="term">center</span>, <span class="term">dir</span>, <span class="term">font</span>, <span class="term">strike</span>, and <span class="term">tt</span>&#160;are obsolete/deprecated in HTML version 5. On the other hand, the obsolete/deprecated HTML 4 elements <span class="term">embed</span>, <span class="term">menu</span>&#160;and <span class="term">u</span>&#160;are no longer so in HTML 5. 
Elements new to HTML 5 are <span class="term">article</span>, <span class="term">aside</span>, <span class="term">audio</span>, <span class="term">bdi</span>, <span class="term">canvas</span>, <span class="term">command</span>, <span class="term">data</span>, <span class="term">datalist</span>, <span class="term">details</span>, <span class="term">figure</span>, <span class="term">figcaption</span>, <span class="term">footer</span>, <span class="term">header</span>, <span class="term">hgroup</span>, <span class="term">keygen</span>, <span class="term">link</span>, <span class="term">main</span>, <span class="term">mark</span>, <span class="term">meta</span>, <span class="term">meter</span>, <span class="term">nav</span>, <span class="term">output</span>, <span class="term">progress</span>, <span class="term">section</span>, <span class="term">source</span>, <span class="term">style</span>, <span class="term">summary</span>, <span class="term">time</span>, <span class="term">track</span>, <span class="term">video</span>, and <span class="term">wbr</span>. The <span class="term">link</span>, <span class="term">meta</span>&#160;and <span class="term">style</span>&#160;elements exist in HTML 4 but are not allowed in the HTML body. These 16 elements are <em>empty</em>&#160;elements that have an opening tag with possible content but no element content (thus, no closing tag): <span class="term">area</span>, <span class="term">br</span>, <span class="term">col</span>, <span class="term">command</span>, <span class="term">embed</span>, <span class="term">hr</span>, <span class="term">img</span>, <span class="term">input</span>, <span class="term">isindex</span>, <span class="term">keygen</span>, <span class="term">link</span>, <span class="term">meta</span>, <span class="term">param</span>, <span class="term">source</span>, <span class="term">track</span>, and <span class="term">wbr</span>.<br />
1043<br /> 1043<br />
1044&#160; With <span class="term">$config["safe"] = 1</span>, the default set will exclude <span class="term">applet</span>, <span class="term">audio</span>, <span class="term">canvas</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">object</span>, <span class="term">script</span>&#160;and <span class="term">video</span>; see <a href="#s3.6">section 3.6</a>.<br /> 1044&#160; With <span class="term">$config["safe"] = 1</span>, the default set will exclude <span class="term">applet</span>, <span class="term">audio</span>, <span class="term">canvas</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">object</span>, <span class="term">script</span>&#160;and <span class="term">video</span>; see <a href="#s3.6">section 3.6</a>.<br />
1045<br /> 1045<br />
1046&#160; When <span class="term">$config["elements"]</span>, which specifies allowed elements, is <em>properly</em>&#160;defined, and neither empty nor set to <span class="term">0</span>&#160;or <span class="term">&#42;</span>, the default set is not used. To have elements added to or removed from the default set, a <span class="term">+/-</span>&#160;notation is used. E.g., <span class="term">&#42;-script-object</span>&#160;implies that only <span class="term">script</span>&#160;and <span class="term">object</span>&#160;are disallowed, whereas <span class="term">&#42;+noembed</span>&#160;means that <span class="term">noembed</span>&#160;is also allowed. Elements can also be specified as comma separated names. E.g., <span class="term">a, b, i</span>&#160;means only <span class="term">a</span>, <span class="term">b</span>&#160;and <span class="term">i</span>&#160;are permitted. In this notation, <span class="term">&#42;</span>, <span class="term">+</span>&#160;and <span class="term">-</span>&#160;have no significance and can actually cause a mis-reading.<br /> 1046&#160; When <span class="term">$config["elements"]</span>, which specifies allowed elements, is <em>properly</em>&#160;defined, and neither empty nor set to <span class="term">0</span>&#160;or <span class="term">&#42;</span>, the default set is not used. To have elements added to or removed from the default set, a <span class="term">+/-</span>&#160;notation is used. E.g., <span class="term">&#42;-script-object</span>&#160;implies that only <span class="term">script</span>&#160;and <span class="term">object</span>&#160;are disallowed, whereas <span class="term">&#42;+noembed</span>&#160;means that <span class="term">noembed</span>&#160;is also allowed. Elements can also be specified as comma separated names. E.g., <span class="term">a, b, i</span>&#160;means only <span class="term">a</span>, <span class="term">b</span>&#160;and <span class="term">i</span>&#160;are permitted. 
In this notation, <span class="term">&#42;</span>, <span class="term">+</span>&#160;and <span class="term">-</span>&#160;have no significance and can actually cause a mis-reading.<br />
1047<br /> 1047<br />
1048&#160; Some more examples of <span class="term">$config["elements"]</span>&#160;values indicating permitted elements (note that empty spaces are liberally allowed for clarity):<br /> 1048&#160; Some more examples of <span class="term">$config["elements"]</span>&#160;values indicating permitted elements (note that empty spaces are liberally allowed for clarity):<br />
1049<br /> 1049<br />
1050&#160; * &#160;<span class="term">a, blockquote, code, em, strong</span>&#160;-- only <span class="term">a</span>, <span class="term">blockquote</span>, <span class="term">code</span>, <span class="term">em</span>, and <span class="term">strong</span><br /> 1050&#160; * &#160;<span class="term">a, blockquote, code, em, strong</span>&#160;-- only <span class="term">a</span>, <span class="term">blockquote</span>, <span class="term">code</span>, <span class="term">em</span>, and <span class="term">strong</span><br />
1051&#160; * &#160;<span class="term">&#42;-script</span>&#160;-- all excluding <span class="term">script</span><br /> 1051&#160; * &#160;<span class="term">&#42;-script</span>&#160;-- all excluding <span class="term">script</span><br />
1052&#160; * &#160;<span class="term">&#42; -acronym -big -center -dir -font -isindex -s -strike -tt</span>&#160;-- only non-obsolete/deprecated elements of HTML5<br /> 1052&#160; * &#160;<span class="term">&#42; -acronym -big -center -dir -font -isindex -s -strike -tt</span>&#160;-- only non-obsolete/deprecated elements of HTML5<br />
1053&#160; * &#160;<span class="term">&#42;+noembed-script</span>&#160;-- all including <span class="term">noembed</span>&#160;excluding <span class="term">script</span><br /> 1053&#160; * &#160;<span class="term">&#42;+noembed-script</span>&#160;-- all including <span class="term">noembed</span>&#160;excluding <span class="term">script</span><br />
1054<br /> 1054<br />
1055&#160; Some mis-usages (and the resulting permitted elements) that can be avoided:<br /> 1055&#160; Some mis-usages (and the resulting permitted elements) that can be avoided:<br />
1056<br /> 1056<br />
1057&#160; * &#160;<span class="term">-&#42;</span>&#160;-- none; instead of htmLawed, one might just use, e.g., the <span class="term">htmlspecialchars()</span>&#160;PHP function<br /> 1057&#160; * &#160;<span class="term">-&#42;</span>&#160;-- none; instead of htmLawed, one might just use, e.g., the <span class="term">htmlspecialchars()</span>&#160;PHP function<br />
1058&#160; * &#160;<span class="term">&#42;, -script</span>&#160;-- all except <span class="term">script</span>; admin probably meant <span class="term">&#42;-script</span><br /> 1058&#160; * &#160;<span class="term">&#42;, -script</span>&#160;-- all except <span class="term">script</span>; admin probably meant <span class="term">&#42;-script</span><br />
1059&#160; * &#160;<span class="term">-&#42;, a, em, strong</span>&#160;-- all; admin probably meant <span class="term">a, em, strong</span><br /> 1059&#160; * &#160;<span class="term">-&#42;, a, em, strong</span>&#160;-- all; admin probably meant <span class="term">a, em, strong</span><br />
1060&#160; * &#160;<span class="term">&#42;</span>&#160;-- all; admin need not have set <span class="term">elements</span><br /> 1060&#160; * &#160;<span class="term">&#42;</span>&#160;-- all; admin need not have set <span class="term">elements</span><br />
1061&#160; * &#160;<span class="term">&#42;-form+form</span>&#160;-- all; a <span class="term">+</span>&#160;will always over-ride any <span class="term">-</span><br /> 1061&#160; * &#160;<span class="term">&#42;-form+form</span>&#160;-- all; a <span class="term">+</span>&#160;will always over-ride any <span class="term">-</span><br />
1062&#160; * &#160;<span class="term">&#42;, noembed</span>&#160;-- only <span class="term">noembed</span>; admin probably meant <span class="term">&#42;+noembed</span><br /> 1062&#160; * &#160;<span class="term">&#42;, noembed</span>&#160;-- only <span class="term">noembed</span>; admin probably meant <span class="term">&#42;+noembed</span><br />
1063&#160; * &#160;<span class="term">a, +b, i</span>&#160;-- only <span class="term">a</span>&#160;and <span class="term">i</span>; admin probably meant <span class="term">a, b, i</span><br /> 1063&#160; * &#160;<span class="term">a, +b, i</span>&#160;-- only <span class="term">a</span>&#160;and <span class="term">i</span>; admin probably meant <span class="term">a, b, i</span><br />
1064<br /> 1064<br />
1065&#160; Basically, when using the <span class="term">+/-</span>&#160;notation, commas (<span class="term">,</span>) should not be used, and vice versa, and <span class="term">&#42;</span>&#160;should be used with the former but not the latter.<br /> 1065&#160; Basically, when using the <span class="term">+/-</span>&#160;notation, commas (<span class="term">,</span>) should not be used, and vice versa, and <span class="term">&#42;</span>&#160;should be used with the former but not the latter.<br />
1066<br /> 1066<br />
1067&#160; <strong>Note</strong>: Even if an element that is not in the default set is allowed through <span class="term">$config["elements"]</span>, like <span class="term">noembed</span>&#160;in the examples above, it will eventually be removed during tag balancing unless such balancing is turned off (<span class="term">$config["balance"]</span>&#160;set to <span class="term">0</span>). Currently, the only way around this, which actually is simple, is to edit htmLawed's PHP code which defines various arrays in the function <span class="term">hl_bal()</span>&#160;to accommodate the element and its nesting properties.<br /> 1067&#160; <strong>Note</strong>: Even if an element that is not in the default set is allowed through <span class="term">$config["elements"]</span>, like <span class="term">noembed</span>&#160;in the examples above, it will eventually be removed during tag balancing unless such balancing is turned off (<span class="term">$config["balance"]</span>&#160;set to <span class="term">0</span>). Currently, the only way around this, which actually is simple, is to edit htmLawed's PHP code which defines various arrays in the function <span class="term">hl_bal()</span>&#160;to accommodate the element and its nesting properties.<br />
1068<br /> 1068<br />
1069&#160; A possible second way to specify allowed elements is to set <span class="term">$config["parent"]</span>&#160;to an element name that supposedly will hold the input, and to set <span class="term">$config["balance"]</span>&#160;to <span class="term">1</span>. During tag balancing (see <a href="#s3.3.3">section 3.3.3</a>), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to <span class="term">div</span>&#160;if <span class="term">$config["parent"]</span>&#160;is empty, <span class="term">body</span>, or an element not in htmLawed's default set of 118 elements.<br /> 1069&#160; A possible second way to specify allowed elements is to set <span class="term">$config["parent"]</span>&#160;to an element name that supposedly will hold the input, and to set <span class="term">$config["balance"]</span>&#160;to <span class="term">1</span>. During tag balancing (see <a href="#s3.3.3">section 3.3.3</a>), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to <span class="term">div</span>&#160;if <span class="term">$config["parent"]</span>&#160;is empty, <span class="term">body</span>, or an element not in htmLawed's default set of 118 elements.<br />
1070<br /> 1070<br />
1071&#160; <em>Tag transformation</em>&#160;is possible for improving compliance with HTML standards -- most of the obsolete/deprecated elements of HTML version 5 are converted to valid ones; see <a href="#s3.3.2">section 3.3.2</a>.<br /> 1071&#160; <em>Tag transformation</em>&#160;is possible for improving compliance with HTML standards -- most of the obsolete/deprecated elements of HTML version 5 are converted to valid ones; see <a href="#s3.3.2">section 3.3.2</a>.<br />
1072 1072
1073<div class="sub-sub-section"><h4> 1073<div class="sub-sub-section"><h4>
1074<a name="s3.3.1" id="s3.3.1"></a><span class="item-no">3.3.1</span>&#160; Handling of comments &amp; CDATA sections 1074<a name="s3.3.1" id="s3.3.1"></a><span class="item-no">3.3.1</span>&#160; Handling of comments &amp; CDATA sections
1075</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1075</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1076<br /> 1076<br />
1077&#160; <span class="term">CDATA</span>&#160;sections have the format <span class="term">&lt;![CDATA[...anything but not "]]&gt;"...]]&gt;</span>, and HTML comments, <span class="term">&lt;!--...anything but not "--&gt;"... --&gt;</span>. Neither HTML comments nor <span class="term">CDATA</span>&#160;sections can reside inside tags. HTML comments can exist anywhere else, but <span class="term">CDATA</span>&#160;sections can exist only where plain text is allowed (e.g., immediately inside <span class="term">td</span>&#160;element content but not immediately inside <span class="term">tr</span>&#160;element content).<br /> 1077&#160; <span class="term">CDATA</span>&#160;sections have the format <span class="term">&lt;![CDATA[...anything but not "]]&gt;"...]]&gt;</span>, and HTML comments, <span class="term">&lt;!--...anything but not "--&gt;"... --&gt;</span>. Neither HTML comments nor <span class="term">CDATA</span>&#160;sections can reside inside tags. HTML comments can exist anywhere else, but <span class="term">CDATA</span>&#160;sections can exist only where plain text is allowed (e.g., immediately inside <span class="term">td</span>&#160;element content but not immediately inside <span class="term">tr</span>&#160;element content).<br />
1078<br /> 1078<br />
1079&#160; htmLawed (function <span class="term">hl_cmtcd()</span>) handles HTML comments or <span class="term">CDATA</span>&#160;sections depending on the values of <span class="term">$config["comment"]</span>&#160;or <span class="term">$config["cdata"]</span>. If <span class="term">0</span>, such markup is not looked for and the text is processed like plain text. If <span class="term">1</span>, it is removed completely. If <span class="term">2</span>, it is preserved but any <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;inside are changed to entities. If <span class="term">3</span>&#160;for <span class="term">$config["cdata"]</span>, or <span class="term">3</span>&#160;or <span class="term">4</span>&#160;for <span class="term">$config["comment"]</span>, they are left as such. When <span class="term">$config["comment"]</span>&#160;is set to <span class="term">4</span>, htmLawed will not force a space character before the <span class="term">--&gt;</span>&#160;comment-closing marker. While such a space is required for standard-compliance, it can corrupt marker code put in HTML by some software (such as Microsoft Outlook).<br /> 1079&#160; htmLawed (function <span class="term">hl_cmtcd()</span>) handles HTML comments or <span class="term">CDATA</span>&#160;sections depending on the values of <span class="term">$config["comment"]</span>&#160;or <span class="term">$config["cdata"]</span>. If <span class="term">0</span>, such markup is not looked for and the text is processed like plain text. If <span class="term">1</span>, it is removed completely. If <span class="term">2</span>, it is preserved but any <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;inside are changed to entities. If <span class="term">3</span>&#160;for <span class="term">$config["cdata"]</span>, or <span class="term">3</span>&#160;or <span class="term">4</span>&#160;for <span class="term">$config["comment"]</span>, they are left as such. When <span class="term">$config["comment"]</span>&#160;is set to <span class="term">4</span>, htmLawed will not force a space character before the <span class="term">--&gt;</span>&#160;comment-closing marker. While such a space is required for standard-compliance, it can corrupt marker code put in HTML by some software (such as Microsoft Outlook).<br />
1080<br /> 1080<br />
1081&#160; Note that for the last two cases, HTML comments and <span class="term">CDATA</span>&#160;sections will always be removed from tag content (function <span class="term">hl_tag()</span>).<br /> 1081&#160; Note that for the last two cases, HTML comments and <span class="term">CDATA</span>&#160;sections will always be removed from tag content (function <span class="term">hl_tag()</span>).<br />
1082<br /> 1082<br />
1083&#160; Examples:<br /> 1083&#160; Examples:<br />
1084<br /> 1084<br />
1085&#160; Input:<br /> 1085&#160; Input:<br />
1086 1086
1087<code class="code">&#160; &#160; &lt;!-- home link--&gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;y]]&gt;Home&lt;/a&gt;</code> 1087<code class="code">&#160; &#160; &lt;!-- home link--&gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;y]]&gt;Home&lt;/a&gt;</code>
1088<br /> 1088<br />
1089&#160; Output (<span class="term">$config["comment"] = 0, $config["cdata"] = 2</span>):<br /> 1089&#160; Output (<span class="term">$config["comment"] = 0, $config["cdata"] = 2</span>):<br />
1090 1090
1091<code class="code">&#160; &#160; &amp;lt;!-- home link--&amp;gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;amp;y]]&gt;Home&lt;/a&gt;</code> 1091<code class="code">&#160; &#160; &amp;lt;!-- home link--&amp;gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;amp;y]]&gt;Home&lt;/a&gt;</code>
1092<br /> 1092<br />
1093&#160; Output (<span class="term">$config["comment"] = 1, $config["cdata"] = 2</span>):<br /> 1093&#160; Output (<span class="term">$config["comment"] = 1, $config["cdata"] = 2</span>):<br />
1094 1094
1095<code class="code">&#160; &#160; &lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;amp;y]]&gt;Home&lt;/a&gt;</code> 1095<code class="code">&#160; &#160; &lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;amp;y]]&gt;Home&lt;/a&gt;</code>
1096<br /> 1096<br />
1097&#160; Output (<span class="term">$config["comment"] = 2, $config["cdata"] = 2</span>):<br /> 1097&#160; Output (<span class="term">$config["comment"] = 2, $config["cdata"] = 2</span>):<br />
1098 1098
1099<code class="code">&#160; &#160; &lt;!-- home link --&gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;amp;y]]&gt;Home&lt;/a&gt;</code> 1099<code class="code">&#160; &#160; &lt;!-- home link --&gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;amp;y]]&gt;Home&lt;/a&gt;</code>
1100<br /> 1100<br />
1101&#160; Output (<span class="term">$config["comment"] = 2, $config["cdata"] = 1</span>):<br /> 1101&#160; Output (<span class="term">$config["comment"] = 2, $config["cdata"] = 1</span>):<br />
1102 1102
1103<code class="code">&#160; &#160; &lt;!-- home link --&gt;&lt;a href="home.htm"&gt;Home&lt;/a&gt;</code> 1103<code class="code">&#160; &#160; &lt;!-- home link --&gt;&lt;a href="home.htm"&gt;Home&lt;/a&gt;</code>
1104<br /> 1104<br />
1105&#160; Output (<span class="term">$config["comment"] = 3, $config["cdata"] = 3</span>):<br /> 1105&#160; Output (<span class="term">$config["comment"] = 3, $config["cdata"] = 3</span>):<br />
1106 1106
1107<code class="code">&#160; &#160; &lt;!-- home link --&gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;y]]&gt;Home&lt;/a&gt;</code> 1107<code class="code">&#160; &#160; &lt;!-- home link --&gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;y]]&gt;Home&lt;/a&gt;</code>
1108<br /> 1108<br />
1109&#160; Output (<span class="term">$config["comment"] = 4, $config["cdata"] = 3</span>):<br /> 1109&#160; Output (<span class="term">$config["comment"] = 4, $config["cdata"] = 3</span>):<br />
1110 1110
1111<code class="code">&#160; &#160; &lt;!-- home link--&gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;y]]&gt;Home&lt;/a&gt;</code> 1111<code class="code">&#160; &#160; &lt;!-- home link--&gt;&lt;a href="home.htm"&gt;&lt;![CDATA[x=&amp;y]]&gt;Home&lt;/a&gt;</code>
1112<br /> 1112<br />
1113<br /> 1113<br />
1114&#160; For standard-compliance, comments are given the form <span class="term">&lt;!--comment --&gt;</span>, and any <span class="term">--</span>&#160;in the content is made <span class="term">-</span>. When <span class="term">$config["comment"]</span>&#160;is set to <span class="term">4</span>, htmLawed will not force a space character before the <span class="term">--&gt;</span>&#160;comment-closing marker.<br /> 1114&#160; For standard-compliance, comments are given the form <span class="term">&lt;!--comment --&gt;</span>, and any <span class="term">--</span>&#160;in the content is made <span class="term">-</span>. When <span class="term">$config["comment"]</span>&#160;is set to <span class="term">4</span>, htmLawed will not force a space character before the <span class="term">--&gt;</span>&#160;comment-closing marker.<br />
1115<br /> 1115<br />
1116&#160; When <span class="term">$config["safe"] = 1</span>, CDATA sections and comments are considered plain text unless <span class="term">$config["comment"]</span>&#160;or <span class="term">$config["cdata"]</span>&#160;is explicitly specified; see <a href="#s3.6">section 3.6</a>.<br /> 1116&#160; When <span class="term">$config["safe"] = 1</span>, CDATA sections and comments are considered plain text unless <span class="term">$config["comment"]</span>&#160;or <span class="term">$config["cdata"]</span>&#160;is explicitly specified; see <a href="#s3.6">section 3.6</a>.<br />
1117 1117
1118</div> 1118</div>
1119<div class="sub-sub-section"><h4> 1119<div class="sub-sub-section"><h4>
1120<a name="s3.3.2" id="s3.3.2"></a><span class="item-no">3.3.2</span>&#160; Tag-transformation for better compliance with standards 1120<a name="s3.3.2" id="s3.3.2"></a><span class="item-no">3.3.2</span>&#160; Tag-transformation for better compliance with standards
1121</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1121</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1122<br /> 1122<br />
1123&#160; If <span class="term">$config["make_tag_strict"]</span>&#160;is set and not <span class="term">0</span>, the following deprecated elements (and attributes), as per the HTML 5 specification, even if admin-permitted, are mutated as indicated (element content remains intact; function <span class="term">hl_tag2()</span>):<br /> 1123&#160; If <span class="term">$config["make_tag_strict"]</span>&#160;is set and not <span class="term">0</span>, the following deprecated elements (and attributes), as per the HTML 5 specification, even if admin-permitted, are mutated as indicated (element content remains intact; function <span class="term">hl_tag2()</span>):<br />
1124<br /> 1124<br />
1125&#160; * &#160;acronym - <span class="term">abbr</span><br /> 1125&#160; * &#160;acronym - <span class="term">abbr</span><br />
1126&#160; * &#160;applet - based on <span class="term">$config["make_tag_strict"]</span>, unchanged (<span class="term">1</span>) or removed (<span class="term">2</span>)<br /> 1126&#160; * &#160;applet - based on <span class="term">$config["make_tag_strict"]</span>, unchanged (<span class="term">1</span>) or removed (<span class="term">2</span>)<br />
1127&#160; * &#160;big - <span class="term">span style="font-size&#58; larger;"</span><br /> 1127&#160; * &#160;big - <span class="term">span style="font-size&#58; larger;"</span><br />
1128&#160; * &#160;center - <span class="term">div style="text-align&#58; center;"</span><br /> 1128&#160; * &#160;center - <span class="term">div style="text-align&#58; center;"</span><br />
1129&#160; * &#160;dir - <span class="term">ul</span><br /> 1129&#160; * &#160;dir - <span class="term">ul</span><br />
1130&#160; * &#160;font (face, size, color) - &#160; &#160;<span class="term">span style="font-family&#58; ; font-size&#58; ; color&#58; ;"</span>&#160;(size transformation <a href="http://web.archive.org/web/20180201141931/http://style.cleverchimp.com/font_size_intervals/altintervals.html">reference</a>)<br /> 1130&#160; * &#160;font (face, size, color) - &#160; &#160;<span class="term">span style="font-family&#58; ; font-size&#58; ; color&#58; ;"</span>&#160;(size transformation <a href="http://web.archive.org/web/20180201141931/http://style.cleverchimp.com/font_size_intervals/altintervals.html">reference</a>)<br />
1131&#160; * &#160;isindex - based on <span class="term">$config["make_tag_strict"]</span>, unchanged (<span class="term">1</span>) or removed (<span class="term">2</span>)<br /> 1131&#160; * &#160;isindex - based on <span class="term">$config["make_tag_strict"]</span>, unchanged (<span class="term">1</span>) or removed (<span class="term">2</span>)<br />
1132&#160; * &#160;s - <span class="term">span style="text-decoration&#58; line-through;"</span><br /> 1132&#160; * &#160;s - <span class="term">span style="text-decoration&#58; line-through;"</span><br />
1133&#160; * &#160;strike - <span class="term">span style="text-decoration&#58; line-through;"</span><br /> 1133&#160; * &#160;strike - <span class="term">span style="text-decoration&#58; line-through;"</span><br />
1134&#160; * &#160;tt - <span class="term">code</span><br /> 1134&#160; * &#160;tt - <span class="term">code</span><br />
1135<br /> 1135<br />
1136&#160; For an element with a pre-existing <span class="term">style</span>&#160;attribute value, the extra style properties are appended.<br /> 1136&#160; For an element with a pre-existing <span class="term">style</span>&#160;attribute value, the extra style properties are appended.<br />
1137<br /> 1137<br />
1138&#160; Example input:<br /> 1138&#160; Example input:<br />
1139<br /> 1139<br />
1140 1140
1141<code class="code">&#160; &#160; &lt;center&gt;</code> 1141<code class="code">&#160; &#160; &lt;center&gt;</code>
1142<br /> 1142<br />
1143 1143
1144<code class="code">&#160; &#160; &#160;The PHP &lt;s&gt;software&lt;/s&gt; script used for this &lt;strike&gt;web-page&lt;/strike&gt; web-page is &lt;font style="font-weight&#58; bold " face=arial size=&#39;+3&#39; color &#160; = &#160;"red &#160;"&gt;htmLawedTest.php&lt;/font&gt;, from &lt;u style= &#39;color&#58;green&#39;&gt;PHP Labware&lt;/u&gt;.</code> 1144<code class="code">&#160; &#160; &#160;The PHP &lt;s&gt;software&lt;/s&gt; script used for this &lt;strike&gt;web-page&lt;/strike&gt; web-page is &lt;font style="font-weight&#58; bold " face=arial size=&#39;+3&#39; color &#160; = &#160;"red &#160;"&gt;htmLawedTest.php&lt;/font&gt;, from &lt;u style= &#39;color&#58;green&#39;&gt;PHP Labware&lt;/u&gt;.</code>
1145<br /> 1145<br />
1146 1146
1147<code class="code">&#160; &#160; &lt;/center&gt;</code> 1147<code class="code">&#160; &#160; &lt;/center&gt;</code>
1148<br /> 1148<br />
1149<br /> 1149<br />
1150&#160; The output:<br /> 1150&#160; The output:<br />
1151<br /> 1151<br />
1152 1152
1153<code class="code">&#160; &#160; &lt;div style="text-align&#58; center;"&gt;</code> 1153<code class="code">&#160; &#160; &lt;div style="text-align&#58; center;"&gt;</code>
1154<br /> 1154<br />
1155 1155
1156<code class="code">&#160; &#160; &#160;The PHP &lt;span style="text-decoration&#58; line-through;"&gt;software&lt;/span&gt; script used for this &lt;span style="text-decoration&#58; line-through;"&gt;web-page&lt;/span&gt; web-page is &lt;span style="font-weight&#58; bold; font-size&#58; 200%; color&#58; red; font-family&#58; arial;"&gt;htmLawedTest.php&lt;/span&gt;, from &lt;u style="color&#58;green"&gt;PHP Labware&lt;/u&gt;.</code> 1156<code class="code">&#160; &#160; &#160;The PHP &lt;span style="text-decoration&#58; line-through;"&gt;software&lt;/span&gt; script used for this &lt;span style="text-decoration&#58; line-through;"&gt;web-page&lt;/span&gt; web-page is &lt;span style="font-weight&#58; bold; font-size&#58; 200%; color&#58; red; font-family&#58; arial;"&gt;htmLawedTest.php&lt;/span&gt;, from &lt;u style="color&#58;green"&gt;PHP Labware&lt;/u&gt;.</code>
1157<br /> 1157<br />
1158 1158
1159<code class="code">&#160; &#160; &lt;/div&gt;</code> 1159<code class="code">&#160; &#160; &lt;/div&gt;</code>
1160<br /> 1160<br />
1161 1161
1162</div> 1162</div>
1163<div class="sub-sub-section"><h4> 1163<div class="sub-sub-section"><h4>
1164<a name="s3.3.3" id="s3.3.3"></a><span class="item-no">3.3.3</span>&#160; Tag balancing &amp; proper nesting 1164<a name="s3.3.3" id="s3.3.3"></a><span class="item-no">3.3.3</span>&#160; Tag balancing &amp; proper nesting
1165</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1165</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1166<br /> 1166<br />
1167&#160; If <span class="term">$config["balance"]</span>&#160;is set to <span class="term">1</span>, htmLawed (function <span class="term">hl_bal()</span>) checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them).<br /> 1167&#160; If <span class="term">$config["balance"]</span>&#160;is set to <span class="term">1</span>, htmLawed (function <span class="term">hl_bal()</span>) checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them).<br />
1168<br /> 1168<br />
1169&#160; Depending on the value of <span class="term">$config["keep_bad"]</span>&#160;(see <a href="#s2.2">section 2.2</a>&#160;and <a href="#s3.3">section 3.3</a>), illegal content may be removed or neutralized to plain text by converting &lt; and &gt; to entities:<br /> 1169&#160; Depending on the value of <span class="term">$config["keep_bad"]</span>&#160;(see <a href="#s2.2">section 2.2</a>&#160;and <a href="#s3.3">section 3.3</a>), illegal content may be removed or neutralized to plain text by converting &lt; and &gt; to entities:<br />
1170<br /> 1170<br />
1171&#160; <span class="term">0</span>&#160;- remove; this option is available only to maintain Kses-compatibility and should not be used otherwise (see <a href="#s2.6">section 2.6</a>)<br /> 1171&#160; <span class="term">0</span>&#160;- remove; this option is available only to maintain Kses-compatibility and should not be used otherwise (see <a href="#s2.6">section 2.6</a>)<br />
1172&#160; <span class="term">1</span>&#160;- neutralize tags and keep element content<br /> 1172&#160; <span class="term">1</span>&#160;- neutralize tags and keep element content<br />
1173&#160; <span class="term">2</span>&#160;- remove tags but keep element content<br /> 1173&#160; <span class="term">2</span>&#160;- remove tags but keep element content<br />
1174&#160; <span class="term">3</span>&#160;and <span class="term">4</span>&#160;- like <span class="term">1</span>&#160;and <span class="term">2</span>, but keep element content only if text (<span class="term">pcdata</span>) is valid in parent element as per specs<br /> 1174&#160; <span class="term">3</span>&#160;and <span class="term">4</span>&#160;- like <span class="term">1</span>&#160;and <span class="term">2</span>, but keep element content only if text (<span class="term">pcdata</span>) is valid in parent element as per specs<br />
1175&#160; <span class="term">5</span>&#160;and <span class="term">6</span>&#160;- &#160;like <span class="term">3</span>&#160;and <span class="term">4</span>, but line-breaks, tabs and spaces are left<br /> 1175&#160; <span class="term">5</span>&#160;and <span class="term">6</span>&#160;- &#160;like <span class="term">3</span>&#160;and <span class="term">4</span>, but line-breaks, tabs and spaces are left<br />
1176<br /> 1176<br />
1177&#160; Example input (disallowing the <span class="term">p</span>&#160;element):<br /> 1177&#160; Example input (disallowing the <span class="term">p</span>&#160;element):<br />
1178<br /> 1178<br />
1179 1179
1180<code class="code">&#160; &#160; &lt;&#42;&gt; Pseudo-tags &lt;&#42;&gt;</code> 1180<code class="code">&#160; &#160; &lt;&#42;&gt; Pseudo-tags &lt;&#42;&gt;</code>
1181<br /> 1181<br />
1182 1182
1183<code class="code">&#160; &#160; &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt;</code> 1183<code class="code">&#160; &#160; &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt;</code>
1184<br /> 1184<br />
1185 1185
1186<code class="code">&#160; &#160; &lt;p&gt;</code> 1186<code class="code">&#160; &#160; &lt;p&gt;</code>
1187<br /> 1187<br />
1188 1188
1189<code class="code">&#160; &#160; Disallowed tag p</code> 1189<code class="code">&#160; &#160; Disallowed tag p</code>
1190<br /> 1190<br />
1191 1191
1192<code class="code">&#160; &#160; &lt;/p&gt;</code> 1192<code class="code">&#160; &#160; &lt;/p&gt;</code>
1193<br /> 1193<br />
1194 1194
1195<code class="code">&#160; &#160; &lt;ul&gt;Bad&lt;li&gt;OK&lt;/li&gt;&lt;/ul&gt;</code> 1195<code class="code">&#160; &#160; &lt;ul&gt;Bad&lt;li&gt;OK&lt;/li&gt;&lt;/ul&gt;</code>
1196<br /> 1196<br />
1197<br /> 1197<br />
1198&#160; The output with <span class="term">$config["keep_bad"] = 1</span>:<br /> 1198&#160; The output with <span class="term">$config["keep_bad"] = 1</span>:<br />
1199<br /> 1199<br />
1200 1200
1201<code class="code">&#160; &#160; &amp;lt;&#42;&amp;gt; Pseudo-tags &amp;lt;&#42;&amp;gt;</code> 1201<code class="code">&#160; &#160; &amp;lt;&#42;&amp;gt; Pseudo-tags &amp;lt;&#42;&amp;gt;</code>
1202<br /> 1202<br />
1203 1203
1204<code class="code">&#160; &#160; &amp;lt;xml&amp;gt;Non-HTML tag xml&amp;lt;/xml&amp;gt;</code> 1204<code class="code">&#160; &#160; &amp;lt;xml&amp;gt;Non-HTML tag xml&amp;lt;/xml&amp;gt;</code>
1205<br /> 1205<br />
1206 1206
1207<code class="code">&#160; &#160; &amp;lt;p&amp;gt;</code> 1207<code class="code">&#160; &#160; &amp;lt;p&amp;gt;</code>
1208<br /> 1208<br />
1209 1209
1210<code class="code">&#160; &#160; Disallowed tag p</code> 1210<code class="code">&#160; &#160; Disallowed tag p</code>
1211<br /> 1211<br />
1212 1212
1213<code class="code">&#160; &#160; &amp;lt;/p&amp;gt;</code> 1213<code class="code">&#160; &#160; &amp;lt;/p&amp;gt;</code>
1214<br /> 1214<br />
1215 1215
1216<code class="code">&#160; &#160; &lt;ul&gt;Bad&lt;li&gt;OK&lt;/li&gt;&lt;/ul&gt;</code> 1216<code class="code">&#160; &#160; &lt;ul&gt;Bad&lt;li&gt;OK&lt;/li&gt;&lt;/ul&gt;</code>
1217<br /> 1217<br />
1218<br /> 1218<br />
1219&#160; The output with <span class="term">$config["keep_bad"] = 3</span>:<br /> 1219&#160; The output with <span class="term">$config["keep_bad"] = 3</span>:<br />
1220<br /> 1220<br />
1221 1221
1222<code class="code">&#160; &#160; &amp;lt;&#42;&amp;gt; Pseudo-tags &amp;lt;&#42;&amp;gt;</code> 1222<code class="code">&#160; &#160; &amp;lt;&#42;&amp;gt; Pseudo-tags &amp;lt;&#42;&amp;gt;</code>
1223<br /> 1223<br />
1224 1224
1225<code class="code">&#160; &#160; &amp;lt;xml&amp;gt;Non-HTML tag xml&amp;lt;/xml&amp;gt;</code> 1225<code class="code">&#160; &#160; &amp;lt;xml&amp;gt;Non-HTML tag xml&amp;lt;/xml&amp;gt;</code>
1226<br /> 1226<br />
1227 1227
1228<code class="code">&#160; &#160; &amp;lt;p&amp;gt;</code> 1228<code class="code">&#160; &#160; &amp;lt;p&amp;gt;</code>
1229<br /> 1229<br />
1230 1230
1231<code class="code">&#160; &#160; Disallowed tag p</code> 1231<code class="code">&#160; &#160; Disallowed tag p</code>
1232<br /> 1232<br />
1233 1233
1234<code class="code">&#160; &#160; &amp;lt;/p&amp;gt;</code> 1234<code class="code">&#160; &#160; &amp;lt;/p&amp;gt;</code>
1235<br /> 1235<br />
1236 1236
1237<code class="code">&#160; &#160; &lt;ul&gt;&lt;li&gt;OK&lt;/li&gt;&lt;/ul&gt;</code> 1237<code class="code">&#160; &#160; &lt;ul&gt;&lt;li&gt;OK&lt;/li&gt;&lt;/ul&gt;</code>
1238<br /> 1238<br />
1239<br /> 1239<br />
1240&#160; The output with <span class="term">$config["keep_bad"] = 6</span>:<br /> 1240&#160; The output with <span class="term">$config["keep_bad"] = 6</span>:<br />
1241<br /> 1241<br />
1242 1242
1243<code class="code">&#160; &#160; &amp;lt;&#42;&amp;gt; Pseudo-tags &amp;lt;&#42;&amp;gt;</code> 1243<code class="code">&#160; &#160; &amp;lt;&#42;&amp;gt; Pseudo-tags &amp;lt;&#42;&amp;gt;</code>
1244<br /> 1244<br />
1245 1245
1246<code class="code">&#160; &#160; Non-HTML tag xml</code> 1246<code class="code">&#160; &#160; Non-HTML tag xml</code>
1247<br /> 1247<br />
1248<br /> 1248<br />
1249 1249
1250<code class="code">&#160; &#160; Disallowed tag p</code> 1250<code class="code">&#160; &#160; Disallowed tag p</code>
1251<br /> 1251<br />
1252<br /> 1252<br />
1253 1253
1254<code class="code">&#160; &#160; &lt;ul&gt;&lt;li&gt;OK&lt;/li&gt;&lt;/ul&gt;</code> 1254<code class="code">&#160; &#160; &lt;ul&gt;&lt;li&gt;OK&lt;/li&gt;&lt;/ul&gt;</code>
1255<br /> 1255<br />
1256<br /> 1256<br />
1257&#160; An option like <span class="term">1</span>&#160;is useful, e.g., when a writer previews his submission, whereas one like <span class="term">3</span>&#160;is useful before content is finalized and made available to all.<br /> 1257&#160; An option like <span class="term">1</span>&#160;is useful, e.g., when a writer previews his submission, whereas one like <span class="term">3</span>&#160;is useful before content is finalized and made available to all.<br />
1258<br /> 1258<br />
1259&#160; <strong>Note:</strong>&#160;In the example above, unlike <span class="term">&lt;&#42;&gt;</span>, <span class="term">&lt;xml&gt;</span>&#160;gets considered as a tag (even though there is no HTML element named <span class="term">xml</span>). Thus, the <span class="term">keep_bad</span>&#160;parameter's value affects <span class="term">&lt;xml&gt;</span>&#160;but not <span class="term">&lt;&#42;&gt;</span>. In general, text matching the regular expression pattern <span class="term">&lt;(/?)([a-zA-Z][a-zA-Z1-6]&#42;)([^&gt;]&#42;?)\s?&gt;</span>&#160;is considered a tag (phrase enclosed by the angled brackets <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>, and starting [with an optional slash preceding] with an alphanumeric word that starts with a letter...), and is subjected to the <span class="term">keep_bad</span>&#160;value.<br /> 1259&#160; <strong>Note:</strong>&#160;In the example above, unlike <span class="term">&lt;&#42;&gt;</span>, <span class="term">&lt;xml&gt;</span>&#160;gets considered as a tag (even though there is no HTML element named <span class="term">xml</span>). Thus, the <span class="term">keep_bad</span>&#160;parameter's value affects <span class="term">&lt;xml&gt;</span>&#160;but not <span class="term">&lt;&#42;&gt;</span>. In general, text matching the regular expression pattern <span class="term">&lt;(/?)([a-zA-Z][a-zA-Z1-6]&#42;)([^&gt;]&#42;?)\s?&gt;</span>&#160;is considered a tag (phrase enclosed by the angled brackets <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>, and starting [with an optional slash preceding] with an alphanumeric word that starts with a letter...), and is subjected to the <span class="term">keep_bad</span>&#160;value.<br />
1260<br /> 1260<br />
1261&#160; Nesting/content rules for each of the 118 elements in htmLawed's default set (see <a href="#s3.3">section 3.3</a>) are defined in function <span class="term">hl_bal()</span>. This means that if a non-standard element besides <span class="term">embed</span>&#160;is being permitted through <span class="term">$config["elements"]</span>, the element's tag content will end up getting removed if <span class="term">$config["balance"]</span>&#160;is set to <span class="term">1</span>.<br /> 1261&#160; Nesting/content rules for each of the 118 elements in htmLawed's default set (see <a href="#s3.3">section 3.3</a>) are defined in function <span class="term">hl_bal()</span>. This means that if a non-standard element besides <span class="term">embed</span>&#160;is being permitted through <span class="term">$config["elements"]</span>, the element's tag content will end up getting removed if <span class="term">$config["balance"]</span>&#160;is set to <span class="term">1</span>.<br />
1262<br /> 1262<br />
1263&#160; Plain text and/or certain elements nested inside <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span>&#160;and <span class="term">noscript</span>&#160;need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. E.g., if the parent container is set as <span class="term">form</span>, the input <span class="term">B&#58;&lt;input type="text" value="b" /&gt;C&#58;&lt;input type="text" value="c" /&gt;</span>&#160;is converted to <span class="term">&lt;div&gt;B&#58;&lt;input type="text" value="b" /&gt;C&#58;&lt;input type="text" value="c" /&gt;&lt;/div&gt;</span>.<br /> 1263&#160; Plain text and/or certain elements nested inside <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span>&#160;and <span class="term">noscript</span>&#160;need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. E.g., if the parent container is set as <span class="term">form</span>, the input <span class="term">B&#58;&lt;input type="text" value="b" /&gt;C&#58;&lt;input type="text" value="c" /&gt;</span>&#160;is converted to <span class="term">&lt;div&gt;B&#58;&lt;input type="text" value="b" /&gt;C&#58;&lt;input type="text" value="c" /&gt;&lt;/div&gt;</span>.<br />
1264 1264
1265</div> 1265</div>
1266<div class="sub-sub-section"><h4> 1266<div class="sub-sub-section"><h4>
1267<a name="s3.3.4" id="s3.3.4"></a><span class="item-no">3.3.4</span>&#160; Elements requiring child elements 1267<a name="s3.3.4" id="s3.3.4"></a><span class="item-no">3.3.4</span>&#160; Elements requiring child elements
1268</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1268</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1269<br /> 1269<br />
1270&#160; As per HTML specifications, elements such as those below require legal child elements nested inside them:<br /> 1270&#160; As per HTML specifications, elements such as those below require legal child elements nested inside them:<br />
1271<br /> 1271<br />
1272 1272
1273<code class="code">&#160; &#160; blockquote, dir, dl, form, map, menu, noscript, ol, optgroup, rbc, rtc, ruby, select, table, tbody, tfoot, thead, tr, ul</code> 1273<code class="code">&#160; &#160; blockquote, dir, dl, form, map, menu, noscript, ol, optgroup, rbc, rtc, ruby, select, table, tbody, tfoot, thead, tr, ul</code>
1274<br /> 1274<br />
1275<br /> 1275<br />
1276&#160; In some cases, the specifications stipulate the number and/or the ordering of the child elements. A <span class="term">table</span>&#160;can have 0 or 1 <span class="term">caption</span>, <span class="term">tbody</span>, <span class="term">tfoot</span>, and <span class="term">thead</span>, but they must be in this order: <span class="term">caption</span>, <span class="term">thead</span>, <span class="term">tfoot</span>, <span class="term">tbody</span>.<br /> 1276&#160; In some cases, the specifications stipulate the number and/or the ordering of the child elements. A <span class="term">table</span>&#160;can have 0 or 1 <span class="term">caption</span>, <span class="term">tbody</span>, <span class="term">tfoot</span>, and <span class="term">thead</span>, but they must be in this order: <span class="term">caption</span>, <span class="term">thead</span>, <span class="term">tfoot</span>, <span class="term">tbody</span>.<br />
1277<br /> 1277<br />
1278&#160; htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages.<br /> 1278&#160; htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages.<br />
1279<br /> 1279<br />
1280&#160; With <span class="term">$config["direct_list_nest"]</span>&#160;set to <span class="term">1</span>, htmLawed will allow direct nesting of <span class="term">ol</span>, <span class="term">ul</span>, or <span class="term">menu</span>&#160;list within another <span class="term">ol</span>, <span class="term">ul</span>, or <span class="term">menu</span>&#160;without requiring the child list to be within an <span class="term">li</span>&#160;of the parent list. While this may not be standard-compliant, directly nested lists are rendered properly by almost all browsers. The parameter <span class="term">$config["direct_list_nest"]</span>&#160;has no effect if tag balancing (<a href="#s3.3.3">section 3.3.3</a>) is turned off.<br /> 1280&#160; With <span class="term">$config["direct_list_nest"]</span>&#160;set to <span class="term">1</span>, htmLawed will allow direct nesting of <span class="term">ol</span>, <span class="term">ul</span>, or <span class="term">menu</span>&#160;list within another <span class="term">ol</span>, <span class="term">ul</span>, or <span class="term">menu</span>&#160;without requiring the child list to be within an <span class="term">li</span>&#160;of the parent list. While this may not be standard-compliant, directly nested lists are rendered properly by almost all browsers. The parameter <span class="term">$config["direct_list_nest"]</span>&#160;has no effect if tag balancing (<a href="#s3.3.3">section 3.3.3</a>) is turned off.<br />
1281 1281
1282</div> 1282</div>
1283<div class="sub-sub-section"><h4> 1283<div class="sub-sub-section"><h4>
1284<a name="s3.3.5" id="s3.3.5"></a><span class="item-no">3.3.5</span>&#160; Beautify or compact HTML 1284<a name="s3.3.5" id="s3.3.5"></a><span class="item-no">3.3.5</span>&#160; Beautify or compact HTML
1285</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1285</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1286<br /> 1286<br />
1287&#160; By default, htmLawed will neither <em>beautify</em>&#160;HTML code by formatting it with indentations, etc., nor will it make it compact by removing un-needed white-space. (It does always properly white-space tag content.)<br /> 1287&#160; By default, htmLawed will neither <em>beautify</em>&#160;HTML code by formatting it with indentations, etc., nor will it make it compact by removing un-needed white-space. (It does always properly white-space tag content.)<br />
1288<br /> 1288<br />
1289&#160; As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside <span class="term">pre</span>&#160;elements) are all considered equivalent, and referred to as <em>white-spaces</em>. Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space <em>normalization</em>&#160;allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such <em>pretty</em>&#160;HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome.<br /> 1289&#160; As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside <span class="term">pre</span>&#160;elements) are all considered equivalent, and referred to as <em>white-spaces</em>. Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space <em>normalization</em>&#160;allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such <em>pretty</em>&#160;HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome.<br />
1290<br /> 1290<br />
1291&#160; With the <span class="term">$config</span>&#160;parameter <span class="term">tidy</span>, htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. Besides <span class="term">pre</span>, the <span class="term">script</span>&#160;and <span class="term">textarea</span>&#160;elements, CDATA sections, and HTML comments are not subjected to the tidying process.<br /> 1291&#160; With the <span class="term">$config</span>&#160;parameter <span class="term">tidy</span>, htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. Besides <span class="term">pre</span>, the <span class="term">script</span>&#160;and <span class="term">textarea</span>&#160;elements, CDATA sections, and HTML comments are not subjected to the tidying process.<br />
1292<br /> 1292<br />
1293&#160; To <em>compact</em>, use <span class="term">$config["tidy"] = -1</span>; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed.<br /> 1293&#160; To <em>compact</em>, use <span class="term">$config["tidy"] = -1</span>; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed.<br />
1294<br /> 1294<br />
1295&#160; To <em>beautify</em>, <span class="term">$config["tidy"]</span>&#160;is set as <span class="term">1</span>, or for customized tidying, as a string like <span class="term">2s2n</span>. The <span class="term">s</span>&#160;or <span class="term">t</span>&#160;character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The <span class="term">r</span>&#160;and <span class="term">n</span>&#160;characters are used to specify line-break characters: <span class="term">n</span>&#160;for <span class="term">\n</span>&#160;(Unix/Mac OS X line-breaks), <span class="term">rn</span>&#160;or <span class="term">nr</span>&#160;for <span class="term">\r\n</span>&#160;(Windows/DOS line-breaks), or <span class="term">r</span>&#160;for <span class="term">\r</span>.<br /> 1295&#160; To <em>beautify</em>, <span class="term">$config["tidy"]</span>&#160;is set as <span class="term">1</span>, or for customized tidying, as a string like <span class="term">2s2n</span>. The <span class="term">s</span>&#160;or <span class="term">t</span>&#160;character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The <span class="term">r</span>&#160;and <span class="term">n</span>&#160;characters are used to specify line-break characters: <span class="term">n</span>&#160;for <span class="term">\n</span>&#160;(Unix/Mac OS X line-breaks), <span class="term">rn</span>&#160;or <span class="term">nr</span>&#160;for <span class="term">\r\n</span>&#160;(Windows/DOS line-breaks), or <span class="term">r</span>&#160;for <span class="term">\r</span>.<br />
1296<br /> 1296<br />
1297&#160; The <span class="term">$config["tidy"]</span>&#160;value of <span class="term">1</span>&#160;is equivalent to <span class="term">2s0n</span>. Other <span class="term">$config["tidy"]</span>&#160;values are read loosely: a value of <span class="term">4</span>&#160;is equivalent to <span class="term">4s0n</span>; <span class="term">t2</span>, to <span class="term">1t2n</span>; <span class="term">s</span>, to <span class="term">2s0n</span>; <span class="term">2TR</span>, to <span class="term">2t0r</span>; <span class="term">T1</span>, to <span class="term">1t1n</span>; <span class="term">nr3</span>, to <span class="term">3s0nr</span>, and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification.<br /> 1297&#160; The <span class="term">$config["tidy"]</span>&#160;value of <span class="term">1</span>&#160;is equivalent to <span class="term">2s0n</span>. Other <span class="term">$config["tidy"]</span>&#160;values are read loosely: a value of <span class="term">4</span>&#160;is equivalent to <span class="term">4s0n</span>; <span class="term">t2</span>, to <span class="term">1t2n</span>; <span class="term">s</span>, to <span class="term">2s0n</span>; <span class="term">2TR</span>, to <span class="term">2t0r</span>; <span class="term">T1</span>, to <span class="term">1t1n</span>; <span class="term">nr3</span>, to <span class="term">3s0nr</span>, and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification.<br />
1298<br /> 1298<br />
1299&#160; Input formatting using <span class="term">$config["tidy"]</span>&#160;is not recommended when input text has mixed markup (like HTML + PHP).<br /> 1299&#160; Input formatting using <span class="term">$config["tidy"]</span>&#160;is not recommended when input text has mixed markup (like HTML + PHP).<br />
1300 1300
1301</div> 1301</div>
1302<div class="sub-section"><h3> 1302<div class="sub-section"><h3>
1303<a name="s3.4" id="s3.4"></a><span class="item-no">3.4</span>&#160; Attributes 1303<a name="s3.4" id="s3.4"></a><span class="item-no">3.4</span>&#160; Attributes
1304</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1304</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1305<br /> 1305<br />
1306&#160; In its default setting, htmLawed will only permit attributes described in the HTML specifications (including deprecated ones). A list of the attributes and the elements they are allowed in is in <a href="#s5.2">section 5.2</a>. Using the <span class="term">$spec</span>&#160;argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (<a href="#s2.3">section 2.3</a>).<br /> 1306&#160; In its default setting, htmLawed will only permit attributes described in the HTML specifications (including deprecated ones). A list of the attributes and the elements they are allowed in is in <a href="#s5.2">section 5.2</a>. Using the <span class="term">$spec</span>&#160;argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (<a href="#s2.3">section 2.3</a>).<br />
1307<br /> 1307<br />
1308&#160; Custom <em>data-*</em>&#160;(<em>data-star</em>) attributes, where the first three characters of the value of <em>star</em>&#160;(*) after lower-casing do not equal <span class="term">xml</span>, and the value of <em>star</em>&#160;does not have a colon (:), equal-to (=), newline, solidus (/), space or tab character, or any upper-case A-Z character are allowed in all elements. ARIA, event and microdata attributes like <span class="term">aria-live</span>, <span class="term">onclick</span>&#160;and <span class="term">itemid</span>&#160;are also considered global attributes (<a href="#s5.2">section 5.2</a>).<br /> 1308&#160; Custom <em>data-*</em>&#160;(<em>data-star</em>) attributes, where the first three characters of the value of <em>star</em>&#160;(*) after lower-casing do not equal <span class="term">xml</span>, and the value of <em>star</em>&#160;does not have a colon (:), equal-to (=), newline, solidus (/), space or tab character, or any upper-case A-Z character are allowed in all elements. ARIA, event and microdata attributes like <span class="term">aria-live</span>, <span class="term">onclick</span>&#160;and <span class="term">itemid</span>&#160;are also considered global attributes (<a href="#s5.2">section 5.2</a>).<br />
1309<br /> 1309<br />
1310&#160; When <span class="term">$config["deny_attribute"]</span>&#160;is not set, or set to <span class="term">0</span>, or empty (<span class="term">""</span>), all attributes are permitted. Otherwise, <span class="term">$config["deny_attribute"]</span>&#160;can be set as a list of comma-separated names of the denied attributes. <span class="term">on&#42;</span>&#160;can be used to refer to the group of potentially dangerous, script-accepting event attributes like <span class="term">onblur</span>&#160;and <span class="term">onchange</span>&#160;that have <span class="term">on</span>&#160;at the beginning of their names. Similarly, <span class="term">aria&#42;</span>&#160;and <span class="term">data&#42;</span>&#160;can be used to respectively refer to the set of all ARIA and data-* attributes.<br /> 1310&#160; When <span class="term">$config["deny_attribute"]</span>&#160;is not set, or set to <span class="term">0</span>, or empty (<span class="term">""</span>), all attributes are permitted. Otherwise, <span class="term">$config["deny_attribute"]</span>&#160;can be set as a list of comma-separated names of the denied attributes. <span class="term">on&#42;</span>&#160;can be used to refer to the group of potentially dangerous, script-accepting event attributes like <span class="term">onblur</span>&#160;and <span class="term">onchange</span>&#160;that have <span class="term">on</span>&#160;at the beginning of their names. Similarly, <span class="term">aria&#42;</span>&#160;and <span class="term">data&#42;</span>&#160;can be used to respectively refer to the set of all ARIA and data-* attributes.<br />
1311<br /> 1311<br />
1312&#160; With <span class="term">$config["safe"] = 1</span>&#160;(<a href="#s3.6">section 3.6</a>), the <span class="term">on&#42;</span>&#160;event attributes are automatically disallowed even if a value for <span class="term">$config["deny_attribute"]</span>&#160;has been manually provided.<br /> 1312&#160; With <span class="term">$config["safe"] = 1</span>&#160;(<a href="#s3.6">section 3.6</a>), the <span class="term">on&#42;</span>&#160;event attributes are automatically disallowed even if a value for <span class="term">$config["deny_attribute"]</span>&#160;has been manually provided.<br />
1313<br /> 1313<br />
1314&#160; Note that attributes specified in <span class="term">$config["deny_attribute"]</span>&#160;are denied globally, for all elements. To deny attributes for only specific elements, <span class="term">$spec</span>&#160;(see <a href="#s2.3">section 2.3</a>) can be used. <span class="term">$spec</span>&#160;can also be used to element-specifically permit an attribute otherwise denied through <span class="term">$config["deny_attribute"]</span>.<br /> 1314&#160; Note that attributes specified in <span class="term">$config["deny_attribute"]</span>&#160;are denied globally, for all elements. To deny attributes for only specific elements, <span class="term">$spec</span>&#160;(see <a href="#s2.3">section 2.3</a>) can be used. <span class="term">$spec</span>&#160;can also be used to element-specifically permit an attribute otherwise denied through <span class="term">$config["deny_attribute"]</span>.<br />
1315<br /> 1315<br />
1316&#160; Finer restrictions on attributes can also be put into effect through <span class="term">$config["hook_tag"]</span>&#160;(<a href="#s3.4.9">section 3.4.9</a>).<br /> 1316&#160; Finer restrictions on attributes can also be put into effect through <span class="term">$config["hook_tag"]</span>&#160;(<a href="#s3.4.9">section 3.4.9</a>).<br />
1317<br /> 1317<br />
1318&#160; <strong>Note</strong>: To deny all but a few attributes globally, a simpler way to specify <span class="term">$config["deny_attribute"]</span>&#160;would be to use the notation <span class="term">&#42; -attribute1 -attribute2 ...</span>. Thus, a value of <span class="term">&#42; -title -href</span>&#160;implies that except <span class="term">href</span>&#160;and <span class="term">title</span>&#160;(where allowed as per standards) all other attributes are to be removed. With this notation, the value for the parameter <span class="term">safe</span>&#160;(<a href="#s3.6">section 3.6</a>) will have no effect on <span class="term">deny_attribute</span>. Values of <span class="term">aria&#42;</span>, <span class="term">data&#42;</span>, and <span class="term">on&#42;</span>&#160;cannot be used in this notation to refer to the sets of all ARIA, data-*, and on* attributes respectively.<br /> 1318&#160; <strong>Note</strong>: To deny all but a few attributes globally, a simpler way to specify <span class="term">$config["deny_attribute"]</span>&#160;would be to use the notation <span class="term">&#42; -attribute1 -attribute2 ...</span>. Thus, a value of <span class="term">&#42; -title -href</span>&#160;implies that except <span class="term">href</span>&#160;and <span class="term">title</span>&#160;(where allowed as per standards) all other attributes are to be removed. With this notation, the value for the parameter <span class="term">safe</span>&#160;(<a href="#s3.6">section 3.6</a>) will have no effect on <span class="term">deny_attribute</span>. Values of <span class="term">aria&#42;</span>, <span class="term">data&#42;</span>, and <span class="term">on&#42;</span>&#160;cannot be used in this notation to refer to the sets of all ARIA, data-*, and on* attributes respectively.<br />
1319<br /> 1319<br />
1320&#160; htmLawed (function <span class="term">hl_tag()</span>) also:<br /> 1320&#160; htmLawed (function <span class="term">hl_tag()</span>) also:<br />
1321<br /> 1321<br />
1322&#160; * &#160;Lower-cases attribute names<br /> 1322&#160; * &#160;Lower-cases attribute names<br />
1323&#160; * &#160;Removes duplicate attributes (last one stays)<br /> 1323&#160; * &#160;Removes duplicate attributes (last one stays)<br />
1324&#160; * &#160;Gives attributes the form <span class="term">name="value"</span>&#160;and single-spaces them, removing unnecessary white-spacing<br /> 1324&#160; * &#160;Gives attributes the form <span class="term">name="value"</span>&#160;and single-spaces them, removing unnecessary white-spacing<br />
1325&#160; * &#160;Provides <em>required</em>&#160;attributes (see <a href="#s3.4.1">section 3.4.1</a>)<br /> 1325&#160; * &#160;Provides <em>required</em>&#160;attributes (see <a href="#s3.4.1">section 3.4.1</a>)<br />
1326&#160; * &#160;Double-quotes values and escapes any <span class="term">"</span>&#160;inside them<br /> 1326&#160; * &#160;Double-quotes values and escapes any <span class="term">"</span>&#160;inside them<br />
1327&#160; * &#160;Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point <span class="term">ad</span>) in the values with spaces<br /> 1327&#160; * &#160;Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point <span class="term">ad</span>) in the values with spaces<br />
1328&#160; * &#160;Allows custom function to additionally filter/modify attribute values (see <a href="#s3.4.9">section 3.4.9</a>)<br /> 1328&#160; * &#160;Allows custom function to additionally filter/modify attribute values (see <a href="#s3.4.9">section 3.4.9</a>)<br />
1329 1329
1330<div class="sub-sub-section"><h4> 1330<div class="sub-sub-section"><h4>
1331<a name="s3.4.1" id="s3.4.1"></a><span class="item-no">3.4.1</span>&#160; Auto-addition of XHTML-required attributes 1331<a name="s3.4.1" id="s3.4.1"></a><span class="item-no">3.4.1</span>&#160; Auto-addition of XHTML-required attributes
1332</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1332</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1333<br /> 1333<br />
1334&#160; If indicated attributes for the following elements are found missing, htmLawed (function <span class="term">hl_tag()</span>) will add them (with values same as attribute names unless indicated otherwise below):<br /> 1334&#160; If indicated attributes for the following elements are found missing, htmLawed (function <span class="term">hl_tag()</span>) will add them (with values same as attribute names unless indicated otherwise below):<br />
1335<br /> 1335<br />
1336&#160; * &#160;area - alt (<span class="term">area</span>)<br /> 1336&#160; * &#160;area - alt (<span class="term">area</span>)<br />
1337&#160; * &#160;area, img - src, alt (<span class="term">image</span>)<br /> 1337&#160; * &#160;area, img - src, alt (<span class="term">image</span>)<br />
1338&#160; * &#160;bdo - dir (<span class="term">ltr</span>)<br /> 1338&#160; * &#160;bdo - dir (<span class="term">ltr</span>)<br />
1339&#160; * &#160;form - action<br /> 1339&#160; * &#160;form - action<br />
1340&#160; * &#160;command - label<br /> 1340&#160; * &#160;command - label<br />
1341&#160; * &#160;map - name<br /> 1341&#160; * &#160;map - name<br />
1342&#160; * &#160;optgroup - label<br /> 1342&#160; * &#160;optgroup - label<br />
1343&#160; * &#160;param - name<br /> 1343&#160; * &#160;param - name<br />
1344&#160; * &#160;style - scoped<br /> 1344&#160; * &#160;style - scoped<br />
1345&#160; * &#160;textarea - rows (<span class="term">10</span>), cols (<span class="term">50</span>)<br /> 1345&#160; * &#160;textarea - rows (<span class="term">10</span>), cols (<span class="term">50</span>)<br />
1346<br /> 1346<br />
1347&#160; Additionally, with <span class="term">$config["xml&#58;lang"]</span>&#160;set to <span class="term">1</span>&#160;or <span class="term">2</span>, if the <span class="term">lang</span>&#160;but not the <span class="term">xml&#58;lang</span>&#160;attribute is declared, then the latter is added too, with a value copied from that of <span class="term">lang</span>. This is for better standard-compliance. With <span class="term">$config["xml&#58;lang"]</span>&#160;set to <span class="term">2</span>, the <span class="term">lang</span>&#160;attribute is removed (XHTML specification).<br /> 1347&#160; Additionally, with <span class="term">$config["xml&#58;lang"]</span>&#160;set to <span class="term">1</span>&#160;or <span class="term">2</span>, if the <span class="term">lang</span>&#160;but not the <span class="term">xml&#58;lang</span>&#160;attribute is declared, then the latter is added too, with a value copied from that of <span class="term">lang</span>. This is for better standard-compliance. With <span class="term">$config["xml&#58;lang"]</span>&#160;set to <span class="term">2</span>, the <span class="term">lang</span>&#160;attribute is removed (XHTML specification).<br />
1348<br /> 1348<br />
1349&#160; Note that the <span class="term">name</span>&#160;attribute for <span class="term">map</span>, invalid in XHTML, is also transformed if required -- see <a href="#s3.4.6">section 3.4.6</a>.<br /> 1349&#160; Note that the <span class="term">name</span>&#160;attribute for <span class="term">map</span>, invalid in XHTML, is also transformed if required -- see <a href="#s3.4.6">section 3.4.6</a>.<br />
1350 1350
1351</div> 1351</div>
1352<div class="sub-sub-section"><h4> 1352<div class="sub-sub-section"><h4>
1353<a name="s3.4.2" id="s3.4.2"></a><span class="item-no">3.4.2</span>&#160; Duplicate/invalid <span class="term">id</span>&#160;values 1353<a name="s3.4.2" id="s3.4.2"></a><span class="item-no">3.4.2</span>&#160; Duplicate/invalid <span class="term">id</span>&#160;values
1354</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1354</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1355<br /> 1355<br />
1356&#160; If <span class="term">$config["unique_ids"]</span>&#160;is <span class="term">1</span>, htmLawed (function <span class="term">hl_tag()</span>) removes <span class="term">id</span>&#160;attributes with values that are not standards-compliant (must not have a space character) or duplicate. If <span class="term">$config["unique_ids"]</span>&#160;is a word (without a non-word character like space), any duplicate but otherwise valid value will be appropriately prefixed with the word to ensure its uniqueness.<br /> 1356&#160; If <span class="term">$config["unique_ids"]</span>&#160;is <span class="term">1</span>, htmLawed (function <span class="term">hl_tag()</span>) removes <span class="term">id</span>&#160;attributes with values that are not standards-compliant (must not have a space character) or duplicate. If <span class="term">$config["unique_ids"]</span>&#160;is a word (without a non-word character like space), any duplicate but otherwise valid value will be appropriately prefixed with the word to ensure its uniqueness.<br />
1357<br /> 1357<br />
1358&#160; Even if multiple inputs need to be filtered (through multiple calls to htmLawed), htmLawed ensures uniqueness of <span class="term">id</span>&#160;values as it uses a global variable (<span class="term">$GLOBALS["hl_Ids"]</span>&#160;array). Further, an admin can restrict the use of certain <span class="term">id</span>&#160;values by presetting this variable before htmLawed is called into use. E.g.:<br /> 1358&#160; Even if multiple inputs need to be filtered (through multiple calls to htmLawed), htmLawed ensures uniqueness of <span class="term">id</span>&#160;values as it uses a global variable (<span class="term">$GLOBALS["hl_Ids"]</span>&#160;array). Further, an admin can restrict the use of certain <span class="term">id</span>&#160;values by presetting this variable before htmLawed is called into use. E.g.:<br />
1359<br /> 1359<br />
1360 1360
1361<code class="code">&#160; &#160; $GLOBALS[&#39;hl_Ids&#39;] = array(&#39;top&#39;=&gt;1, &#39;bottom&#39;=&gt;1, &#39;myform&#39;=&gt;1); // id values not allowed in input</code> 1361<code class="code">&#160; &#160; $GLOBALS[&#39;hl_Ids&#39;] = array(&#39;top&#39;=&gt;1, &#39;bottom&#39;=&gt;1, &#39;myform&#39;=&gt;1); // id values not allowed in input</code>
1362<br /> 1362<br />
1363 1363
1364<code class="code">&#160; &#160; $processed = htmLawed($text); // filter input</code> 1364<code class="code">&#160; &#160; $processed = htmLawed($text); // filter input</code>
1365<br /> 1365<br />
1366 1366
1367</div> 1367</div>
1368<div class="sub-sub-section"><h4> 1368<div class="sub-sub-section"><h4>
1369<a name="s3.4.3" id="s3.4.3"></a><span class="item-no">3.4.3</span>&#160; URL schemes &amp; scripts in attribute values 1369<a name="s3.4.3" id="s3.4.3"></a><span class="item-no">3.4.3</span>&#160; URL schemes &amp; scripts in attribute values
1370</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1370</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1371<br /> 1371<br />
1372&#160; htmLawed edits attributes that take URLs as values if they are found to contain un-permitted schemes. E.g., if the <span class="term">afp</span>&#160;scheme is not permitted, then <span class="term">&lt;a href="afp&#58;//domain.org"&gt;</span>&#160;becomes <span class="term">&lt;a href="denied&#58;afp&#58;//domain.org"&gt;</span>, and if Javascript is not permitted <span class="term">&lt;a onclick="javascript&#58;xss();"&gt;</span>&#160;becomes <span class="term">&lt;a onclick="denied&#58;javascript&#58;xss();"&gt;</span>.<br /> 1372&#160; htmLawed edits attributes that take URLs as values if they are found to contain un-permitted schemes. E.g., if the <span class="term">afp</span>&#160;scheme is not permitted, then <span class="term">&lt;a href="afp&#58;//domain.org"&gt;</span>&#160;becomes <span class="term">&lt;a href="denied&#58;afp&#58;//domain.org"&gt;</span>, and if Javascript is not permitted <span class="term">&lt;a onclick="javascript&#58;xss();"&gt;</span>&#160;becomes <span class="term">&lt;a onclick="denied&#58;javascript&#58;xss();"&gt;</span>.<br />
1373<br /> 1373<br />
1374&#160; By default htmLawed permits these schemes in URLs for the <span class="term">href</span>&#160;attribute:<br /> 1374&#160; By default htmLawed permits these schemes in URLs for the <span class="term">href</span>&#160;attribute:<br />
1375<br /> 1375<br />
1376 1376
1377<code class="code">&#160; &#160; aim, app, feed, file, ftp, gopher, http, https, javascript, irc, mailto, news, nntp, sftp, ssh, tel, telnet</code> 1377<code class="code">&#160; &#160; aim, app, feed, file, ftp, gopher, http, https, javascript, irc, mailto, news, nntp, sftp, ssh, tel, telnet</code>
1378<br /> 1378<br />
1379<br /> 1379<br />
1380&#160; Also, only <span class="term">data</span>, <span class="term">file</span>, <span class="term">http</span>, <span class="term">https</span>&#160;and <span class="term">javascript</span>&#160;are permitted in these attributes that accept URLs:<br /> 1380&#160; Also, only <span class="term">data</span>, <span class="term">file</span>, <span class="term">http</span>, <span class="term">https</span>&#160;and <span class="term">javascript</span>&#160;are permitted in these attributes that accept URLs:<br />
1381<br /> 1381<br />
1382 1382
1383<code class="code">&#160; &#160; action, cite, classid, codebase, data, itemtype, longdesc, model, pluginspage, pluginurl, src, srcset, style, usemap, and event attributes like onclick</code> 1383<code class="code">&#160; &#160; action, cite, classid, codebase, data, itemtype, longdesc, model, pluginspage, pluginurl, src, srcset, style, usemap, and event attributes like onclick</code>
1384<br /> 1384<br />
1385<br /> 1385<br />
1386&#160; With <span class="term">$config["safe"] = 1</span>&#160;(<a href="#s3.6">section 3.6</a>), the above is changed to disallow <span class="term">app</span>, <span class="term">data</span>&#160;and <span class="term">javascript</span>.<br /> 1386&#160; With <span class="term">$config["safe"] = 1</span>&#160;(<a href="#s3.6">section 3.6</a>), the above is changed to disallow <span class="term">app</span>, <span class="term">data</span>&#160;and <span class="term">javascript</span>.<br />
1387<br /> 1387<br />
1388&#160; These default sets are used when <span class="term">$config["schemes"]</span>&#160;is not set (see <a href="#s2.2">section 2.2</a>). To over-ride the defaults, <span class="term">$config["schemes"]</span>&#160;is defined as a string of semi-colon-separated sub-strings of type <span class="term">attribute&#58; comma-separated schemes</span>. E.g., <span class="term">href&#58; mailto, http, https; onclick&#58; javascript; src&#58; http, https</span>. For unspecified attributes, <span class="term">data</span>, <span class="term">file</span>, <span class="term">http</span>, <span class="term">https</span>&#160;and <span class="term">javascript</span>&#160;are permitted. This can be changed by passing schemes for <span class="term">&#42;</span>&#160;in <span class="term">$config["schemes"]</span>. E.g., <span class="term">href&#58; mailto, http, https; &#42;&#58; http, https</span>.<br /> 1388&#160; These default sets are used when <span class="term">$config["schemes"]</span>&#160;is not set (see <a href="#s2.2">section 2.2</a>). To over-ride the defaults, <span class="term">$config["schemes"]</span>&#160;is defined as a string of semi-colon-separated sub-strings of type <span class="term">attribute&#58; comma-separated schemes</span>. E.g., <span class="term">href&#58; mailto, http, https; onclick&#58; javascript; src&#58; http, https</span>. For unspecified attributes, <span class="term">data</span>, <span class="term">file</span>, <span class="term">http</span>, <span class="term">https</span>&#160;and <span class="term">javascript</span>&#160;are permitted. This can be changed by passing schemes for <span class="term">&#42;</span>&#160;in <span class="term">$config["schemes"]</span>. E.g., <span class="term">href&#58; mailto, http, https; &#42;&#58; http, https</span>.<br />
1389<br /> 1389<br />
1390&#160; <span class="term">&#42;</span>&#160;(asterisk) can be put in the list of schemes to permit all protocols. E.g., <span class="term">style&#58; &#42;; img&#58; http, https</span>&#160;results in protocols not being checked in <span class="term">style</span>&#160;attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (<a href="#s3.4.4">section 3.4.4</a>) is not done. When an attribute is explicitly listed in <span class="term">$config["schemes"]</span>, then filtering is dictated by the setting for the attribute, with no effect of the setting for asterisk. That is, the set of attributes that asterisk refers to no longer includes the listed attribute.<br /> 1390&#160; <span class="term">&#42;</span>&#160;(asterisk) can be put in the list of schemes to permit all protocols. E.g., <span class="term">style&#58; &#42;; img&#58; http, https</span>&#160;results in protocols not being checked in <span class="term">style</span>&#160;attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (<a href="#s3.4.4">section 3.4.4</a>) is not done. When an attribute is explicitly listed in <span class="term">$config["schemes"]</span>, then filtering is dictated by the setting for the attribute, with no effect of the setting for asterisk. That is, the set of attributes that asterisk refers to no longer includes the listed attribute.<br />
1391<br /> 1391<br />
1392&#160; Thus, <em>to allow the xmpp scheme</em>, one can set <span class="term">$config["schemes"]</span>&#160;as <span class="term">href&#58; mailto, http, https; &#42;&#58; http, https, xmpp</span>, or <span class="term">href&#58; mailto, http, https, xmpp; &#42;&#58; http, https, xmpp</span>, or <span class="term">&#42;&#58; &#42;</span>, and so on. The consequence of each of these example values will be different (e.g., only the last two but not the first will allow <span class="term">xmpp</span>&#160;in <span class="term">href</span>).<br /> 1392&#160; Thus, <em>to allow the xmpp scheme</em>, one can set <span class="term">$config["schemes"]</span>&#160;as <span class="term">href&#58; mailto, http, https; &#42;&#58; http, https, xmpp</span>, or <span class="term">href&#58; mailto, http, https, xmpp; &#42;&#58; http, https, xmpp</span>, or <span class="term">&#42;&#58; &#42;</span>, and so on. The consequence of each of these example values will be different (e.g., only the last two but not the first will allow <span class="term">xmpp</span>&#160;in <span class="term">href</span>).<br />
1393<br /> 1393<br />
1394&#160; As a side-note, one may find <span class="term">style&#58; &#42;</span>&#160;useful as URLs in <span class="term">style</span>&#160;attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text.<br /> 1394&#160; As a side-note, one may find <span class="term">style&#58; &#42;</span>&#160;useful as URLs in <span class="term">style</span>&#160;attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text.<br />
1395<br /> 1395<br />
1396&#160; <span class="term">!</span>&#160;can be put in the list of schemes to disallow all protocols as well as <em>local</em>&#160;URLs. Thus, with <span class="term">href&#58; http; style&#58; !</span>, <span class="term">&lt;a href="http&#58;//cnn.com" style="background-image&#58; url(local.jpg);"&gt;CNN&lt;/a&gt;</span>&#160;will become <span class="term">&lt;a href="http&#58;//cnn.com" style="background-image&#58; url(denied&#58;local.jpg);"&gt;CNN&lt;/a&gt;</span>.<br /> 1396&#160; <span class="term">!</span>&#160;can be put in the list of schemes to disallow all protocols as well as <em>local</em>&#160;URLs. Thus, with <span class="term">href&#58; http; style&#58; !</span>, <span class="term">&lt;a href="http&#58;//cnn.com" style="background-image&#58; url(local.jpg);"&gt;CNN&lt;/a&gt;</span>&#160;will become <span class="term">&lt;a href="http&#58;//cnn.com" style="background-image&#58; url(denied&#58;local.jpg);"&gt;CNN&lt;/a&gt;</span>.<br />
1397<br /> 1397<br />
1398&#160; <strong>Note</strong>: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string <span class="term">src</span>&#160;(e.g., <span class="term">dynsrc</span>) or starts with <span class="term">o</span>&#160;(e.g., <span class="term">onbeforecopy</span>).<br /> 1398&#160; <strong>Note</strong>: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string <span class="term">src</span>&#160;(e.g., <span class="term">dynsrc</span>) or starts with <span class="term">o</span>&#160;(e.g., <span class="term">onbeforecopy</span>).<br />
1399<br /> 1399<br />
1400&#160; With <span class="term">$config["safe"] = 1</span>, all URLs are disallowed in the <span class="term">style</span>&#160;attribute values.<br /> 1400&#160; With <span class="term">$config["safe"] = 1</span>, all URLs are disallowed in the <span class="term">style</span>&#160;attribute values.<br />
1401 1401
1402</div> 1402</div>
1403<div class="sub-sub-section"><h4> 1403<div class="sub-sub-section"><h4>
1404<a name="s3.4.4" id="s3.4.4"></a><span class="item-no">3.4.4</span>&#160; Absolute &amp; relative URLs in attribute values 1404<a name="s3.4.4" id="s3.4.4"></a><span class="item-no">3.4.4</span>&#160; Absolute &amp; relative URLs in attribute values
1405</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1405</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1406<br /> 1406<br />
1407&#160; htmLawed can make absolute URLs in attributes like <span class="term">href</span>&#160;relative (<span class="term">$config["abs_url"]</span>&#160;is <span class="term">-1</span>), and vice versa (<span class="term">$config["abs_url"]</span>&#160;is <span class="term">1</span>). URLs in scripts are not considered for this, and neither are URLs like <span class="term">#section_6</span>&#160;(fragment), <span class="term">?name=Tim#show</span>&#160;(starting with query string), and <span class="term">;var=1?name=Tim#show</span>&#160;(starting with parameters). Further, this requires that <span class="term">$config["base_url"]</span>&#160;be set properly, with the <span class="term">&#58;//</span>&#160;and a trailing slash (<span class="term">/</span>), with no query string, etc. E.g., <span class="term">file&#58;///D&#58;/page/</span>, <span class="term">https&#58;//abc.com/x/y/</span>, or <span class="term">http&#58;//localhost/demo/</span>&#160;are okay, but <span class="term">file&#58;///D&#58;/page/?help=1</span>, <span class="term">abc.com/x/y/</span>&#160;and <span class="term">http&#58;//localhost/demo/index.htm</span>&#160;are not.<br /> 1407&#160; htmLawed can make absolute URLs in attributes like <span class="term">href</span>&#160;relative (<span class="term">$config["abs_url"]</span>&#160;is <span class="term">-1</span>), and vice versa (<span class="term">$config["abs_url"]</span>&#160;is <span class="term">1</span>). URLs in scripts are not considered for this, and neither are URLs like <span class="term">#section_6</span>&#160;(fragment), <span class="term">?name=Tim#show</span>&#160;(starting with query string), and <span class="term">;var=1?name=Tim#show</span>&#160;(starting with parameters). Further, this requires that <span class="term">$config["base_url"]</span>&#160;be set properly, with the <span class="term">&#58;//</span>&#160;and a trailing slash (<span class="term">/</span>), with no query string, etc. 
E.g., <span class="term">file&#58;///D&#58;/page/</span>, <span class="term">https&#58;//abc.com/x/y/</span>, or <span class="term">http&#58;//localhost/demo/</span>&#160;are okay, but <span class="term">file&#58;///D&#58;/page/?help=1</span>, <span class="term">abc.com/x/y/</span>&#160;and <span class="term">http&#58;//localhost/demo/index.htm</span>&#160;are not.<br />
1408<br /> 1408<br />
1409&#160; For making absolute URLs relative, only those URLs that have the <span class="term">$config["base_url"]</span>&#160;string at the beginning are converted. E.g., with <span class="term">$config["base_url"] = "https&#58;//abc.com/x/y/"</span>, <span class="term">https&#58;//abc.com/x/y/a.gif</span>&#160;and <span class="term">https&#58;//abc.com/x/y/z/b.gif</span>&#160;become <span class="term">a.gif</span>&#160;and <span class="term">z/b.gif</span>&#160;respectively, while <span class="term">https&#58;//abc.com/x/c.gif</span>&#160;is not changed.<br /> 1409&#160; For making absolute URLs relative, only those URLs that have the <span class="term">$config["base_url"]</span>&#160;string at the beginning are converted. E.g., with <span class="term">$config["base_url"] = "https&#58;//abc.com/x/y/"</span>, <span class="term">https&#58;//abc.com/x/y/a.gif</span>&#160;and <span class="term">https&#58;//abc.com/x/y/z/b.gif</span>&#160;become <span class="term">a.gif</span>&#160;and <span class="term">z/b.gif</span>&#160;respectively, while <span class="term">https&#58;//abc.com/x/c.gif</span>&#160;is not changed.<br />
1410<br /> 1410<br />
1411&#160; When making relative URLs absolute, only the scheme, network location (host-name) and path values of the base URL are inherited. See <a href="#s5.5">section 5.5</a>&#160;for more about the URL specification as per RFC <a href="http://www.ietf.org/rfc/rfc1808.txt">1808</a>.<br /> 1411&#160; When making relative URLs absolute, only the scheme, network location (host-name) and path values of the base URL are inherited. See <a href="#s5.5">section 5.5</a>&#160;for more about the URL specification as per RFC <a href="http://www.ietf.org/rfc/rfc1808.txt">1808</a>.<br />
1412 1412
1413</div> 1413</div>
1414<div class="sub-sub-section"><h4> 1414<div class="sub-sub-section"><h4>
1415<a name="s3.4.5" id="s3.4.5"></a><span class="item-no">3.4.5</span>&#160; Lower-cased, standard attribute values 1415<a name="s3.4.5" id="s3.4.5"></a><span class="item-no">3.4.5</span>&#160; Lower-cased, standard attribute values
1416</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1416</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1417<br /> 1417<br />
1418&#160; Optionally, for standard-compliance, htmLawed (function <span class="term">hl_tag()</span>) lower-cases standard attribute values to give, e.g., <span class="term">input type="password"</span>&#160;instead of <span class="term">input type="Password"</span>, if <span class="term">$config["lc_std_val"]</span>&#160;is <span class="term">1</span>. Attribute values matching those listed below for any of the elements listed further below (plus those for the <span class="term">type</span>&#160;attribute of <span class="term">button</span>&#160;or <span class="term">input</span>) are lower-cased:<br /> 1418&#160; Optionally, for standard-compliance, htmLawed (function <span class="term">hl_tag()</span>) lower-cases standard attribute values to give, e.g., <span class="term">input type="password"</span>&#160;instead of <span class="term">input type="Password"</span>, if <span class="term">$config["lc_std_val"]</span>&#160;is <span class="term">1</span>. Attribute values matching those listed below for any of the elements listed further below (plus those for the <span class="term">type</span>&#160;attribute of <span class="term">button</span>&#160;or <span class="term">input</span>) are lower-cased:<br />
1419<br /> 1419<br />
1420 1420
1421<code class="code">&#160; &#160; all, auto, baseline, bottom, button, captions, center, chapters, char, checkbox, circle, col, colgroup, color, cols, data, date, datetime, datetime-local, default, descriptions, email, file, get, groups, hidden, image, justify, left, ltr, metadata, middle, month, none, number, object, password, poly, post, preserve, radio, range, rect, ref, reset, right, row, rowgroup, rows, rtl, search, submit, subtitles, tel, text, time, top, url, week</code> 1421<code class="code">&#160; &#160; all, auto, baseline, bottom, button, captions, center, chapters, char, checkbox, circle, col, colgroup, color, cols, data, date, datetime, datetime-local, default, descriptions, email, file, get, groups, hidden, image, justify, left, ltr, metadata, middle, month, none, number, object, password, poly, post, preserve, radio, range, rect, ref, reset, right, row, rowgroup, rows, rtl, search, submit, subtitles, tel, text, time, top, url, week</code>
1422<br /> 1422<br />
1423<br /> 1423<br />
1424 1424
1425<code class="code">&#160; &#160; a, area, bdo, button, col, fieldset, form, img, input, object, ol, optgroup, option, param, script, select, table, td, textarea, tfoot, th, thead, tr, track, xml&#58;space</code> 1425<code class="code">&#160; &#160; a, area, bdo, button, col, fieldset, form, img, input, object, ol, optgroup, option, param, script, select, table, td, textarea, tfoot, th, thead, tr, track, xml&#58;space</code>
1426<br /> 1426<br />
1427<br /> 1427<br />
1428&#160; The following <em>empty</em>&#160;(<em>minimized</em>) attributes are always assigned lower-cased values (same as the attribute names):<br /> 1428&#160; The following <em>empty</em>&#160;(<em>minimized</em>) attributes are always assigned lower-cased values (same as the attribute names):<br />
1429<br /> 1429<br />
1430 1430
1431<code class="code">&#160; &#160; checkbox, checked, command, compact, declare, defer, default, disabled, hidden, inert, ismap, itemscope, multiple, nohref, noresize, noshade, nowrap, open, radio, readonly, required, reversed, selected</code> 1431<code class="code">&#160; &#160; checkbox, checked, command, compact, declare, defer, default, disabled, hidden, inert, ismap, itemscope, multiple, nohref, noresize, noshade, nowrap, open, radio, readonly, required, reversed, selected</code>
1432<br /> 1432<br />
1433 1433
1434</div> 1434</div>
1435<div class="sub-sub-section"><h4> 1435<div class="sub-sub-section"><h4>
1436<a name="s3.4.6" id="s3.4.6"></a><span class="item-no">3.4.6</span>&#160; Transformation of deprecated attributes 1436<a name="s3.4.6" id="s3.4.6"></a><span class="item-no">3.4.6</span>&#160; Transformation of deprecated attributes
1437</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1437</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1438<br /> 1438<br />
1439&#160; If <span class="term">$config["no_deprecated_attr"]</span>&#160;is not <span class="term">0</span>, then deprecated attributes are removed and, in most cases, their values are transformed to CSS style properties and added to the <span class="term">style</span>&#160;attributes (function <span class="term">hl_tag()</span>). Except for <span class="term">bordercolor</span>&#160;for <span class="term">table</span>, <span class="term">tr</span>&#160;and <span class="term">td</span>, the scores of proprietary attributes that were never part of any cross-browser standard are not supported in this functionality.<br /> 1439&#160; If <span class="term">$config["no_deprecated_attr"]</span>&#160;is not <span class="term">0</span>, then deprecated attributes are removed and, in most cases, their values are transformed to CSS style properties and added to the <span class="term">style</span>&#160;attributes (function <span class="term">hl_tag()</span>). Except for <span class="term">bordercolor</span>&#160;for <span class="term">table</span>, <span class="term">tr</span>&#160;and <span class="term">td</span>, the scores of proprietary attributes that were never part of any cross-browser standard are not supported in this functionality.<br />
1440<br /> 1440<br />
1441&#160; * &#160;align in caption, div, h1, h2, h3, h4, h5, h6, hr, img, input, legend, object, p, table - for <span class="term">img</span>&#160;with value of <span class="term">left</span>&#160;or <span class="term">right</span>, becomes, e.g., <span class="term">float&#58; left</span>; for <span class="term">div</span>&#160;and <span class="term">table</span>&#160;with value <span class="term">center</span>, becomes <span class="term">margin&#58; auto</span>; all others become, e.g., <span class="term">text-align&#58; right</span><br /> 1441&#160; * &#160;align in caption, div, h1, h2, h3, h4, h5, h6, hr, img, input, legend, object, p, table - for <span class="term">img</span>&#160;with value of <span class="term">left</span>&#160;or <span class="term">right</span>, becomes, e.g., <span class="term">float&#58; left</span>; for <span class="term">div</span>&#160;and <span class="term">table</span>&#160;with value <span class="term">center</span>, becomes <span class="term">margin&#58; auto</span>; all others become, e.g., <span class="term">text-align&#58; right</span><br />
1442&#160; * &#160;bgcolor in table, td, th and tr - E.g., <span class="term">bgcolor="#ffffff"</span>&#160;becomes <span class="term">background-color&#58; #ffffff</span><br /> 1442&#160; * &#160;bgcolor in table, td, th and tr - E.g., <span class="term">bgcolor="#ffffff"</span>&#160;becomes <span class="term">background-color&#58; #ffffff</span><br />
1443&#160; * &#160;border in img and object - E.g., <span class="term">border="10"</span>&#160;becomes <span class="term">border&#58; 10px</span><br /> 1443&#160; * &#160;border in img and object - E.g., <span class="term">border="10"</span>&#160;becomes <span class="term">border&#58; 10px</span><br />
1444&#160; * &#160;bordercolor in table, td and tr - E.g., <span class="term">bordercolor=#999999</span>&#160;becomes <span class="term">border-color&#58; #999999;</span><br /> 1444&#160; * &#160;bordercolor in table, td and tr - E.g., <span class="term">bordercolor=#999999</span>&#160;becomes <span class="term">border-color&#58; #999999;</span><br />
1445&#160; * &#160;compact in dl, ol and ul - <span class="term">font-size&#58; 85%</span><br /> 1445&#160; * &#160;compact in dl, ol and ul - <span class="term">font-size&#58; 85%</span><br />
1446&#160; * &#160;cellspacing in table - <span class="term">cellspacing="10"</span>&#160;becomes <span class="term">border-spacing&#58; 10px</span><br /> 1446&#160; * &#160;cellspacing in table - <span class="term">cellspacing="10"</span>&#160;becomes <span class="term">border-spacing&#58; 10px</span><br />
1447&#160; * &#160;clear in br - E.g., <span class="term">clear="all"</span>&#160;becomes <span class="term">clear&#58; both</span><br /> 1447&#160; * &#160;clear in br - E.g., <span class="term">clear="all"</span>&#160;becomes <span class="term">clear&#58; both</span><br />
1448&#160; * &#160;height in td and th - E.g., <span class="term">height="10"</span>&#160;becomes <span class="term">height&#58; 10px</span>&#160;and <span class="term">height="&#42;"</span>&#160;becomes <span class="term">height&#58; auto</span><br /> 1448&#160; * &#160;height in td and th - E.g., <span class="term">height="10"</span>&#160;becomes <span class="term">height&#58; 10px</span>&#160;and <span class="term">height="&#42;"</span>&#160;becomes <span class="term">height&#58; auto</span><br />
1449&#160; * &#160;hspace in img and object - E.g., <span class="term">hspace="10"</span>&#160;becomes <span class="term">margin-left&#58; 10px; margin-right&#58; 10px</span><br /> 1449&#160; * &#160;hspace in img and object - E.g., <span class="term">hspace="10"</span>&#160;becomes <span class="term">margin-left&#58; 10px; margin-right&#58; 10px</span><br />
1450&#160; * &#160;language in script - <span class="term">language="VBScript"</span>&#160;becomes <span class="term">type="text/vbscript"</span><br /> 1450&#160; * &#160;language in script - <span class="term">language="VBScript"</span>&#160;becomes <span class="term">type="text/vbscript"</span><br />
1451&#160; * &#160;name in a, form, iframe, img and map - E.g., <span class="term">name="xx"</span>&#160;becomes <span class="term">id="xx"</span><br /> 1451&#160; * &#160;name in a, form, iframe, img and map - E.g., <span class="term">name="xx"</span>&#160;becomes <span class="term">id="xx"</span><br />
1452&#160; * &#160;noshade in hr - <span class="term">border-style&#58; none; border&#58; 0; background-color&#58; gray; color&#58; gray</span><br /> 1452&#160; * &#160;noshade in hr - <span class="term">border-style&#58; none; border&#58; 0; background-color&#58; gray; color&#58; gray</span><br />
1453&#160; * &#160;nowrap in td and th - <span class="term">white-space&#58; nowrap</span><br /> 1453&#160; * &#160;nowrap in td and th - <span class="term">white-space&#58; nowrap</span><br />
1454&#160; * &#160;size in hr - E.g., <span class="term">size="10"</span>&#160;becomes <span class="term">height&#58; 10px</span><br /> 1454&#160; * &#160;size in hr - E.g., <span class="term">size="10"</span>&#160;becomes <span class="term">height&#58; 10px</span><br />
1455&#160; * &#160;vspace in img and object - E.g., <span class="term">vspace="10"</span>&#160;becomes <span class="term">margin-top&#58; 10px; margin-bottom&#58; 10px</span><br /> 1455&#160; * &#160;vspace in img and object - E.g., <span class="term">vspace="10"</span>&#160;becomes <span class="term">margin-top&#58; 10px; margin-bottom&#58; 10px</span><br />
1456&#160; * &#160;width in hr, pre, table, td and th - like <span class="term">height</span><br /> 1456&#160; * &#160;width in hr, pre, table, td and th - like <span class="term">height</span><br />
1457<br /> 1457<br />
1458&#160; Example input:<br /> 1458&#160; Example input:<br />
1459<br /> 1459<br />
1460 1460
1461<code class="code">&#160; &#160; &lt;img src="j.gif" alt="image" name="dad&#39;s" /&gt;&lt;img src="k.gif" alt="image" id="dad_off" name="dad" /&gt;</code> 1461<code class="code">&#160; &#160; &lt;img src="j.gif" alt="image" name="dad&#39;s" /&gt;&lt;img src="k.gif" alt="image" id="dad_off" name="dad" /&gt;</code>
1462<br /> 1462<br />
1463 1463
1464<code class="code">&#160; &#160; &lt;br clear="left" /&gt;</code> 1464<code class="code">&#160; &#160; &lt;br clear="left" /&gt;</code>
1465<br /> 1465<br />
1466 1466
1467<code class="code">&#160; &#160; &lt;hr noshade size="1" /&gt;</code> 1467<code class="code">&#160; &#160; &lt;hr noshade size="1" /&gt;</code>
1468<br /> 1468<br />
1469 1469
1470<code class="code">&#160; &#160; &lt;img name="img" src="i.gif" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding&#58;5px;" /&gt;</code> 1470<code class="code">&#160; &#160; &lt;img name="img" src="i.gif" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding&#58;5px;" /&gt;</code>
1471<br /> 1471<br />
1472 1472
1473<code class="code">&#160; &#160; &lt;table width="50em" align="center" bgcolor="red"&gt;</code> 1473<code class="code">&#160; &#160; &lt;table width="50em" align="center" bgcolor="red"&gt;</code>
1474<br /> 1474<br />
1475 1475
1476<code class="code">&#160; &#160; &#160;&lt;tr&gt;</code> 1476<code class="code">&#160; &#160; &#160;&lt;tr&gt;</code>
1477<br /> 1477<br />
1478 1478
1479<code class="code">&#160; &#160; &#160; &lt;td width="20%"&gt;</code> 1479<code class="code">&#160; &#160; &#160; &lt;td width="20%"&gt;</code>
1480<br /> 1480<br />
1481 1481
1482<code class="code">&#160; &#160; &#160; &#160;&lt;div align="center"&gt;</code> 1482<code class="code">&#160; &#160; &#160; &#160;&lt;div align="center"&gt;</code>
1483<br /> 1483<br />
1484 1484
1485<code class="code">&#160; &#160; &#160; &#160; &lt;h3 align="right"&gt;Section&lt;/h3&gt;</code> 1485<code class="code">&#160; &#160; &#160; &#160; &lt;h3 align="right"&gt;Section&lt;/h3&gt;</code>
1486<br /> 1486<br />
1487 1487
1488<code class="code">&#160; &#160; &#160; &#160; &lt;p align="right"&gt;Para&lt;/p&gt;</code> 1488<code class="code">&#160; &#160; &#160; &#160; &lt;p align="right"&gt;Para&lt;/p&gt;</code>
1489<br /> 1489<br />
1490 1490
1491<code class="code">&#160; &#160; &#160; &#160;&lt;/div&gt;</code> 1491<code class="code">&#160; &#160; &#160; &#160;&lt;/div&gt;</code>
1492<br /> 1492<br />
1493 1493
1494<code class="code">&#160; &#160; &#160; &lt;/td&gt;</code> 1494<code class="code">&#160; &#160; &#160; &lt;/td&gt;</code>
1495<br /> 1495<br />
1496 1496
1497<code class="code">&#160; &#160; &#160; &lt;td width="&#42;"&gt;</code> 1497<code class="code">&#160; &#160; &#160; &lt;td width="&#42;"&gt;</code>
1498<br /> 1498<br />
1499 1499
1500<code class="code">&#160; &#160; &#160; &lt;/td&gt;</code> 1500<code class="code">&#160; &#160; &#160; &lt;/td&gt;</code>
1501<br /> 1501<br />
1502 1502
1503<code class="code">&#160; &#160; &#160;&lt;/tr&gt;</code> 1503<code class="code">&#160; &#160; &#160;&lt;/tr&gt;</code>
1504<br /> 1504<br />
1505 1505
1506<code class="code">&#160; &#160; &lt;/table&gt;</code> 1506<code class="code">&#160; &#160; &lt;/table&gt;</code>
1507<br /> 1507<br />
1508 1508
1509<code class="code">&#160; &#160; &lt;br clear="all" /&gt;</code> 1509<code class="code">&#160; &#160; &lt;br clear="all" /&gt;</code>
1510<br /> 1510<br />
1511<br /> 1511<br />
1512&#160; And the output with <span class="term">$config["no_deprecated_attr"] = 1</span>:<br /> 1512&#160; And the output with <span class="term">$config["no_deprecated_attr"] = 1</span>:<br />
1513<br /> 1513<br />
1514 1514
1515<code class="code">&#160; &#160; &lt;img src="j.gif" alt="image" id="dad&#39;s" /&gt;&lt;img src="k.gif" alt="image" id="dad_off" /&gt;</code> 1515<code class="code">&#160; &#160; &lt;img src="j.gif" alt="image" id="dad&#39;s" /&gt;&lt;img src="k.gif" alt="image" id="dad_off" /&gt;</code>
1516<br /> 1516<br />
1517 1517
1518<code class="code">&#160; &#160; &lt;br style="clear&#58; left;" /&gt;</code> 1518<code class="code">&#160; &#160; &lt;br style="clear&#58; left;" /&gt;</code>
1519<br /> 1519<br />
1520 1520
1521<code class="code">&#160; &#160; &lt;hr style="border-style&#58; none; border&#58; 0; background-color&#58; gray; color&#58; gray; size&#58; 1px;" /&gt;</code> 1521<code class="code">&#160; &#160; &lt;hr style="border-style&#58; none; border&#58; 0; background-color&#58; gray; color&#58; gray; size&#58; 1px;" /&gt;</code>
1522<br /> 1522<br />
1523 1523
1524<code class="code">&#160; &#160; &lt;img src="i.gif" alt="image" width="10em" height="20" style="padding&#58;5px; float&#58; left; margin-left&#58; 10px; margin-right&#58; 10px; margin-top&#58; 10px; margin-bottom&#58; 10px; border&#58; 1px;" id="img" /&gt;</code> 1524<code class="code">&#160; &#160; &lt;img src="i.gif" alt="image" width="10em" height="20" style="padding&#58;5px; float&#58; left; margin-left&#58; 10px; margin-right&#58; 10px; margin-top&#58; 10px; margin-bottom&#58; 10px; border&#58; 1px;" id="img" /&gt;</code>
1525<br /> 1525<br />
1526 1526
1527<code class="code">&#160; &#160; &lt;table width="50em" style="margin&#58; auto; background-color&#58; red;"&gt;</code> 1527<code class="code">&#160; &#160; &lt;table width="50em" style="margin&#58; auto; background-color&#58; red;"&gt;</code>
1528<br /> 1528<br />
1529 1529
1530<code class="code">&#160; &#160; &#160;&lt;tr&gt;</code> 1530<code class="code">&#160; &#160; &#160;&lt;tr&gt;</code>
1531<br /> 1531<br />
1532 1532
1533<code class="code">&#160; &#160; &#160; &lt;td style="width&#58; 20%;"&gt;</code> 1533<code class="code">&#160; &#160; &#160; &lt;td style="width&#58; 20%;"&gt;</code>
1534<br /> 1534<br />
1535 1535
1536<code class="code">&#160; &#160; &#160; &#160;&lt;div style="margin&#58; auto;"&gt;</code> 1536<code class="code">&#160; &#160; &#160; &#160;&lt;div style="margin&#58; auto;"&gt;</code>
1537<br /> 1537<br />
1538 1538
1539<code class="code">&#160; &#160; &#160; &#160; &lt;h3 style="text-align&#58; right;"&gt;Section&lt;/h3&gt;</code> 1539<code class="code">&#160; &#160; &#160; &#160; &lt;h3 style="text-align&#58; right;"&gt;Section&lt;/h3&gt;</code>
1540<br /> 1540<br />
1541 1541
1542<code class="code">&#160; &#160; &#160; &#160; &lt;p style="text-align&#58; right;"&gt;Para&lt;/p&gt;</code> 1542<code class="code">&#160; &#160; &#160; &#160; &lt;p style="text-align&#58; right;"&gt;Para&lt;/p&gt;</code>
1543<br /> 1543<br />
1544 1544
1545<code class="code">&#160; &#160; &#160; &#160;&lt;/div&gt;</code> 1545<code class="code">&#160; &#160; &#160; &#160;&lt;/div&gt;</code>
1546<br /> 1546<br />
1547 1547
1548<code class="code">&#160; &#160; &#160; &lt;/td&gt;</code> 1548<code class="code">&#160; &#160; &#160; &lt;/td&gt;</code>
1549<br /> 1549<br />
1550 1550
1551<code class="code">&#160; &#160; &#160; &lt;td style="width&#58; auto;"&gt;</code> 1551<code class="code">&#160; &#160; &#160; &lt;td style="width&#58; auto;"&gt;</code>
1552<br /> 1552<br />
1553 1553
1554<code class="code">&#160; &#160; &#160; &lt;/td&gt;</code> 1554<code class="code">&#160; &#160; &#160; &lt;/td&gt;</code>
1555<br /> 1555<br />
1556 1556
1557<code class="code">&#160; &#160; &#160;&lt;/tr&gt;</code> 1557<code class="code">&#160; &#160; &#160;&lt;/tr&gt;</code>
1558<br /> 1558<br />
1559 1559
1560<code class="code">&#160; &#160; &lt;/table&gt;</code> 1560<code class="code">&#160; &#160; &lt;/table&gt;</code>
1561<br /> 1561<br />
1562 1562
1563<code class="code">&#160; &#160; &lt;br style="clear&#58; both;" /&gt;</code> 1563<code class="code">&#160; &#160; &lt;br style="clear&#58; both;" /&gt;</code>
1564<br /> 1564<br />
1565<br /> 1565<br />
1566&#160; For <span class="term">lang</span>, deprecated in XHTML 1.1, transformation is taken care of through <span class="term">$config["xml&#58;lang"]</span>; see <a href="#s3.4.1">section 3.4.1</a>.<br /> 1566&#160; For <span class="term">lang</span>, deprecated in XHTML 1.1, transformation is taken care of through <span class="term">$config["xml&#58;lang"]</span>; see <a href="#s3.4.1">section 3.4.1</a>.<br />
1567<br /> 1567<br />
1568&#160; The attribute <span class="term">name</span>&#160;is deprecated in <span class="term">form</span>, <span class="term">iframe</span>, and <span class="term">img</span>, and is replaced with <span class="term">id</span>&#160;if an <span class="term">id</span>&#160;attribute doesn't exist and if the <span class="term">name</span>&#160;value is appropriate for <span class="term">id</span>&#160;(i.e., doesn't have a non-word character like space). For such replacements for <span class="term">a</span>&#160;and <span class="term">map</span>, for which the <span class="term">name</span>&#160;attribute is deprecated in XHTML 1.1, <span class="term">$config["no_deprecated_attr"]</span>&#160;should be set to <span class="term">2</span>&#160;(when set to <span class="term">1</span>, for these two elements, the <span class="term">name</span>&#160;attribute is retained).<br /> 1568&#160; The attribute <span class="term">name</span>&#160;is deprecated in <span class="term">form</span>, <span class="term">iframe</span>, and <span class="term">img</span>, and is replaced with <span class="term">id</span>&#160;if an <span class="term">id</span>&#160;attribute doesn't exist and if the <span class="term">name</span>&#160;value is appropriate for <span class="term">id</span>&#160;(i.e., doesn't have a non-word character like space). For such replacements for <span class="term">a</span>&#160;and <span class="term">map</span>, for which the <span class="term">name</span>&#160;attribute is deprecated in XHTML 1.1, <span class="term">$config["no_deprecated_attr"]</span>&#160;should be set to <span class="term">2</span>&#160;(when set to <span class="term">1</span>, for these two elements, the <span class="term">name</span>&#160;attribute is retained).<br />
1569 1569
1570</div> 1570</div>
1571<div class="sub-sub-section"><h4> 1571<div class="sub-sub-section"><h4>
1572<a name="s3.4.7" id="s3.4.7"></a><span class="item-no">3.4.7</span>&#160; Anti-spam &amp; <span class="term">href</span> 1572<a name="s3.4.7" id="s3.4.7"></a><span class="item-no">3.4.7</span>&#160; Anti-spam &amp; <span class="term">href</span>
1573</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1573</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1574<br /> 1574<br />
1575&#160; htmLawed (function <span class="term">hl_tag()</span>) can check the <span class="term">href</span>&#160;attribute values (link addresses) as an anti-spam (email or link spam) measure.<br /> 1575&#160; htmLawed (function <span class="term">hl_tag()</span>) can check the <span class="term">href</span>&#160;attribute values (link addresses) as an anti-spam (email or link spam) measure.<br />
1576<br /> 1576<br />
1577&#160; If <span class="term">$config["anti_mail_spam"]</span>&#160;is not <span class="term">0</span>, the <span class="term">@</span>&#160;of email addresses in <span class="term">href</span>&#160;values like <span class="term">mailto&#58;a@b.com</span>&#160;is replaced with text specified by <span class="term">$config["anti_mail_spam"]</span>. The text should be of a form that makes it clear to others that the address needs to be edited before a mail is sent; e.g., <span class="term">&lt;remove_this_antispam&gt;@</span>&#160;(makes the example address <span class="term">a&lt;remove_this_antispam&gt;@b.com</span>).<br /> 1577&#160; If <span class="term">$config["anti_mail_spam"]</span>&#160;is not <span class="term">0</span>, the <span class="term">@</span>&#160;of email addresses in <span class="term">href</span>&#160;values like <span class="term">mailto&#58;a@b.com</span>&#160;is replaced with text specified by <span class="term">$config["anti_mail_spam"]</span>. The text should be of a form that makes it clear to others that the address needs to be edited before a mail is sent; e.g., <span class="term">&lt;remove_this_antispam&gt;@</span>&#160;(makes the example address <span class="term">a&lt;remove_this_antispam&gt;@b.com</span>).<br />
1578<br /> 1578<br />
1579&#160; For regular links, one can choose to have a <span class="term">rel</span>&#160;attribute with <span class="term">nofollow</span>&#160;in its value (which tells some search engines to not follow a link). This can discourage link spammers. Additionally, or as an alternative, one can choose to empty the <span class="term">href</span>&#160;value altogether (disable the link).<br /> 1579&#160; For regular links, one can choose to have a <span class="term">rel</span>&#160;attribute with <span class="term">nofollow</span>&#160;in its value (which tells some search engines to not follow a link). This can discourage link spammers. Additionally, or as an alternative, one can choose to empty the <span class="term">href</span>&#160;value altogether (disable the link).<br />
1580<br /> 1580<br />
1581&#160; For use of these options, <span class="term">$config["anti_link_spam"]</span>&#160;should be set as an array with values <span class="term">regex1</span>&#160;and <span class="term">regex2</span>, both or one of which can be empty (like <span class="term">array("", "regex2")</span>) to indicate that that option is not to be used. Otherwise, <span class="term">regex1</span>&#160;or <span class="term">regex2</span>&#160;should be PHP- and PCRE-compatible regular expression patterns: <span class="term">href</span>&#160;values will be matched against them and those matching the pattern will accordingly be treated.<br /> 1581&#160; For use of these options, <span class="term">$config["anti_link_spam"]</span>&#160;should be set as an array with values <span class="term">regex1</span>&#160;and <span class="term">regex2</span>, both or one of which can be empty (like <span class="term">array("", "regex2")</span>) to indicate that that option is not to be used. Otherwise, <span class="term">regex1</span>&#160;or <span class="term">regex2</span>&#160;should be PHP- and PCRE-compatible regular expression patterns: <span class="term">href</span>&#160;values will be matched against them and those matching the pattern will accordingly be treated.<br />
1582<br /> 1582<br />
1583&#160; Note that the regular expressions should have <em>delimiters</em>, and be well-formed and preferably fast. Absolute efficiency/accuracy is often not needed.<br /> 1583&#160; Note that the regular expressions should have <em>delimiters</em>, and be well-formed and preferably fast. Absolute efficiency/accuracy is often not needed.<br />
1584<br /> 1584<br />
1585&#160; An example, to have a <span class="term">rel</span>&#160;attribute with <span class="term">nofollow</span>&#160;for all links, and to disable links that do not point to domains <span class="term">abc.com</span>&#160;and <span class="term">xyz.org</span>:<br /> 1585&#160; An example, to have a <span class="term">rel</span>&#160;attribute with <span class="term">nofollow</span>&#160;for all links, and to disable links that do not point to domains <span class="term">abc.com</span>&#160;and <span class="term">xyz.org</span>:<br />
1586<br /> 1586<br />
1587 1587
1588<code class="code">&#160; &#160; $config["anti_link_spam"] = array(&#39;&#96;.&#96;&#39;, &#39;&#96;&#58;//\W&#42;(?!(abc\.com|xyz\.org))&#96;&#39;);</code> 1588<code class="code">&#160; &#160; $config["anti_link_spam"] = array(&#39;&#96;.&#96;&#39;, &#39;&#96;&#58;//\W&#42;(?!(abc\.com|xyz\.org))&#96;&#39;);</code>
1589<br /> 1589<br />
1590 1590
1591</div> 1591</div>
1592<div class="sub-sub-section"><h4> 1592<div class="sub-sub-section"><h4>
1593<a name="s3.4.8" id="s3.4.8"></a><span class="item-no">3.4.8</span>&#160; Inline style properties 1593<a name="s3.4.8" id="s3.4.8"></a><span class="item-no">3.4.8</span>&#160; Inline style properties
1594</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1594</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1595<br /> 1595<br />
1596&#160; htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the <span class="term">style</span>&#160;attributes. (CSS properties like <span class="term">background-image</span>&#160;that accept URLs in their values are noted in <a href="#s5.3">section 5.3</a>.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting <span class="term">$config["css_expression"]</span>&#160;to <span class="term">1</span>&#160;(default setting). Note that when <span class="term">$config["css_expression"]</span>&#160;is set to <span class="term">1</span>, htmLawed will remove <span class="term">/&#42;</span>&#160;from the <span class="term">style</span>&#160;values.<br /> 1596&#160; htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the <span class="term">style</span>&#160;attributes. (CSS properties like <span class="term">background-image</span>&#160;that accept URLs in their values are noted in <a href="#s5.3">section 5.3</a>.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting <span class="term">$config["css_expression"]</span>&#160;to <span class="term">1</span>&#160;(default setting). Note that when <span class="term">$config["css_expression"]</span>&#160;is set to <span class="term">1</span>, htmLawed will remove <span class="term">/&#42;</span>&#160;from the <span class="term">style</span>&#160;values.<br />
1597<br /> 1597<br />
1598&#160; <strong>Note</strong>: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter <span class="term">style</span>&#160;attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off (<span class="term">$config["schemes"] = "...style&#58;&#42;..."</span>, see <a href="#s3.4.3">section 3.4.3</a>, and <span class="term">$config["css_expression"] = 0</span>). Alternatively, admins can use their own custom function for finer handling of <span class="term">style</span>&#160;values through the <span class="term">hook_tag</span>&#160;parameter (see <a href="#s3.4.9">section 3.4.9</a>).<br /> 1598&#160; <strong>Note</strong>: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter <span class="term">style</span>&#160;attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off (<span class="term">$config["schemes"] = "...style&#58;&#42;..."</span>, see <a href="#s3.4.3">section 3.4.3</a>, and <span class="term">$config["css_expression"] = 0</span>). Alternatively, admins can use their own custom function for finer handling of <span class="term">style</span>&#160;values through the <span class="term">hook_tag</span>&#160;parameter (see <a href="#s3.4.9">section 3.4.9</a>).<br />
1599<br /> 1599<br />
1600&#160; It is also possible to have htmLawed let through any <span class="term">style</span>&#160;value by setting <span class="term">$config["style_pass"]</span>&#160;to <span class="term">1</span>.<br /> 1600&#160; It is also possible to have htmLawed let through any <span class="term">style</span>&#160;value by setting <span class="term">$config["style_pass"]</span>&#160;to <span class="term">1</span>.<br />
1601<br /> 1601<br />
1602&#160; As such, it is better to set up a CSS file with class declarations, disallow the <span class="term">style</span>&#160;attribute, set a <span class="term">$spec</span>&#160;rule (see <a href="#s2.3">section 2.3</a>) for <span class="term">class</span>&#160;for the <span class="term">oneof</span>&#160;or <span class="term">match</span>&#160;parameter, and ask writers to make use of the <span class="term">class</span>&#160;attribute.<br /> 1602&#160; As such, it is better to set up a CSS file with class declarations, disallow the <span class="term">style</span>&#160;attribute, set a <span class="term">$spec</span>&#160;rule (see <a href="#s2.3">section 2.3</a>) for <span class="term">class</span>&#160;for the <span class="term">oneof</span>&#160;or <span class="term">match</span>&#160;parameter, and ask writers to make use of the <span class="term">class</span>&#160;attribute.<br />
1603 1603
1604</div> 1604</div>
1605<div class="sub-sub-section"><h4> 1605<div class="sub-sub-section"><h4>
1606<a name="s3.4.9" id="s3.4.9"></a><span class="item-no">3.4.9</span>&#160; Hook function for tag content 1606<a name="s3.4.9" id="s3.4.9"></a><span class="item-no">3.4.9</span>&#160; Hook function for tag content
1607</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1607</h4><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1608<br /> 1608<br />
1609&#160; It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.).<br /> 1609&#160; It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.).<br />
1610<br /> 1610<br />
1611&#160; When <span class="term">$config</span>&#160;parameter <span class="term">hook_tag</span>&#160;is set to the name of a function, htmLawed (function <span class="term">hl_tag()</span>) will pass on the element name, and the <em>finalized</em>&#160;attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like <span class="term">&lt;element_name attribute_1_name="attribute_1_value"...&gt;</span>&#160;(for empty elements like <span class="term">img</span>&#160;and <span class="term">input</span>, the element-closing slash <span class="term">/</span>&#160;should also be included), etc.<br /> 1611&#160; When <span class="term">$config</span>&#160;parameter <span class="term">hook_tag</span>&#160;is set to the name of a function, htmLawed (function <span class="term">hl_tag()</span>) will pass on the element name, and the <em>finalized</em>&#160;attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like <span class="term">&lt;element_name attribute_1_name="attribute_1_value"...&gt;</span>&#160;(for empty elements like <span class="term">img</span>&#160;and <span class="term">input</span>, the element-closing slash <span class="term">/</span>&#160;should also be included), etc.<br />
1612<br /> 1612<br />
1613&#160; Any <span class="term">hook_tag</span>&#160;function, since htmLawed version 1.1.11, also receives names of elements in closing tags, such as <span class="term">a</span>&#160;in the closing <span class="term">&lt;/a&gt;</span>&#160;tag of the element <span class="term">&lt;a href="http&#58;//cnn.com"&gt;CNN&lt;/a&gt;</span>. No other value is passed to the function since a closing tag contains only element names. Typically, the function will return an empty string or a full closing tag (like <span class="term">&lt;/a&gt;</span>).<br /> 1613&#160; Any <span class="term">hook_tag</span>&#160;function, since htmLawed version 1.1.11, also receives names of elements in closing tags, such as <span class="term">a</span>&#160;in the closing <span class="term">&lt;/a&gt;</span>&#160;tag of the element <span class="term">&lt;a href="http&#58;//cnn.com"&gt;CNN&lt;/a&gt;</span>. No other value is passed to the function since a closing tag contains only element names. Typically, the function will return an empty string or a full closing tag (like <span class="term">&lt;/a&gt;</span>).<br />
1614<br /> 1614<br />
1615&#160; This is a <strong>powerful functionality</strong>&#160;that can be exploited for various objectives: consolidate-and-convert inline <span class="term">style</span>&#160;attributes to <span class="term">class</span>, convert <span class="term">embed</span>&#160;elements to <span class="term">object</span>, permit only one <span class="term">caption</span>&#160;element in a <span class="term">table</span>&#160;element, disallow embedding of certain types of media, <strong>inject HTML</strong>, use <a href="http://csstidy.sourceforge.net">CSSTidy</a>&#160;to sanitize <span class="term">style</span>&#160;attribute values, etc.<br /> 1615&#160; This is a <strong>powerful functionality</strong>&#160;that can be exploited for various objectives: consolidate-and-convert inline <span class="term">style</span>&#160;attributes to <span class="term">class</span>, convert <span class="term">embed</span>&#160;elements to <span class="term">object</span>, permit only one <span class="term">caption</span>&#160;element in a <span class="term">table</span>&#160;element, disallow embedding of certain types of media, <strong>inject HTML</strong>, use <a href="http://csstidy.sourceforge.net">CSSTidy</a>&#160;to sanitize <span class="term">style</span>&#160;attribute values, etc.<br />
1616<br /> 1616<br />
1617&#160; As an example, the custom hook code below can be used to force a series of specifically ordered <span class="term">id</span>&#160;attributes on all elements, and a specific <span class="term">param</span>&#160;element inside all <span class="term">object</span>&#160;elements:<br /> 1617&#160; As an example, the custom hook code below can be used to force a series of specifically ordered <span class="term">id</span>&#160;attributes on all elements, and a specific <span class="term">param</span>&#160;element inside all <span class="term">object</span>&#160;elements:<br />
1618<br /> 1618<br />
1619 1619
1620<code class="code">&#160; &#160; function my_tag_function($element, $attribute_array=0){</code> 1620<code class="code">&#160; &#160; function my_tag_function($element, $attribute_array=0){</code>
1621<br /> 1621<br />
1622<br /> 1622<br />
1623 1623
1624<code class="code">&#160; &#160; &#160; // If second argument is not received, it means a closing tag is being handled</code> 1624<code class="code">&#160; &#160; &#160; // If second argument is not received, it means a closing tag is being handled</code>
1625<br /> 1625<br />
1626 1626
1627<code class="code">&#160; &#160; &#160; if(is_numeric($attribute_array)){</code> 1627<code class="code">&#160; &#160; &#160; if(is_numeric($attribute_array)){</code>
1628<br /> 1628<br />
1629 1629
1630<code class="code">&#160; &#160; &#160; &#160; return "&lt;/$element&gt;";</code> 1630<code class="code">&#160; &#160; &#160; &#160; return "&lt;/$element&gt;";</code>
1631<br /> 1631<br />
1632 1632
1633<code class="code">&#160; &#160; &#160; }</code> 1633<code class="code">&#160; &#160; &#160; }</code>
1634<br /> 1634<br />
1635<br /> 1635<br />
1636 1636
1637<code class="code">&#160; &#160; &#160; static $id = 0;</code> 1637<code class="code">&#160; &#160; &#160; static $id = 0;</code>
1638<br /> 1638<br />
1639 1639
1640<code class="code">&#160; &#160; &#160; // Remove any duplicate element</code> 1640<code class="code">&#160; &#160; &#160; // Remove any duplicate element</code>
1641<br /> 1641<br />
1642 1642
1643<code class="code">&#160; &#160; &#160; if($element == &#39;param&#39; &amp;&amp; isset($attribute_array[&#39;allowscriptaccess&#39;])){</code> 1643<code class="code">&#160; &#160; &#160; if($element == &#39;param&#39; &amp;&amp; isset($attribute_array[&#39;allowscriptaccess&#39;])){</code>
1644<br /> 1644<br />
1645 1645
1646<code class="code">&#160; &#160; &#160; &#160; return &#39;&#39;;</code> 1646<code class="code">&#160; &#160; &#160; &#160; return &#39;&#39;;</code>
1647<br /> 1647<br />
1648 1648
1649<code class="code">&#160; &#160; &#160; }</code> 1649<code class="code">&#160; &#160; &#160; }</code>
1650<br /> 1650<br />
1651<br /> 1651<br />
1652 1652
1653<code class="code">&#160; &#160; &#160; $new_element = &#39;&#39;;</code> 1653<code class="code">&#160; &#160; &#160; $new_element = &#39;&#39;;</code>
1654<br /> 1654<br />
1655<br /> 1655<br />
1656 1656
1657<code class="code">&#160; &#160; &#160; // Force a serialized ID number</code> 1657<code class="code">&#160; &#160; &#160; // Force a serialized ID number</code>
1658<br /> 1658<br />
1659 1659
1660<code class="code">&#160; &#160; &#160; $attribute_array[&#39;id&#39;] = &#39;my_&#39;. $id;</code> 1660<code class="code">&#160; &#160; &#160; $attribute_array[&#39;id&#39;] = &#39;my_&#39;. $id;</code>
1661<br /> 1661<br />
1662 1662
1663<code class="code">&#160; &#160; &#160; ++$id;</code> 1663<code class="code">&#160; &#160; &#160; ++$id;</code>
1664<br /> 1664<br />
1665<br /> 1665<br />
1666 1666
1667<code class="code">&#160; &#160; &#160; // Inject param for allowscriptaccess</code> 1667<code class="code">&#160; &#160; &#160; // Inject param for allowscriptaccess</code>
1668<br /> 1668<br />
1669 1669
1670<code class="code">&#160; &#160; &#160; if($element == &#39;object&#39;){</code> 1670<code class="code">&#160; &#160; &#160; if($element == &#39;object&#39;){</code>
1671<br /> 1671<br />
1672 1672
1673<code class="code">&#160; &#160; &#160; &#160; $new_element = &#39;&lt;param id="my_&#39;. $id. &#39;" allowscriptaccess="never" /&gt;&#39;;</code> 1673<code class="code">&#160; &#160; &#160; &#160; $new_element = &#39;&lt;param id="my_&#39;. $id. &#39;" allowscriptaccess="never" /&gt;&#39;;</code>
1674<br /> 1674<br />
1675 1675
1676<code class="code">&#160; &#160; &#160; &#160; ++$id;</code> 1676<code class="code">&#160; &#160; &#160; &#160; ++$id;</code>
1677<br /> 1677<br />
1678 1678
1679<code class="code">&#160; &#160; &#160; }</code> 1679<code class="code">&#160; &#160; &#160; }</code>
1680<br /> 1680<br />
1681<br /> 1681<br />
1682 1682
1683<code class="code">&#160; &#160; &#160; $string = &#39;&#39;;</code> 1683<code class="code">&#160; &#160; &#160; $string = &#39;&#39;;</code>
1684<br /> 1684<br />
1685 1685
1686<code class="code">&#160; &#160; &#160; foreach($attribute_array as $k=&gt;$v){</code> 1686<code class="code">&#160; &#160; &#160; foreach($attribute_array as $k=&gt;$v){</code>
1687<br /> 1687<br />
1688 1688
1689<code class="code">&#160; &#160; &#160; &#160; $string .= " {$k}=\"{$v}\"";</code> 1689<code class="code">&#160; &#160; &#160; &#160; $string .= " {$k}=\"{$v}\"";</code>
1690<br /> 1690<br />
1691 1691
1692<code class="code">&#160; &#160; &#160; }</code> 1692<code class="code">&#160; &#160; &#160; }</code>
1693<br /> 1693<br />
1694<br /> 1694<br />
1695 1695
1696<code class="code">&#160; &#160; &#160; static $empty_elements = array(&#39;area&#39;=&gt;1, &#39;br&#39;=&gt;1, &#39;col&#39;=&gt;1, &#39;command&#39;=&gt;1, &#39;embed&#39;=&gt;1, &#39;hr&#39;=&gt;1, &#39;img&#39;=&gt;1, &#39;input&#39;=&gt;1, &#39;isindex&#39;=&gt;1, &#39;keygen&#39;=&gt;1, &#39;link&#39;=&gt;1, &#39;meta&#39;=&gt;1, &#39;param&#39;=&gt;1, &#39;source&#39;=&gt;1, &#39;track&#39;=&gt;1, &#39;wbr&#39;=&gt;1);</code> 1696<code class="code">&#160; &#160; &#160; static $empty_elements = array(&#39;area&#39;=&gt;1, &#39;br&#39;=&gt;1, &#39;col&#39;=&gt;1, &#39;command&#39;=&gt;1, &#39;embed&#39;=&gt;1, &#39;hr&#39;=&gt;1, &#39;img&#39;=&gt;1, &#39;input&#39;=&gt;1, &#39;isindex&#39;=&gt;1, &#39;keygen&#39;=&gt;1, &#39;link&#39;=&gt;1, &#39;meta&#39;=&gt;1, &#39;param&#39;=&gt;1, &#39;source&#39;=&gt;1, &#39;track&#39;=&gt;1, &#39;wbr&#39;=&gt;1);</code>
1697<br /> 1697<br />
1698<br /> 1698<br />
1699 1699
1700<code class="code">&#160; &#160; &#160; return "&lt;{$element}{$string}". (array_key_exists($element, $empty_elements) ? &#39; /&#39; &#58; &#39;&#39;). &#39;&gt;&#39;. $new_element;</code> 1700<code class="code">&#160; &#160; &#160; return "&lt;{$element}{$string}". (array_key_exists($element, $empty_elements) ? &#39; /&#39; &#58; &#39;&#39;). &#39;&gt;&#39;. $new_element;</code>
1701<br /> 1701<br />
1702 1702
1703<code class="code">&#160; &#160; }</code> 1703<code class="code">&#160; &#160; }</code>
1704<br /> 1704<br />
1705<br /> 1705<br />
1706&#160; The <span class="term">hook_tag</span>&#160;parameter is different from the <span class="term">hook</span>&#160;parameter (<a href="#s3.7">section 3.7</a>).<br /> 1706&#160; The <span class="term">hook_tag</span>&#160;parameter is different from the <span class="term">hook</span>&#160;parameter (<a href="#s3.7">section 3.7</a>).<br />
1707<br /> 1707<br />
1708&#160; Snippets of hook function code developed by others may be available on the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">htmLawed</a>&#160;website.<br /> 1708&#160; Snippets of hook function code developed by others may be available on the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">htmLawed</a>&#160;website.<br />
1709 1709
1710</div> 1710</div>
1711<div class="sub-section"><h3> 1711<div class="sub-section"><h3>
1712<a name="s3.5" id="s3.5"></a><span class="item-no">3.5</span>&#160; Simple configuration directive for most valid XHTML 1712<a name="s3.5" id="s3.5"></a><span class="item-no">3.5</span>&#160; Simple configuration directive for most valid XHTML
1713</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1713</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1714<br /> 1714<br />
1715&#160; If <span class="term">$config["valid_xhtml"]</span>&#160;is set to <span class="term">1</span>, some relevant <span class="term">$config</span>&#160;parameters (indicated by <span class="term">~</span>&#160;in <a href="#s2.2">section 2.2</a>) are auto-adjusted. This allows one to pass the <span class="term">$config</span>&#160;argument with a simpler value. If a value for a parameter auto-set through <span class="term">valid_xhtml</span>&#160;is still manually provided, then that value will over-ride the auto-set value.<br /> 1715&#160; If <span class="term">$config["valid_xhtml"]</span>&#160;is set to <span class="term">1</span>, some relevant <span class="term">$config</span>&#160;parameters (indicated by <span class="term">~</span>&#160;in <a href="#s2.2">section 2.2</a>) are auto-adjusted. This allows one to pass the <span class="term">$config</span>&#160;argument with a simpler value. If a value for a parameter auto-set through <span class="term">valid_xhtml</span>&#160;is still manually provided, then that value will over-ride the auto-set value.<br />
1716 1716
1717</div> 1717</div>
1718<div class="sub-section"><h3> 1718<div class="sub-section"><h3>
1719<a name="s3.6" id="s3.6"></a><span class="item-no">3.6</span>&#160; Simple configuration directive for most <em>safe</em>&#160;HTML 1719<a name="s3.6" id="s3.6"></a><span class="item-no">3.6</span>&#160; Simple configuration directive for most <em>safe</em>&#160;HTML
1720</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1720</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1721<br /> 1721<br />
1722&#160; <em>Safe</em>&#160;HTML refers to HTML that is restricted to reduce the vulnerability for scripting attacks (such as XSS) based on HTML code which otherwise may still be legal and compliant with the HTML standard specifications. When elements such as <span class="term">script</span>&#160;and <span class="term">object</span>, and attributes such as <span class="term">onmouseover</span>&#160;and <span class="term">style</span>&#160;are allowed in the input text, an input writer can introduce malevolent HTML code. Note that what is considered <span class="term">safe</span>&#160;depends on the nature of the web application and the trust-level accorded to its users.<br /> 1722&#160; <em>Safe</em>&#160;HTML refers to HTML that is restricted to reduce the vulnerability for scripting attacks (such as XSS) based on HTML code which otherwise may still be legal and compliant with the HTML standard specifications. When elements such as <span class="term">script</span>&#160;and <span class="term">object</span>, and attributes such as <span class="term">onmouseover</span>&#160;and <span class="term">style</span>&#160;are allowed in the input text, an input writer can introduce malevolent HTML code. Note that what is considered <span class="term">safe</span>&#160;depends on the nature of the web application and the trust-level accorded to its users.<br />
1723<br /> 1723<br />
1724&#160; htmLawed allows an admin to use <span class="term">$config["safe"]</span>&#160;to auto-adjust multiple <span class="term">$config</span>&#160;parameters (such as <span class="term">elements</span>&#160;which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by <span class="term">"</span>&#160;in <a href="#s2.2">section 2.2</a>. Thus, one can pass the <span class="term">$config</span>&#160;argument with a simpler value. Having the <span class="term">safe</span>&#160;parameter set to <span class="term">1</span>&#160;is equivalent to setting the following <span class="term">$config</span>&#160;parameters to the noted values:<br /> 1724&#160; htmLawed allows an admin to use <span class="term">$config["safe"]</span>&#160;to auto-adjust multiple <span class="term">$config</span>&#160;parameters (such as <span class="term">elements</span>&#160;which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by <span class="term">"</span>&#160;in <a href="#s2.2">section 2.2</a>. Thus, one can pass the <span class="term">$config</span>&#160;argument with a simpler value. Having the <span class="term">safe</span>&#160;parameter set to <span class="term">1</span>&#160;is equivalent to setting the following <span class="term">$config</span>&#160;parameters to the noted values:<br />
1725<br /> 1725<br />
1726 1726
1727<code class="code">&#160; &#160; cdata - 0</code> 1727<code class="code">&#160; &#160; cdata - 0</code>
1728<br /> 1728<br />
1729 1729
1730<code class="code">&#160; &#160; comment - 0</code> 1730<code class="code">&#160; &#160; comment - 0</code>
1731<br /> 1731<br />
1732 1732
1733<code class="code">&#160; &#160; deny_attribute - on&#42;</code> 1733<code class="code">&#160; &#160; deny_attribute - on&#42;</code>
1734<br /> 1734<br />
1735 1735
1736<code class="code">&#160; &#160; elements - &#42; -applet -audio -canvas -embed -iframe -object -script -video</code> 1736<code class="code">&#160; &#160; elements - &#42; -applet -audio -canvas -embed -iframe -object -script -video</code>
1737<br /> 1737<br />
1738 1738
1739<code class="code">&#160; &#160; schemes - href&#58; aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style&#58; !; &#42;&#58;file, http, https</code> 1739<code class="code">&#160; &#160; schemes - href&#58; aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style&#58; !; &#42;&#58;file, http, https</code>
1740<br /> 1740<br />
1741<br /> 1741<br />
1742&#160; With <span class="term">safe</span>&#160;set to <span class="term">1</span>, htmLawed considers <span class="term">CDATA</span>&#160;sections and HTML comments as plain text, and prohibits the <span class="term">applet</span>, <span class="term">audio</span>, <span class="term">canvas</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">object</span>, <span class="term">script</span>&#160;and <span class="term">video</span>&#160;elements, and the <span class="term">on&#42;</span>&#160;attributes like <span class="term">onclick</span>. ( There are <span class="term">$config</span>&#160;parameters like <span class="term">css_expression</span>&#160;that are not affected by the value set for <span class="term">safe</span>&#160;but whose default values still contribute towards a more <em>safe</em>&#160;output.) Further, unless overridden by the value for parameter <span class="term">schemes</span>&#160;(see <a href="#s3.4.3">section 3.4.3</a>), the schemes <span class="term">app</span>, <span class="term">data</span>&#160;and <span class="term">javascript</span>&#160;are not permitted, and URLs with schemes are neutralized so that, e.g., <span class="term">style="moz-binding&#58;url(http&#58;//danger)"</span>&#160;becomes <span class="term">style="moz-binding&#58;url(denied&#58;http&#58;//danger)"</span>.<br /> 1742&#160; With <span class="term">safe</span>&#160;set to <span class="term">1</span>, htmLawed considers <span class="term">CDATA</span>&#160;sections and HTML comments as plain text, and prohibits the <span class="term">applet</span>, <span class="term">audio</span>, <span class="term">canvas</span>, <span class="term">embed</span>, <span class="term">iframe</span>, <span class="term">object</span>, <span class="term">script</span>&#160;and <span class="term">video</span>&#160;elements, and the <span class="term">on&#42;</span>&#160;attributes like <span class="term">onclick</span>. 
( There are <span class="term">$config</span>&#160;parameters like <span class="term">css_expression</span>&#160;that are not affected by the value set for <span class="term">safe</span>&#160;but whose default values still contribute towards a more <em>safe</em>&#160;output.) Further, unless overridden by the value for parameter <span class="term">schemes</span>&#160;(see <a href="#s3.4.3">section 3.4.3</a>), the schemes <span class="term">app</span>, <span class="term">data</span>&#160;and <span class="term">javascript</span>&#160;are not permitted, and URLs with schemes are neutralized so that, e.g., <span class="term">style="moz-binding&#58;url(http&#58;//danger)"</span>&#160;becomes <span class="term">style="moz-binding&#58;url(denied&#58;http&#58;//danger)"</span>.<br />
1743<br /> 1743<br />
1744&#160; Admins, however, may still want to completely deny the <span class="term">style</span>&#160;attribute, e.g., with code like<br /> 1744&#160; Admins, however, may still want to completely deny the <span class="term">style</span>&#160;attribute, e.g., with code like<br />
1745<br /> 1745<br />
1746 1746
1747<code class="code">&#160; &#160; $processed = htmLawed($text, array(&#39;safe&#39;=&gt;1, &#39;deny_attribute&#39;=&gt;&#39;style&#39;));</code> 1747<code class="code">&#160; &#160; $processed = htmLawed($text, array(&#39;safe&#39;=&gt;1, &#39;deny_attribute&#39;=&gt;&#39;style&#39;));</code>
1748<br /> 1748<br />
1749<br /> 1749<br />
1750&#160; Permitting the <span class="term">style</span>&#160;attribute brings in risks of <em>click-jacking</em>, etc. CSS property values can render a page non-functional or be used to deface it. Except for URLs, dynamic expressions, and some other things, htmLawed does not completely check <span class="term">style</span>&#160;values. It does provide ways for the code-developer implementing htmLawed to do such checks through the <span class="term">$spec</span>&#160;argument, and through the <span class="term">hook_tag</span>&#160;parameter (see <a href="#s3.4.8">section 3.4.8</a>&#160;for more). Disallowing style completely and relying on CSS classes and stylesheet files is recommended.<br /> 1750&#160; Permitting the <span class="term">style</span>&#160;attribute brings in risks of <em>click-jacking</em>, etc. CSS property values can render a page non-functional or be used to deface it. Except for URLs, dynamic expressions, and some other things, htmLawed does not completely check <span class="term">style</span>&#160;values. It does provide ways for the code-developer implementing htmLawed to do such checks through the <span class="term">$spec</span>&#160;argument, and through the <span class="term">hook_tag</span>&#160;parameter (see <a href="#s3.4.8">section 3.4.8</a>&#160;for more). Disallowing style completely and relying on CSS classes and stylesheet files is recommended.<br />
1751<br /> 1751<br />
1752&#160; If a value for a parameter auto-set through <span class="term">safe</span>&#160;is still manually provided, then that value can over-ride the auto-set value. E.g., with <span class="term">$config["safe"] = 1</span>&#160;and <span class="term">$config["elements"] = "&#42; +script"</span>, <span class="term">script</span>, but not <span class="term">applet</span>, is allowed. Such an over-ride does not occur for <span class="term">deny_attribute</span>&#160;(for legacy reasons) when comma-separated attribute names are provided as the value for this parameter (<a href="#s3.4">section 3.4</a>); instead htmLawed will add <span class="term">on&#42;</span>&#160;to the value provided for <span class="term">deny_attribute</span>.<br /> 1752&#160; If a value for a parameter auto-set through <span class="term">safe</span>&#160;is still manually provided, then that value can over-ride the auto-set value. E.g., with <span class="term">$config["safe"] = 1</span>&#160;and <span class="term">$config["elements"] = "&#42; +script"</span>, <span class="term">script</span>, but not <span class="term">applet</span>, is allowed. Such an over-ride does not occur for <span class="term">deny_attribute</span>&#160;(for legacy reasons) when comma-separated attribute names are provided as the value for this parameter (<a href="#s3.4">section 3.4</a>); instead htmLawed will add <span class="term">on&#42;</span>&#160;to the value provided for <span class="term">deny_attribute</span>.<br />
1753<br /> 1753<br />
1754&#160; A page illustrating the efficacy of htmLawed's anti-XSS abilities with <span class="term">safe</span>&#160;set to <span class="term">1</span>&#160;against XSS vectors listed by <a href="http://ha.ckers.org/xss.html">RSnake</a>&#160;may be available <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm">here</a>.<br /> 1754&#160; A page illustrating the efficacy of htmLawed's anti-XSS abilities with <span class="term">safe</span>&#160;set to <span class="term">1</span>&#160;against XSS vectors listed by <a href="http://ha.ckers.org/xss.html">RSnake</a>&#160;may be available <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm">here</a>.<br />
1755 1755
1756</div> 1756</div>
1757<div class="sub-section"><h3> 1757<div class="sub-section"><h3>
1758<a name="s3.7" id="s3.7"></a><span class="item-no">3.7</span>&#160; Using a hook function 1758<a name="s3.7" id="s3.7"></a><span class="item-no">3.7</span>&#160; Using a hook function
1759</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1759</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1760<br /> 1760<br />
1761&#160; If <span class="term">$config["hook"]</span>&#160;is not set to <span class="term">0</span>, then htmLawed will allow preliminarily processed input to be altered by a hook function named by <span class="term">$config["hook"]</span>&#160;before starting the main work (but after handling of characters, entities, HTML comments and <span class="term">CDATA</span>&#160;sections -- see code for function <span class="term">htmLawed()</span>).<br /> 1761&#160; If <span class="term">$config["hook"]</span>&#160;is not set to <span class="term">0</span>, then htmLawed will allow preliminarily processed input to be altered by a hook function named by <span class="term">$config["hook"]</span>&#160;before starting the main work (but after handling of characters, entities, HTML comments and <span class="term">CDATA</span>&#160;sections -- see code for function <span class="term">htmLawed()</span>).<br />
1762<br /> 1762<br />
1763&#160; The hook function also allows one to alter the <em>finalized</em>&#160;values of <span class="term">$config</span>&#160;and <span class="term">$spec</span>.<br /> 1763&#160; The hook function also allows one to alter the <em>finalized</em>&#160;values of <span class="term">$config</span>&#160;and <span class="term">$spec</span>.<br />
1764<br /> 1764<br />
1765&#160; Note that the <span class="term">hook</span>&#160;parameter is different from the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>).<br /> 1765&#160; Note that the <span class="term">hook</span>&#160;parameter is different from the <span class="term">hook_tag</span>&#160;parameter (<a href="#s3.4.9">section 3.4.9</a>).<br />
1766<br /> 1766<br />
1767&#160; Snippets of hook function code developed by others may be available on the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">htmLawed</a>&#160;website.<br /> 1767&#160; Snippets of hook function code developed by others may be available on the <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">htmLawed</a>&#160;website.<br />
1768 1768
1769</div> 1769</div>
1770<div class="sub-section"><h3> 1770<div class="sub-section"><h3>
1771<a name="s3.8" id="s3.8"></a><span class="item-no">3.8</span>&#160; Obtaining <em>finalized</em>&#160;parameter values 1771<a name="s3.8" id="s3.8"></a><span class="item-no">3.8</span>&#160; Obtaining <em>finalized</em>&#160;parameter values
1772</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1772</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1773<br /> 1773<br />
1774&#160; htmLawed can assign the <em>finalized</em>&#160;<span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;values to a variable named by <span class="term">$config["show_setting"]</span>. The variable, made global by htmLawed, is set as an array with three keys: <span class="term">config</span>, with the <span class="term">$config</span>&#160;value, <span class="term">spec</span>, with the <span class="term">$spec</span>&#160;value, and <span class="term">time</span>, with a value that is the Unix time (the output of PHP's <span class="term">microtime()</span>&#160;function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code.<br /> 1774&#160; htmLawed can assign the <em>finalized</em>&#160;<span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;values to a variable named by <span class="term">$config["show_setting"]</span>. The variable, made global by htmLawed, is set as an array with three keys: <span class="term">config</span>, with the <span class="term">$config</span>&#160;value, <span class="term">spec</span>, with the <span class="term">$spec</span>&#160;value, and <span class="term">time</span>, with a value that is the Unix time (the output of PHP's <span class="term">microtime()</span>&#160;function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code.<br />
1775<br /> 1775<br />
1776&#160; The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers.<br /> 1776&#160; The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers.<br />
1777 1777
1778</div> 1778</div>
1779<div class="sub-section"><h3> 1779<div class="sub-section"><h3>
1780<a name="s3.9" id="s3.9"></a><span class="item-no">3.9</span>&#160; Retaining non-HTML tags in input with mixed markup 1780<a name="s3.9" id="s3.9"></a><span class="item-no">3.9</span>&#160; Retaining non-HTML tags in input with mixed markup
1781</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1781</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1782<br /> 1782<br />
1783&#160; htmLawed does not remove certain characters that, though valid, are nevertheless <em>discouraged</em>&#160;in HTML documents as per the specifications (see <a href="#s5.1">section 5.1</a>). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).<br /> 1783&#160; htmLawed does not remove certain characters that, though valid, are nevertheless <em>discouraged</em>&#160;in HTML documents as per the specifications (see <a href="#s5.1">section 5.1</a>). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).<br />
1784<br /> 1784<br />
1785&#160; To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters with some of the HTML-discouraged characters (see <a href="#s3.1.2">section 3.1.2</a>). Post-htmLawed processing, the replacements are reverted.<br /> 1785&#160; To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters with some of the HTML-discouraged characters (see <a href="#s3.1.2">section 3.1.2</a>). Post-htmLawed processing, the replacements are reverted.<br />
1786<br /> 1786<br />
1787&#160; An example (mixed HTML and PHP code in input text):<br /> 1787&#160; An example (mixed HTML and PHP code in input text):<br />
1788<br /> 1788<br />
1789 1789
1790<code class="code">&#160; &#160; $text = preg_replace(&#39;&#96;&lt;\?php(.+?)\?&gt;&#96;sm&#39;, "\x83?php\\1?\x84", $text);</code> 1790<code class="code">&#160; &#160; $text = preg_replace(&#39;&#96;&lt;\?php(.+?)\?&gt;&#96;sm&#39;, "\x83?php\\1?\x84", $text);</code>
1791<br /> 1791<br />
1792 1792
1793<code class="code">&#160; &#160; $processed = htmLawed($text);</code> 1793<code class="code">&#160; &#160; $processed = htmLawed($text);</code>
1794<br /> 1794<br />
1795 1795
1796<code class="code">&#160; &#160; $processed = preg_replace(&#39;&#96;\x83\?php(.+?)\?\x84&#96;sm&#39;, &#39;&lt;?php$1?&gt;&#39;, $processed);</code> 1796<code class="code">&#160; &#160; $processed = preg_replace(&#39;&#96;\x83\?php(.+?)\?\x84&#96;sm&#39;, &#39;&lt;?php$1?&gt;&#39;, $processed);</code>
1797<br /> 1797<br />
1798<br /> 1798<br />
1799&#160; This code will not work if <span class="term">$config["clean_ms_char"]</span>&#160;is set to <span class="term">1</span>&#160;(<a href="#s3.1">section 3.1</a>), in which case one should instead deploy a hook function (<a href="#s3.7">section 3.7</a>). (htmLawed internally uses certain control characters, code-points <span class="term">1</span>&#160;to <span class="term">7</span>, and use of these characters as markers in the logic of hook functions may cause issues.)<br /> 1799&#160; This code will not work if <span class="term">$config["clean_ms_char"]</span>&#160;is set to <span class="term">1</span>&#160;(<a href="#s3.1">section 3.1</a>), in which case one should instead deploy a hook function (<a href="#s3.7">section 3.7</a>). (htmLawed internally uses certain control characters, code-points <span class="term">1</span>&#160;to <span class="term">7</span>, and use of these characters as markers in the logic of hook functions may cause issues.)<br />
1800<br /> 1800<br />
1801&#160; Admins may also be able to use <span class="term">$config["and_mark"]</span>&#160;to deal with such mixed markup; see <a href="#s3.2">section 3.2</a>.<br /> 1801&#160; Admins may also be able to use <span class="term">$config["and_mark"]</span>&#160;to deal with such mixed markup; see <a href="#s3.2">section 3.2</a>.<br />
1802 1802
1803</div> 1803</div>
1804</div> 1804</div>
1805<div class="section"><h2> 1805<div class="section"><h2>
1806<a name="s4" id="s4"></a><span class="item-no">4</span>&#160; Other 1806<a name="s4" id="s4"></a><span class="item-no">4</span>&#160; Other
1807</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1807</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1808<div class="sub-section"><h3> 1808<div class="sub-section"><h3>
1809<a name="s4.1" id="s4.1"></a><span class="item-no">4.1</span>&#160; Support 1809<a name="s4.1" id="s4.1"></a><span class="item-no">4.1</span>&#160; Support
1810</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1810</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1811<br /> 1811<br />
1812&#160; Software updates and forum-based community-support may be found at <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a>. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at <a href="http://php.net">http://php.net</a>.<br /> 1812&#160; Software updates and forum-based community-support may be found at <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed</a>. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at <a href="http://php.net">http://php.net</a>.<br />
1813 1813
1814</div> 1814</div>
1815<div class="sub-section"><h3> 1815<div class="sub-section"><h3>
1816<a name="s4.2" id="s4.2"></a><span class="item-no">4.2</span>&#160; Known issues 1816<a name="s4.2" id="s4.2"></a><span class="item-no">4.2</span>&#160; Known issues
1817</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1817</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1818<br /> 1818<br />
1819&#160; See <a href="#s2.8">section 2.8</a>.<br /> 1819&#160; See <a href="#s2.8">section 2.8</a>.<br />
1820 1820
1821</div> 1821</div>
1822<div class="sub-section"><h3> 1822<div class="sub-section"><h3>
1823<a name="s4.3" id="s4.3"></a><span class="item-no">4.3</span>&#160; Change-log 1823<a name="s4.3" id="s4.3"></a><span class="item-no">4.3</span>&#160; Change-log
1824</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1824</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1825<br /> 1825<br />
1826&#160; (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the <span class="term">htmLawed.php</span>&#160;file, may be updated without a change-log entry if the secondary files, but not htmLawed per se, are revised.)<br /> 1826&#160; (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the <span class="term">htmLawed.php</span>&#160;file, may be updated without a change-log entry if the secondary files, but not htmLawed per se, are revised.)<br />
1827<br /> 1827<br />
1828&#160; <em>Version number - Release date. Notes</em><br /> 1828&#160; <em>Version number - Release date. Notes</em><br />
1829<br /> 1829<br />
1830&#160; 1.2.5 - 24 September 2019. Fixes two bugs in <span class="term">font</span>&#160;tag transformation<br /> 1830&#160; 1.2.5 - 24 September 2019. Fixes two bugs in <span class="term">font</span>&#160;tag transformation<br />
1831<br /> 1831<br />
1832&#160; 1.2.4.2 - 16 May 2019. Corrects a PHP notice if a semi-colon is present in <span class="term">$config["schemes"]</span><br /> 1832&#160; 1.2.4.2 - 16 May 2019. Corrects a PHP notice if a semi-colon is present in <span class="term">$config["schemes"]</span><br />
1833<br /> 1833<br />
1834&#160; 1.2.4.1 - 12 September 2017. Corrects a function re-declaration bug introduced in version 1.2.4<br /> 1834&#160; 1.2.4.1 - 12 September 2017. Corrects a function re-declaration bug introduced in version 1.2.4<br />
1835<br /> 1835<br />
1836&#160; 1.2.4 - 31 August 2017. Removes use of PHP <span class="term">create_function</span>&#160;function and <span class="term">$php_errormsg</span>&#160;reserved variable (deprecated in PHP 7.2)<br /> 1836&#160; 1.2.4 - 31 August 2017. Removes use of PHP <span class="term">create_function</span>&#160;function and <span class="term">$php_errormsg</span>&#160;reserved variable (deprecated in PHP 7.2)<br />
1837<br /> 1837<br />
1838&#160; 1.2.3 - 5 July 2017. New option value of <span class="term">4</span>&#160;for <span class="term">$config["comments"]</span>&#160;to stop enforcing a space character before the <span class="term">--&gt;</span>&#160;comment-closing marker<br /> 1838&#160; 1.2.3 - 5 July 2017. New option value of <span class="term">4</span>&#160;for <span class="term">$config["comments"]</span>&#160;to stop enforcing a space character before the <span class="term">--&gt;</span>&#160;comment-closing marker<br />
1839<br /> 1839<br />
1840&#160; 1.2.2 - 25 May 2017. Fix for a bug in parsing <span class="term">$spec</span>&#160;that got introduced in version 1.2; also, <span class="term">$spec</span>&#160;is now parsed to accommodate specifications for an HTML element when they are specified in multiple rules<br /> 1840&#160; 1.2.2 - 25 May 2017. Fix for a bug in parsing <span class="term">$spec</span>&#160;that got introduced in version 1.2; also, <span class="term">$spec</span>&#160;is now parsed to accommodate specifications for an HTML element when they are specified in multiple rules<br />
1841<br /> 1841<br />
1842&#160; 1.2.1.1 - 17 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes<br /> 1842&#160; 1.2.1.1 - 17 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes<br />
1843<br /> 1843<br />
1844&#160; 1.2.1 - 15 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes<br /> 1844&#160; 1.2.1 - 15 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes<br />
1845<br /> 1845<br />
1846&#160; 1.2 - 11 February 2017. (First beta release on 26 May 2013). Added support for HTML version 5; ARIA, data-* and microdata attributes; <span class="term">app</span>, <span class="term">data</span>, <span class="term">javascript</span>&#160;and <span class="term">tel</span>&#160;URL schemes (thus, <span class="term">javascript&#58;</span>&#160;is not filtered in default mode). Removed support for code using Kses functions (see <a href="#s2.6">section 2.6</a>). Changes in revisions to the beta releases are not noted here.<br /> 1846&#160; 1.2 - 11 February 2017. (First beta release on 26 May 2013). Added support for HTML version 5; ARIA, data-* and microdata attributes; <span class="term">app</span>, <span class="term">data</span>, <span class="term">javascript</span>&#160;and <span class="term">tel</span>&#160;URL schemes (thus, <span class="term">javascript&#58;</span>&#160;is not filtered in default mode). Removed support for code using Kses functions (see <a href="#s2.6">section 2.6</a>). Changes in revisions to the beta releases are not noted here.<br />
1847<br /> 1847<br />
1848&#160; 1.1.22 - 5 March 2016. Improved testing of attribute value rules specified in <span class="term">$spec</span><br /> 1848&#160; 1.1.22 - 5 March 2016. Improved testing of attribute value rules specified in <span class="term">$spec</span><br />
1849<br /> 1849<br />
1850&#160; 1.1.21 - 27 February 2016. Improvement and security fix in transforming <span class="term">font</span>&#160;element<br /> 1850&#160; 1.1.21 - 27 February 2016. Improvement and security fix in transforming <span class="term">font</span>&#160;element<br />
1851<br /> 1851<br />
1852&#160; 1.1.20 - 9 June 2015. Fix for a potential security vulnerability arising from unescaped double-quote character in single-quoted attribute value of some deprecated elements when tag transformation is enabled; recognition for non-(HTML 4) standard <span class="term">allowfullscreen</span>&#160;attribute of <span class="term">iframe</span><br /> 1852&#160; 1.1.20 - 9 June 2015. Fix for a potential security vulnerability arising from unescaped double-quote character in single-quoted attribute value of some deprecated elements when tag transformation is enabled; recognition for non-(HTML 4) standard <span class="term">allowfullscreen</span>&#160;attribute of <span class="term">iframe</span><br />
1853<br /> 1853<br />
1854&#160; 1.1.19 - 19 January 2015. Fix for a bug in cleaning of soft-hyphens in URL values, etc<br /> 1854&#160; 1.1.19 - 19 January 2015. Fix for a bug in cleaning of soft-hyphens in URL values, etc<br />
1855<br /> 1855<br />
1856&#160; 1.1.18 - 2 August 2014. Fix for a potential security vulnerability arising from specially encoded text with serial opening tags<br /> 1856&#160; 1.1.18 - 2 August 2014. Fix for a potential security vulnerability arising from specially encoded text with serial opening tags<br />
1857<br /> 1857<br />
1858&#160; 1.1.17 - 11 March 2014. Removed use of PHP function preg_replace with <span class="term">e</span>&#160;modifier for compatibility with PHP 5.5.<br /> 1858&#160; 1.1.17 - 11 March 2014. Removed use of PHP function preg_replace with <span class="term">e</span>&#160;modifier for compatibility with PHP 5.5.<br />
1859<br /> 1859<br />
1860&#160; 1.1.16 - 29 August 2013. Fix for a potential security vulnerability arising from specially encoded space characters in URL schemes/protocols<br /> 1860&#160; 1.1.16 - 29 August 2013. Fix for a potential security vulnerability arising from specially encoded space characters in URL schemes/protocols<br />
1861<br /> 1861<br />
1862&#160; 1.1.15 - 11 August 2013. Improved tidying/prettifying functionality<br /> 1862&#160; 1.1.15 - 11 August 2013. Improved tidying/prettifying functionality<br />
1863<br /> 1863<br />
1864&#160; 1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during <span class="term">tidying</span>&#160;when <span class="term">balance</span>&#160;is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like <span class="term">blockquote</span><br /> 1864&#160; 1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during <span class="term">tidying</span>&#160;when <span class="term">balance</span>&#160;is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like <span class="term">blockquote</span><br />
1865<br /> 1865<br />
1866&#160; 1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes<br /> 1866&#160; 1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes<br />
1867<br /> 1867<br />
1868&#160; 1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the <span class="term">face</span>&#160;attribute<br /> 1868&#160; 1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the <span class="term">face</span>&#160;attribute<br />
1869<br /> 1869<br />
1870&#160; 1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload environment. <span class="term">$config["hook_tag"]</span>, if specified, now receives names of elements in closing tags.<br /> 1870&#160; 1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload environment. <span class="term">$config["hook_tag"]</span>, if specified, now receives names of elements in closing tags.<br />
1871<br /> 1871<br />
1872&#160; 1.1.10 - 22 October 2011. Fix for a bug in the <span class="term">tidy</span>&#160;functionality that caused the entire input to be replaced with a single space; new parameter, <span class="term">$config["direct_list_nest"]</span>&#160;to allow direct descendance of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.)<br /> 1872&#160; 1.1.10 - 22 October 2011. Fix for a bug in the <span class="term">tidy</span>&#160;functionality that caused the entire input to be replaced with a single space; new parameter, <span class="term">$config["direct_list_nest"]</span>&#160;to allow direct descendance of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.)<br />
1873<br /> 1873<br />
1874&#160; 1.1.9.5 - 6 July 2011. Minor correction of a rule for nesting of <span class="term">li</span>&#160;within <span class="term">dir</span><br /> 1874&#160; 1.1.9.5 - 6 July 2011. Minor correction of a rule for nesting of <span class="term">li</span>&#160;within <span class="term">dir</span><br />
1875<br /> 1875<br />
1876&#160; 1.1.9.4 - 3 July 2010. Parameter <span class="term">schemes</span>&#160;now accepts <span class="term">!</span>&#160;so any URL, even a local one, can be <em>denied</em>. An issue in which a second URL value in <span class="term">style</span>&#160;properties was not checked was fixed.<br /> 1876&#160; 1.1.9.4 - 3 July 2010. Parameter <span class="term">schemes</span>&#160;now accepts <span class="term">!</span>&#160;so any URL, even a local one, can be <em>denied</em>. An issue in which a second URL value in <span class="term">style</span>&#160;properties was not checked was fixed.<br />
1877<br /> 1877<br />
1878&#160; 1.1.9.3 - 17 May 2010. Checks for correct nesting of <span class="term">param</span><br /> 1878&#160; 1.1.9.3 - 17 May 2010. Checks for correct nesting of <span class="term">param</span><br />
1879<br /> 1879<br />
1880&#160; 1.1.9.2 - 26 April 2010. Minor fix regarding rendering of denied URL schemes<br /> 1880&#160; 1.1.9.2 - 26 April 2010. Minor fix regarding rendering of denied URL schemes<br />
1881<br /> 1881<br />
1882&#160; 1.1.9.1 - 26 February 2010. htmLawed now uses the LGPL version 3 license; support for <span class="term">flashvars</span>&#160;attribute for <span class="term">embed</span><br /> 1882&#160; 1.1.9.1 - 26 February 2010. htmLawed now uses the LGPL version 3 license; support for <span class="term">flashvars</span>&#160;attribute for <span class="term">embed</span><br />
1883<br /> 1883<br />
1884&#160; 1.1.9 - 22 December 2009. Soft-hyphens are now removed only from URL-accepting attribute values<br /> 1884&#160; 1.1.9 - 22 December 2009. Soft-hyphens are now removed only from URL-accepting attribute values<br />
1885<br /> 1885<br />
1886&#160; 1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice<br /> 1886&#160; 1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice<br />
1887<br /> 1887<br />
1888&#160; 1.1.8 - 23 April 2009. Parameter <span class="term">deny_attribute</span>&#160;now accepts the wild-card <span class="term">&#42;</span>, making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting <span class="term">$spec</span><br /> 1888&#160; 1.1.8 - 23 April 2009. Parameter <span class="term">deny_attribute</span>&#160;now accepts the wild-card <span class="term">&#42;</span>, making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting <span class="term">$spec</span><br />
1889<br /> 1889<br />
1890&#160; 1.1.7 - 11-12 March 2009. Attributes globally denied through <span class="term">deny_attribute</span>&#160;can be allowed element-specifically through <span class="term">$spec</span>; <span class="term">$config["style_pass"]</span>, which lets any <span class="term">style</span>&#160;value through, introduced; altered logic to catch certain types of dynamic crafted CSS expressions<br /> 1890&#160; 1.1.7 - 11-12 March 2009. Attributes globally denied through <span class="term">deny_attribute</span>&#160;can be allowed element-specifically through <span class="term">$spec</span>; <span class="term">$config["style_pass"]</span>, which lets any <span class="term">style</span>&#160;value through, introduced; altered logic to catch certain types of dynamic crafted CSS expressions<br />
1891<br /> 1891<br />
1892&#160; 1.1.3-6 - 28-31 January - 4 February 2009. Altered logic to catch certain types of dynamic crafted CSS expressions<br /> 1892&#160; 1.1.3-6 - 28-31 January - 4 February 2009. Altered logic to catch certain types of dynamic crafted CSS expressions<br />
1893<br /> 1893<br />
1894&#160; 1.1.2 - 22 January 2009. Fixed bug in parsing of <span class="term">font</span>&#160;attributes during tag transformation<br /> 1894&#160; 1.1.2 - 22 January 2009. Fixed bug in parsing of <span class="term">font</span>&#160;attributes during tag transformation<br />
1895<br /> 1895<br />
1896&#160; 1.1.1 - 27 September 2008. Better nesting correction when omittable closing tags are absent<br /> 1896&#160; 1.1.1 - 27 September 2008. Better nesting correction when omittable closing tags are absent<br />
1897<br /> 1897<br />
1898&#160; 1.1 - 29 June 2008. <span class="term">$config["hook_tag"]</span>&#160;and <span class="term">$config["tidy"]</span>&#160;introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug<br /> 1898&#160; 1.1 - 29 June 2008. <span class="term">$config["hook_tag"]</span>&#160;and <span class="term">$config["tidy"]</span>&#160;introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug<br />
1899<br /> 1899<br />
1900&#160; 1.0.9 - 11 June 2008. Fix for a bug in checks for invalid HTML code-point entities<br /> 1900&#160; 1.0.9 - 11 June 2008. Fix for a bug in checks for invalid HTML code-point entities<br />
1901<br /> 1901<br />
1902&#160; 1.0.8 - 15 May 2008. Permit <span class="term">bordercolor</span>&#160;attribute for <span class="term">table</span>, <span class="term">td</span>&#160;and <span class="term">tr</span><br /> 1902&#160; 1.0.8 - 15 May 2008. Permit <span class="term">bordercolor</span>&#160;attribute for <span class="term">table</span>, <span class="term">td</span>&#160;and <span class="term">tr</span><br />
1903<br /> 1903<br />
1904&#160; 1.0.7 - 1 May 2008. Support for <span class="term">wmode</span>&#160;attribute for <span class="term">embed</span>; <span class="term">$config["show_setting"]</span>&#160;introduced; improved <span class="term">$config["elements"]</span>&#160;evaluation<br /> 1904&#160; 1.0.7 - 1 May 2008. Support for <span class="term">wmode</span>&#160;attribute for <span class="term">embed</span>; <span class="term">$config["show_setting"]</span>&#160;introduced; improved <span class="term">$config["elements"]</span>&#160;evaluation<br />
1905<br /> 1905<br />
1906&#160; 1.0.6 - 20 April 2008. <span class="term">$config["and_mark"]</span>&#160;introduced<br /> 1906&#160; 1.0.6 - 20 April 2008. <span class="term">$config["and_mark"]</span>&#160;introduced<br />
1907<br /> 1907<br />
1908&#160; 1.0.5 - 12 March 2008. <span class="term">style</span>&#160;URL schemes essentially disallowed when $config <span class="term">safe</span>&#160;is on; improved regex for CSS expression search<br /> 1908&#160; 1.0.5 - 12 March 2008. <span class="term">style</span>&#160;URL schemes essentially disallowed when $config <span class="term">safe</span>&#160;is on; improved regex for CSS expression search<br />
1909<br /> 1909<br />
1910&#160; 1.0.4 - 10 March 2008. Improved corrections for <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span>&#160;and <span class="term">noscript</span><br /> 1910&#160; 1.0.4 - 10 March 2008. Improved corrections for <span class="term">blockquote</span>, <span class="term">form</span>, <span class="term">map</span>&#160;and <span class="term">noscript</span><br />
1911<br /> 1911<br />
1912&#160; 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); fix for a bug allowing <span class="term">td</span>&#160;directly inside <span class="term">table</span>; <span class="term">$config["safe"]</span>&#160;introduced<br /> 1912&#160; 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); fix for a bug allowing <span class="term">td</span>&#160;directly inside <span class="term">table</span>; <span class="term">$config["safe"]</span>&#160;introduced<br />
1913<br /> 1913<br />
1914&#160; 1.0.2 - 13 February 2008. Improved implementation of <span class="term">$config["keep_bad"]</span><br /> 1914&#160; 1.0.2 - 13 February 2008. Improved implementation of <span class="term">$config["keep_bad"]</span><br />
1915<br /> 1915<br />
1916&#160; 1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions (<span class="term">hl_tag()</span>&#160;and <span class="term">hl_prot()</span>); no error display with <span class="term">hl_regex()</span><br /> 1916&#160; 1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions (<span class="term">hl_tag()</span>&#160;and <span class="term">hl_prot()</span>); no error display with <span class="term">hl_regex()</span><br />
1917<br /> 1917<br />
1918&#160; 1.0 - 2 November 2007. First release<br /> 1918&#160; 1.0 - 2 November 2007. First release<br />
1919 1919
1920</div> 1920</div>
1921<div class="sub-section"><h3> 1921<div class="sub-section"><h3>
1922<a name="s4.4" id="s4.4"></a><span class="item-no">4.4</span>&#160; Testing 1922<a name="s4.4" id="s4.4"></a><span class="item-no">4.4</span>&#160; Testing
1923</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1923</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1924<br /> 1924<br />
1925&#160; To test htmLawed using a form interface, a <a href="htmLawedTest.php">demo</a>&#160;web-page is provided with the htmLawed distribution (<span class="term">htmLawed.php</span>&#160;and <span class="term">htmLawedTest.php</span>&#160;should be in the same directory on the web-server). A file with <a href="htmLawed_TESTCASE.txt">test-cases</a>&#160;is also provided.<br /> 1925&#160; To test htmLawed using a form interface, a <a href="htmLawedTest.php">demo</a>&#160;web-page is provided with the htmLawed distribution (<span class="term">htmLawed.php</span>&#160;and <span class="term">htmLawedTest.php</span>&#160;should be in the same directory on the web-server). A file with <a href="htmLawed_TESTCASE.txt">test-cases</a>&#160;is also provided.<br />
1926 1926
1927</div> 1927</div>
1928<div class="sub-section"><h3> 1928<div class="sub-section"><h3>
1929<a name="s4.5" id="s4.5"></a><span class="item-no">4.5</span>&#160; Upgrade, &amp; old versions 1929<a name="s4.5" id="s4.5"></a><span class="item-no">4.5</span>&#160; Upgrade, &amp; old versions
1930</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1930</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1931<br /> 1931<br />
1932&#160; Upgrading is as simple as replacing the previous version of <span class="term">htmLawed.php</span>, assuming the file was not modified for customized features. As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content.<br /> 1932&#160; Upgrading is as simple as replacing the previous version of <span class="term">htmLawed.php</span>, assuming the file was not modified for customized features. As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content.<br />
1933<br /> 1933<br />
1934&#160; <strong>Note:</strong>&#160;The following upgrades may affect the functionality of a specific htmLawed installation:<br /> 1934&#160; <strong>Note:</strong>&#160;The following upgrades may affect the functionality of a specific htmLawed installation:<br />
1935<br /> 1935<br />
1936&#160; (1) From version 1.1-1.1.10 to 1.1.11 or later, if a <span class="term">hook_tag</span>&#160;function is in use: In version 1.1.11 and later, elements in closing tags (and not just the opening tags) are also passed to the function. There are no attribute names/values to pass, so a <span class="term">hook_tag</span>&#160;function receives only the element name. The <span class="term">hook_tag</span>&#160;function therefore may have to be edited. See <a href="#s3.4.9">section 3.4.9</a>.<br /> 1936&#160; (1) From version 1.1-1.1.10 to 1.1.11 or later, if a <span class="term">hook_tag</span>&#160;function is in use: In version 1.1.11 and later, elements in closing tags (and not just the opening tags) are also passed to the function. There are no attribute names/values to pass, so a <span class="term">hook_tag</span>&#160;function receives only the element name. The <span class="term">hook_tag</span>&#160;function therefore may have to be edited. See <a href="#s3.4.9">section 3.4.9</a>.<br />
1937<br /> 1937<br />
1938&#160; (2) From a version older than 1.2.beta to 1.2.beta or later, if htmLawed was used as a Kses replacement with Kses code in use: In version 1.2.beta or later, htmLawed no longer provides direct support for code that uses Kses functions (see <a href="#s2.6">section 2.6</a>).<br /> 1938&#160; (2) From a version older than 1.2.beta to 1.2.beta or later, if htmLawed was used as a Kses replacement with Kses code in use: In version 1.2.beta or later, htmLawed no longer provides direct support for code that uses Kses functions (see <a href="#s2.6">section 2.6</a>).<br />
1939<br /> 1939<br />
1940&#160; (3) From version older than 1.2 to later, if htmLawed is used without <span class="term">$config["safe"]</span>&#160;set to 1: Unlike previous versions, htmLawed version 1.2 and later permit <span class="term">data</span>&#160;and <span class="term">javascript</span>&#160;URL schemes by default (see <a href="#s3.4.3">section 3.4.3</a>).<br /> 1940&#160; (3) From version older than 1.2 to later, if htmLawed is used without <span class="term">$config["safe"]</span>&#160;set to 1: Unlike previous versions, htmLawed version 1.2 and later permit <span class="term">data</span>&#160;and <span class="term">javascript</span>&#160;URL schemes by default (see <a href="#s3.4.3">section 3.4.3</a>).<br />
1941<br /> 1941<br />
1942&#160; Old versions of htmLawed may be available online. E.g., for version 1.0, check <a href="http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip">http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip</a>; for 1.1.1, <a href="http://www.bioinformatics.org/phplabware/downloads/htmLawed111.zip">http://www.bioinformatics.org/phplabware/downloads/htmLawed111.zip</a>; and for 1.1.22, <a href="http://www.bioinformatics.org/phplabware/downloads/htmLawed1122.zip">http://www.bioinformatics.org/phplabware/downloads/htmLawed1122.zip</a>.<br /> 1942&#160; Old versions of htmLawed may be available online. E.g., for version 1.0, check <a href="http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip">http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip</a>; for 1.1.1, <a href="http://www.bioinformatics.org/phplabware/downloads/htmLawed111.zip">http://www.bioinformatics.org/phplabware/downloads/htmLawed111.zip</a>; and for 1.1.22, <a href="http://www.bioinformatics.org/phplabware/downloads/htmLawed1122.zip">http://www.bioinformatics.org/phplabware/downloads/htmLawed1122.zip</a>.<br />
1943 1943
1944</div> 1944</div>
1945<div class="sub-section"><h3> 1945<div class="sub-section"><h3>
1946<a name="s4.6" id="s4.6"></a><span class="item-no">4.6</span>&#160; Comparison with <span class="term">HTMLPurifier</span> 1946<a name="s4.6" id="s4.6"></a><span class="item-no">4.6</span>&#160; Comparison with <span class="term">HTMLPurifier</span>
1947</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1947</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1948<br /> 1948<br />
1949&#160; The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of year 2015):<br /> 1949&#160; The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of year 2015):<br />
1950<br /> 1950<br />
1951&#160; * &#160;does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)<br /> 1951&#160; * &#160;does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)<br />
1952<br /> 1952<br />
1953&#160; * &#160;is 15-20 times bigger (scores of files totalling more than 750 kb)<br /> 1953&#160; * &#160;is 15-20 times bigger (scores of files totalling more than 750 kb)<br />
1954<br /> 1954<br />
1955&#160; * &#160;consumes 10-15 times more RAM (just including the HTMLPurifier files without calling the filter requires a few MBs of memory)<br /> 1955&#160; * &#160;consumes 10-15 times more RAM (just including the HTMLPurifier files without calling the filter requires a few MBs of memory)<br />
1956<br /> 1956<br />
1957&#160; * &#160;is expectedly slower<br /> 1957&#160; * &#160;is expectedly slower<br />
1958<br /> 1958<br />
1959&#160; * &#160;lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification)<br /> 1959&#160; * &#160;lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification)<br />
1960<br /> 1960<br />
1961&#160; * &#160;has poor documentation<br /> 1961&#160; * &#160;has poor documentation<br />
1962<br /> 1962<br />
1963&#160; However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier <a href="http://htmlpurifier.org">website</a>&#160;for updated information.<br /> 1963&#160; However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier <a href="http://htmlpurifier.org">website</a>&#160;for updated information.<br />
1964 1964
1965</div> 1965</div>
1966<div class="sub-section"><h3> 1966<div class="sub-section"><h3>
1967<a name="s4.7" id="s4.7"></a><span class="item-no">4.7</span>&#160; Use through application plug-ins/modules 1967<a name="s4.7" id="s4.7"></a><span class="item-no">4.7</span>&#160; Use through application plug-ins/modules
1968</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1968</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1969<br /> 1969<br />
1970&#160; Plug-ins/modules to implement htmLawed in applications such as Drupal may have been developed. Check the application websites and the htmLawed <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">forum</a>.<br /> 1970&#160; Plug-ins/modules to implement htmLawed in applications such as Drupal may have been developed. Check the application websites and the htmLawed <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">forum</a>.<br />
1971 1971
1972</div> 1972</div>
1973<div class="sub-section"><h3> 1973<div class="sub-section"><h3>
1974<a name="s4.8" id="s4.8"></a><span class="item-no">4.8</span>&#160; Use in non-PHP applications 1974<a name="s4.8" id="s4.8"></a><span class="item-no">4.8</span>&#160; Use in non-PHP applications
1975</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1975</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1976<br /> 1976<br />
1977&#160; Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">site</a>.<br /> 1977&#160; Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed <a href="http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed">site</a>.<br />
1978 1978
1979</div> 1979</div>
1980<div class="sub-section"><h3> 1980<div class="sub-section"><h3>
1981<a name="s4.9" id="s4.9"></a><span class="item-no">4.9</span>&#160; Donate 1981<a name="s4.9" id="s4.9"></a><span class="item-no">4.9</span>&#160; Donate
1982</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1982</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1983<br /> 1983<br />
1984&#160; A donation in any currency and amount to appreciate or support this software can be sent by <a href="http://paypal.com">PayPal</a>&#160;to this email address: drpatnaik at yahoo dot com.<br /> 1984&#160; A donation in any currency and amount to appreciate or support this software can be sent by <a href="http://paypal.com">PayPal</a>&#160;to this email address: drpatnaik at yahoo dot com.<br />
1985 1985
1986</div> 1986</div>
1987<div class="sub-section"><h3> 1987<div class="sub-section"><h3>
1988<a name="s4.10" id="s4.10"></a><span class="item-no">4.10</span>&#160; Acknowledgements 1988<a name="s4.10" id="s4.10"></a><span class="item-no">4.10</span>&#160; Acknowledgements
1989</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1989</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
1990<br /> 1990<br />
1991&#160; Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Alexandre Chouinard, Ulf Harnhammer, Gareth Heyes, Hakre, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Psych0tr1a, Lincoln Russell, Tomas Sykorka, Harro Verton, Edward Yang, and many anonymous users.<br /> 1991&#160; Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Alexandre Chouinard, Ulf Harnhammer, Gareth Heyes, Hakre, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Psych0tr1a, Lincoln Russell, Tomas Sykorka, Harro Verton, Edward Yang, and many anonymous users.<br />
1992<br /> 1992<br />
1993&#160; Thank you!<br /> 1993&#160; Thank you!<br />
1994 1994
1995</div> 1995</div>
1996</div> 1996</div>
1997<div class="section"><h2> 1997<div class="section"><h2>
1998<a name="s5" id="s5"></a><span class="item-no">5</span>&#160; Appendices 1998<a name="s5" id="s5"></a><span class="item-no">5</span>&#160; Appendices
1999</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 1999</h2><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
2000<div class="sub-section"><h3> 2000<div class="sub-section"><h3>
2001<a name="s5.1" id="s5.1"></a><span class="item-no">5.1</span>&#160; Characters discouraged in XHTML 2001<a name="s5.1" id="s5.1"></a><span class="item-no">5.1</span>&#160; Characters discouraged in XHTML
2002</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 2002</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
2003<br /> 2003<br />
2004&#160; Characters represented by the following hexadecimal code-points are <em>not</em>&#160;invalid, even though some validators may issue messages stating otherwise.<br /> 2004&#160; Characters represented by the following hexadecimal code-points are <em>not</em>&#160;invalid, even though some validators may issue messages stating otherwise.<br />
2005<br /> 2005<br />
2006&#160; <span class="term">7f</span>&#160;to <span class="term">84</span>, <span class="term">86</span>&#160;to <span class="term">9f</span>, <span class="term">fdd0</span>&#160;to <span class="term">fddf</span>, <span class="term">1fffe</span>, <span class="term">1ffff</span>, <span class="term">2fffe</span>, <span class="term">2ffff</span>, <span class="term">3fffe</span>, <span class="term">3ffff</span>, <span class="term">4fffe</span>, <span class="term">4ffff</span>, <span class="term">5fffe</span>, <span class="term">5ffff</span>, <span class="term">6fffe</span>, <span class="term">6ffff</span>, <span class="term">7fffe</span>, <span class="term">7ffff</span>, <span class="term">8fffe</span>, <span class="term">8ffff</span>, <span class="term">9fffe</span>, <span class="term">9ffff</span>, <span class="term">afffe</span>, <span class="term">affff</span>, <span class="term">bfffe</span>, <span class="term">bffff</span>, <span class="term">cfffe</span>, <span class="term">cffff</span>, <span class="term">dfffe</span>, <span class="term">dffff</span>, <span class="term">efffe</span>, <span class="term">effff</span>, <span class="term">ffffe</span>, <span class="term">fffff</span>, <span class="term">10fffe</span>&#160;and <span class="term">10ffff</span><br /> 2006&#160; <span class="term">7f</span>&#160;to <span class="term">84</span>, <span class="term">86</span>&#160;to <span class="term">9f</span>, <span class="term">fdd0</span>&#160;to <span class="term">fddf</span>, <span class="term">1fffe</span>, <span class="term">1ffff</span>, <span class="term">2fffe</span>, <span class="term">2ffff</span>, <span class="term">3fffe</span>, <span class="term">3ffff</span>, <span class="term">4fffe</span>, <span class="term">4ffff</span>, <span class="term">5fffe</span>, <span class="term">5ffff</span>, <span class="term">6fffe</span>, <span class="term">6ffff</span>, <span class="term">7fffe</span>, <span class="term">7ffff</span>, <span class="term">8fffe</span>, 
<span class="term">8ffff</span>, <span class="term">9fffe</span>, <span class="term">9ffff</span>, <span class="term">afffe</span>, <span class="term">affff</span>, <span class="term">bfffe</span>, <span class="term">bffff</span>, <span class="term">cfffe</span>, <span class="term">cffff</span>, <span class="term">dfffe</span>, <span class="term">dffff</span>, <span class="term">efffe</span>, <span class="term">effff</span>, <span class="term">ffffe</span>, <span class="term">fffff</span>, <span class="term">10fffe</span>&#160;and <span class="term">10ffff</span><br />
2007 2007
2008</div> 2008</div>
2009<div class="sub-section"><h3> 2009<div class="sub-section"><h3>
2010<a name="s5.2" id="s5.2"></a><span class="item-no">5.2</span>&#160; Valid attribute-element combinations 2010<a name="s5.2" id="s5.2"></a><span class="item-no">5.2</span>&#160; Valid attribute-element combinations
2011</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 2011</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
2012<br /> 2012<br />
2013&#160; * &#160;includes deprecated attributes (marked <span class="term">^</span>), attributes for microdata (marked <span class="term">&#42;</span>), the non-standard <span class="term">bordercolor</span>, and new-in-HTML5 attributes (marked <span class="term">~</span>); can have multiple comma-separated values (marked <span class="term">%</span>); can have multiple space-separated values (marked <span class="term">$</span>)<br /> 2013&#160; * &#160;includes deprecated attributes (marked <span class="term">^</span>), attributes for microdata (marked <span class="term">&#42;</span>), the non-standard <span class="term">bordercolor</span>, and new-in-HTML5 attributes (marked <span class="term">~</span>); can have multiple comma-separated values (marked <span class="term">%</span>); can have multiple space-separated values (marked <span class="term">$</span>)<br />
2014&#160; * &#160;only non-frameset, HTML body elements<br /> 2014&#160; * &#160;only non-frameset, HTML body elements<br />
2015&#160; * &#160;<span class="term">name</span>&#160;for <span class="term">a</span>&#160;and <span class="term">map</span>, and <span class="term">lang</span>&#160;are invalid in XHTML 1.1<br /> 2015&#160; * &#160;<span class="term">name</span>&#160;for <span class="term">a</span>&#160;and <span class="term">map</span>, and <span class="term">lang</span>&#160;are invalid in XHTML 1.1<br />
2016&#160; * &#160;<span class="term">target</span>&#160;is valid for <span class="term">a</span>&#160;in XHTML 1.1 and higher<br /> 2016&#160; * &#160;<span class="term">target</span>&#160;is valid for <span class="term">a</span>&#160;in XHTML 1.1 and higher<br />
2017&#160; * &#160;<span class="term">xml&#58;space</span>&#160;is only for XHTML 1.1<br /> 2017&#160; * &#160;<span class="term">xml&#58;space</span>&#160;is only for XHTML 1.1<br />
2018<br /> 2018<br />
2019&#160; abbr - td, th<br /> 2019&#160; abbr - td, th<br />
2020&#160; accept - form, input<br /> 2020&#160; accept - form, input<br />
2021&#160; accept-charset - form<br /> 2021&#160; accept-charset - form<br />
2022&#160; action - form<br /> 2022&#160; action - form<br />
2023&#160; align - applet, caption^, col, colgroup, div^, embed, h1^, h2^, h3^, h4^, h5^, h6^, hr^, iframe, img^, input^, legend^, object^, p^, table^, tbody, td, tfoot, th, thead, tr<br /> 2023&#160; align - applet, caption^, col, colgroup, div^, embed, h1^, h2^, h3^, h4^, h5^, h6^, hr^, iframe, img^, input^, legend^, object^, p^, table^, tbody, td, tfoot, th, thead, tr<br />
2024&#160; allowfullscreen - iframe<br /> 2024&#160; allowfullscreen - iframe<br />
2025&#160; alt - applet, area, img, input<br /> 2025&#160; alt - applet, area, img, input<br />
2026&#160; archive - applet, object<br /> 2026&#160; archive - applet, object<br />
2027&#160; async~ - script<br /> 2027&#160; async~ - script<br />
2028&#160; autocomplete~ - input<br /> 2028&#160; autocomplete~ - input<br />
2029&#160; autofocus~ - button, input, keygen, select, textarea<br /> 2029&#160; autofocus~ - button, input, keygen, select, textarea<br />
2030&#160; autoplay~ - audio, video<br /> 2030&#160; autoplay~ - audio, video<br />
2031&#160; axis - td, th<br /> 2031&#160; axis - td, th<br />
2032&#160; bgcolor - embed, table^, td^, th^, tr^<br /> 2032&#160; bgcolor - embed, table^, td^, th^, tr^<br />
2033&#160; border - img, object^, table<br /> 2033&#160; border - img, object^, table<br />
2034&#160; bordercolor - table, td, tr<br /> 2034&#160; bordercolor - table, td, tr<br />
2035&#160; cellpadding - table<br /> 2035&#160; cellpadding - table<br />
2036&#160; cellspacing - table<br /> 2036&#160; cellspacing - table<br />
2037&#160; challenge~ - keygen<br /> 2037&#160; challenge~ - keygen<br />
2038&#160; char - col, colgroup, tbody, td, tfoot, th, thead, tr<br /> 2038&#160; char - col, colgroup, tbody, td, tfoot, th, thead, tr<br />
2039&#160; charoff - col, colgroup, tbody, td, tfoot, th, thead, tr<br /> 2039&#160; charoff - col, colgroup, tbody, td, tfoot, th, thead, tr<br />
2040&#160; charset - a, script<br /> 2040&#160; charset - a, script<br />
2041&#160; checked - command, input<br /> 2041&#160; checked - command, input<br />
2042&#160; cite - blockquote, del, ins, q<br /> 2042&#160; cite - blockquote, del, ins, q<br />
2043&#160; classid - object<br /> 2043&#160; classid - object<br />
2044&#160; clear - br^<br /> 2044&#160; clear - br^<br />
2045&#160; code - applet<br /> 2045&#160; code - applet<br />
2046&#160; codebase - object, applet<br /> 2046&#160; codebase - object, applet<br />
2047&#160; codetype - object<br /> 2047&#160; codetype - object<br />
2048&#160; color - font<br /> 2048&#160; color - font<br />
2049&#160; cols - textarea<br /> 2049&#160; cols - textarea<br />
2050&#160; colspan - td, th<br /> 2050&#160; colspan - td, th<br />
2051&#160; compact - dir, dl^, menu, ol^, ul^<br /> 2051&#160; compact - dir, dl^, menu, ol^, ul^<br />
2052&#160; content - meta<br /> 2052&#160; content - meta<br />
2053&#160; controls~ - audio, video<br /> 2053&#160; controls~ - audio, video<br />
2054&#160; coords - area, a<br /> 2054&#160; coords - area, a<br />
2055&#160; crossorigin~ - img<br /> 2055&#160; crossorigin~ - img<br />
2056&#160; data - object<br /> 2056&#160; data - object<br />
2057&#160; datetime - del, ins, time<br /> 2057&#160; datetime - del, ins, time<br />
2058&#160; declare - object<br /> 2058&#160; declare - object<br />
2059&#160; default~ - track<br /> 2059&#160; default~ - track<br />
2060&#160; defer - script<br /> 2060&#160; defer - script<br />
2061&#160; dir - bdo<br /> 2061&#160; dir - bdo<br />
2062&#160; dirname~ - input, textarea<br /> 2062&#160; dirname~ - input, textarea<br />
2063&#160; disabled - button, command, fieldset, input, keygen, optgroup, option, select, textarea<br /> 2063&#160; disabled - button, command, fieldset, input, keygen, optgroup, option, select, textarea<br />
2064&#160; download~ - a<br /> 2064&#160; download~ - a<br />
2065&#160; enctype - form<br /> 2065&#160; enctype - form<br />
2066&#160; face - font<br /> 2066&#160; face - font<br />
2067&#160; flashvars** - embed<br /> 2067&#160; flashvars** - embed<br />
2068&#160; for - label, output<br /> 2068&#160; for - label, output<br />
2069&#160; form~ - button, fieldset, input, keygen, label, object, output, select, textarea<br /> 2069&#160; form~ - button, fieldset, input, keygen, label, object, output, select, textarea<br />
2070&#160; formaction~ - button, input<br /> 2070&#160; formaction~ - button, input<br />
2071&#160; formenctype~ - button, input<br /> 2071&#160; formenctype~ - button, input<br />
2072&#160; formmethod~ - button, input<br /> 2072&#160; formmethod~ - button, input<br />
2073&#160; formnovalidate~ - button, input<br /> 2073&#160; formnovalidate~ - button, input<br />
2074&#160; formtarget~ - button, input<br /> 2074&#160; formtarget~ - button, input<br />
2075&#160; frame - table<br /> 2075&#160; frame - table<br />
2076&#160; frameborder - iframe<br /> 2076&#160; frameborder - iframe<br />
2077&#160; headers - td, th<br /> 2077&#160; headers - td, th<br />
2078&#160; height - applet, canvas, embed, iframe, img, input, object, td^, th^, video<br /> 2078&#160; height - applet, canvas, embed, iframe, img, input, object, td^, th^, video<br />
2079&#160; high~ - meter<br /> 2079&#160; high~ - meter<br />
2080&#160; href - a, area, link<br /> 2080&#160; href - a, area, link<br />
2081&#160; hreflang - a, area, link<br /> 2081&#160; hreflang - a, area, link<br />
2082&#160; hspace - applet, embed, img^, object^<br /> 2082&#160; hspace - applet, embed, img^, object^<br />
2083&#160; icon~ - command<br /> 2083&#160; icon~ - command<br />
2084&#160; ismap - img, input<br /> 2084&#160; ismap - img, input<br />
2085&#160; keytype~ - keygen<br /> 2085&#160; keytype~ - keygen<br />
2086&#160; keyparams~ - keygen<br /> 2086&#160; keyparams~ - keygen<br />
2087&#160; kind~ - track<br /> 2087&#160; kind~ - track<br />
2088&#160; label - command, menu, option, optgroup, track<br /> 2088&#160; label - command, menu, option, optgroup, track<br />
2089&#160; language - script^<br /> 2089&#160; language - script^<br />
2090&#160; list~ - input<br /> 2090&#160; list~ - input<br />
2091&#160; longdesc - img, iframe<br /> 2091&#160; longdesc - img, iframe<br />
2092&#160; loop~ - audio, video<br /> 2092&#160; loop~ - audio, video<br />
2093&#160; low~ - meter<br /> 2093&#160; low~ - meter<br />
2094&#160; marginheight - iframe<br /> 2094&#160; marginheight - iframe<br />
2095&#160; marginwidth - iframe<br /> 2095&#160; marginwidth - iframe<br />
2096&#160; max~ - input, meter, progress<br /> 2096&#160; max~ - input, meter, progress<br />
2097&#160; maxlength - input, textarea<br /> 2097&#160; maxlength - input, textarea<br />
2098&#160; media~ - a, area, link, source, style<br /> 2098&#160; media~ - a, area, link, source, style<br />
2099&#160; mediagroup~ - audio, video<br /> 2099&#160; mediagroup~ - audio, video<br />
2100&#160; method - form<br /> 2100&#160; method - form<br />
2101&#160; min~ - input, meter<br /> 2101&#160; min~ - input, meter<br />
2102&#160; model** - embed<br /> 2102&#160; model** - embed<br />
2103&#160; multiple - input, select<br /> 2103&#160; multiple - input, select<br />
2104&#160; muted~ - audio, video<br /> 2104&#160; muted~ - audio, video<br />
2105&#160; name - a^, applet^, button, embed, fieldset, form^, iframe^, img^, input, keygen, map^, object, output, param, select, textarea<br /> 2105&#160; name - a^, applet^, button, embed, fieldset, form^, iframe^, img^, input, keygen, map^, object, output, param, select, textarea<br />
2106&#160; nohref - area<br /> 2106&#160; nohref - area<br />
2107&#160; noshade - hr^<br /> 2107&#160; noshade - hr^<br />
2108&#160; novalidate~ - form<br /> 2108&#160; novalidate~ - form<br />
2109&#160; nowrap - td^, th^<br /> 2109&#160; nowrap - td^, th^<br />
2110&#160; object - applet<br /> 2110&#160; object - applet<br />
2111&#160; open~ - details<br /> 2111&#160; open~ - details<br />
2112&#160; optimum~ - meter<br /> 2112&#160; optimum~ - meter<br />
2113&#160; pattern~ - input<br /> 2113&#160; pattern~ - input<br />
2114&#160; ping~ - a, area<br /> 2114&#160; ping~ - a, area<br />
2115&#160; placeholder~ - input, textarea<br /> 2115&#160; placeholder~ - input, textarea<br />
2116&#160; pluginspage** - embed<br /> 2116&#160; pluginspage** - embed<br />
2117&#160; pluginurl** - embed<br /> 2117&#160; pluginurl** - embed<br />
2118&#160; poster~ - video<br /> 2118&#160; poster~ - video<br />
2119&#160; pqg~ - keygen<br /> 2119&#160; pqg~ - keygen<br />
2120&#160; preload~ - audio, video<br /> 2120&#160; preload~ - audio, video<br />
2121&#160; prompt - isindex<br /> 2121&#160; prompt - isindex<br />
2122&#160; pubdate~ - time<br /> 2122&#160; pubdate~ - time<br />
2123&#160; radiogroup~ - command<br /> 2123&#160; radiogroup~ - command<br />
2124&#160; readonly - input, textarea<br /> 2124&#160; readonly - input, textarea<br />
2125&#160; required~ - input, select, textarea<br /> 2125&#160; required~ - input, select, textarea<br />
2126&#160; rel$ - a, area, link<br /> 2126&#160; rel$ - a, area, link<br />
2127&#160; rev - a<br /> 2127&#160; rev - a<br />
2128&#160; reversed~ - ol<br /> 2128&#160; reversed~ - ol<br />
2129&#160; rows - textarea<br /> 2129&#160; rows - textarea<br />
2130&#160; rowspan - td, th<br /> 2130&#160; rowspan - td, th<br />
2131&#160; rules - table<br /> 2131&#160; rules - table<br />
2132&#160; sandbox~ - iframe<br /> 2132&#160; sandbox~ - iframe<br />
2133&#160; scope - td, th<br /> 2133&#160; scope - td, th<br />
2134&#160; scoped~ - style<br /> 2134&#160; scoped~ - style<br />
2135&#160; scrolling - iframe<br /> 2135&#160; scrolling - iframe<br />
2136&#160; seamless~ - iframe<br /> 2136&#160; seamless~ - iframe<br />
2137&#160; selected - option<br /> 2137&#160; selected - option<br />
2138&#160; shape - area, a<br /> 2138&#160; shape - area, a<br />
2139&#160; size - font, hr^, input, select<br /> 2139&#160; size - font, hr^, input, select<br />
2140&#160; sizes~ - link<br /> 2140&#160; sizes~ - link<br />
2141&#160; span - col, colgroup<br /> 2141&#160; span - col, colgroup<br />
2142&#160; src - audio, embed, iframe, img, input, script, source, track, video<br /> 2142&#160; src - audio, embed, iframe, img, input, script, source, track, video<br />
2143&#160; srcdoc~ - iframe<br /> 2143&#160; srcdoc~ - iframe<br />
2144&#160; srclang~ - track<br /> 2144&#160; srclang~ - track<br />
2145&#160; srcset~% - img<br /> 2145&#160; srcset~% - img<br />
2146&#160; standby - object<br /> 2146&#160; standby - object<br />
2147&#160; start - ol<br /> 2147&#160; start - ol<br />
2148&#160; step~ - input<br /> 2148&#160; step~ - input<br />
2149&#160; summary - table<br /> 2149&#160; summary - table<br />
2150&#160; target - a, area, form<br /> 2150&#160; target - a, area, form<br />
2151&#160; type - a, area, button, command, embed, input, li, link, menu, object, ol, param, script, source, style, ul<br /> 2151&#160; type - a, area, button, command, embed, input, li, link, menu, object, ol, param, script, source, style, ul<br />
2152&#160; typemustmatch~ - object<br /> 2152&#160; typemustmatch~ - object<br />
2153&#160; usemap - img, input, object<br /> 2153&#160; usemap - img, input, object<br />
2154&#160; valign - col, colgroup, tbody, td, tfoot, th, thead, tr<br /> 2154&#160; valign - col, colgroup, tbody, td, tfoot, th, thead, tr<br />
2155&#160; value - button, data, input, li, meter, option, param, progress<br /> 2155&#160; value - button, data, input, li, meter, option, param, progress<br />
2156&#160; valuetype - param<br /> 2156&#160; valuetype - param<br />
2157&#160; vspace - applet, embed, img^, object^<br /> 2157&#160; vspace - applet, embed, img^, object^<br />
2158&#160; width - applet, canvas, col, colgroup, embed, hr^, iframe, img, input, object, pre^, table, td^, th^, video<br /> 2158&#160; width - applet, canvas, col, colgroup, embed, hr^, iframe, img, input, object, pre^, table, td^, th^, video<br />
2159&#160; wmode - embed<br /> 2159&#160; wmode - embed<br />
2160&#160; wrap~ - textarea<br /> 2160&#160; wrap~ - textarea<br />
2161<br /> 2161<br />
2162&#160; The following attributes, including event-specific ones and attributes of ARIA and microdata specifications, are considered global and allowed in all elements:<br /> 2162&#160; The following attributes, including event-specific ones and attributes of ARIA and microdata specifications, are considered global and allowed in all elements:<br />
2163<br /> 2163<br />
2164&#160; accesskey, aria-activedescendant, aria-atomic, aria-autocomplete, aria-busy, aria-checked, aria-controls, aria-describedby, aria-disabled, aria-dropeffect, aria-expanded, aria-flowto, aria-grabbed, aria-haspopup, aria-hidden, aria-invalid, aria-label, aria-labelledby, aria-level, aria-live, aria-multiline, aria-multiselectable, aria-orientation, aria-owns, aria-posinset, aria-pressed, aria-readonly, aria-relevant, aria-required, aria-selected, aria-setsize, aria-sort, aria-valuemax, aria-valuemin, aria-valuenow, aria-valuetext, class$, contenteditable, contextmenu, dir, draggable, dropzone, hidden, id, inert, itemid, itemprop, itemref, itemscope, itemtype, lang, onabort, onblur, oncanplay, oncanplaythrough, onchange, onclick, oncontextmenu, oncopy, oncuechange, oncut, ondblclick, ondrag, ondragend, ondragenter, ondragleave, ondragover, ondragstart, ondrop, ondurationchange, onemptied, onended, onerror, onfocus, onformchange, onforminput, oninput, oninvalid, onkeydown, onkeypress, onkeyup, onload, onloadeddata, onloadedmetadata, onloadstart, onlostpointercapture, onmousedown, onmousemove, onmouseout, onmouseover, onmouseup, onmousewheel, onpaste, onpause, onplay, onplaying, onpointercancel, ongotpointercapture, onpointerdown, onpointerenter, onpointerleave, onpointermove, onpointerout, onpointerover, onpointerup, onprogress, onratechange, onreadystatechange, onreset, onsearch, onscroll, onseeked, onseeking, onselect, onshow, onstalled, onsubmit, onsuspend, ontimeupdate, ontoggle, ontouchcancel, ontouchend, ontouchmove, ontouchstart, onvolumechange, onwaiting, onwheel, role, spellcheck, style, tabindex, title, translate, xmlns, xml:base, xml:lang, xml:space<br /> 2164&#160; accesskey, aria-activedescendant, aria-atomic, aria-autocomplete, aria-busy, aria-checked, aria-controls, aria-describedby, aria-disabled, aria-dropeffect, aria-expanded, aria-flowto, aria-grabbed, aria-haspopup, aria-hidden, aria-invalid, aria-label, aria-labelledby, aria-level, 
aria-live, aria-multiline, aria-multiselectable, aria-orientation, aria-owns, aria-posinset, aria-pressed, aria-readonly, aria-relevant, aria-required, aria-selected, aria-setsize, aria-sort, aria-valuemax, aria-valuemin, aria-valuenow, aria-valuetext, class$, contenteditable, contextmenu, dir, draggable, dropzone, hidden, id, inert, itemid, itemprop, itemref, itemscope, itemtype, lang, onabort, onblur, oncanplay, oncanplaythrough, onchange, onclick, oncontextmenu, oncopy, oncuechange, oncut, ondblclick, ondrag, ondragend, ondragenter, ondragleave, ondragover, ondragstart, ondrop, ondurationchange, onemptied, onended, onerror, onfocus, onformchange, onforminput, oninput, oninvalid, onkeydown, onkeypress, onkeyup, onload, onloadeddata, onloadedmetadata, onloadstart, onlostpointercapture, onmousedown, onmousemove, onmouseout, onmouseover, onmouseup, onmousewheel, onpaste, onpause, onplay, onplaying, onpointercancel, ongotpointercapture, onpointerdown, onpointerenter, onpointerleave, onpointermove, onpointerout, onpointerover, onpointerup, onprogress, onratechange, onreadystatechange, onreset, onsearch, onscroll, onseeked, onseeking, onselect, onshow, onstalled, onsubmit, onsuspend, ontimeupdate, ontoggle, ontouchcancel, ontouchend, ontouchmove, ontouchstart, onvolumechange, onwaiting, onwheel, role, spellcheck, style, tabindex, title, translate, xmlns, xml:base, xml:lang, xml:space<br />
2165<br /> 2165<br />
2166&#160; Custom <em>data-*</em>&#160;attributes, where the first three characters of the value of <em>star</em>&#160;(*) after lower-casing do not equal <span class="term">xml</span>&#160;and the value of <em>star</em>&#160;does not have a colon (:), equal-to (=), newline, solidus (/), space, tab, or any A-Z character, are also considered global and allowed in all elements.<br /> 2166&#160; Custom <em>data-*</em>&#160;attributes, where the first three characters of the value of <em>star</em>&#160;(*) after lower-casing do not equal <span class="term">xml</span>&#160;and the value of <em>star</em>&#160;does not have a colon (:), equal-to (=), newline, solidus (/), space, tab, or any A-Z character, are also considered global and allowed in all elements.<br />
2167 2167
2168</div> 2168</div>
2169<div class="sub-section"><h3> 2169<div class="sub-section"><h3>
2170<a name="s5.3" id="s5.3"></a><span class="item-no">5.3</span>&#160; CSS 2.1 properties accepting URLs 2170<a name="s5.3" id="s5.3"></a><span class="item-no">5.3</span>&#160; CSS 2.1 properties accepting URLs
2171</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 2171</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
2172<br /> 2172<br />
2173&#160; background<br /> 2173&#160; background<br />
2174&#160; background-image<br /> 2174&#160; background-image<br />
2175&#160; content<br /> 2175&#160; content<br />
2176&#160; cue-after<br /> 2176&#160; cue-after<br />
2177&#160; cue-before<br /> 2177&#160; cue-before<br />
2178&#160; cursor<br /> 2178&#160; cursor<br />
2179&#160; list-style<br /> 2179&#160; list-style<br />
2180&#160; list-style-image<br /> 2180&#160; list-style-image<br />
2181&#160; play-during<br /> 2181&#160; play-during<br />
2182 2182
2183</div> 2183</div>
2184<div class="sub-section"><h3> 2184<div class="sub-section"><h3>
2185<a name="s5.4" id="s5.4"></a><span class="item-no">5.4</span>&#160; Microsoft Windows 1252 character replacements 2185<a name="s5.4" id="s5.4"></a><span class="item-no">5.4</span>&#160; Microsoft Windows 1252 character replacements
2186</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 2186</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
2187<br /> 2187<br />
2188&#160; Key: <span class="term">d</span>&#160;double, <span class="term">l</span>&#160;left, <span class="term">q</span>&#160;quote, <span class="term">r</span>&#160;right, <span class="term">s.</span>&#160;single<br /> 2188&#160; Key: <span class="term">d</span>&#160;double, <span class="term">l</span>&#160;left, <span class="term">q</span>&#160;quote, <span class="term">r</span>&#160;right, <span class="term">s.</span>&#160;single<br />
2189<br /> 2189<br />
2190&#160; Code-point (decimal) - hexadecimal value - replacement entity - represented character<br /> 2190&#160; Code-point (decimal) - hexadecimal value - replacement entity - represented character<br />
2191<br /> 2191<br />
2192&#160; 127 - 7f - (removed) - (not used)<br /> 2192&#160; 127 - 7f - (removed) - (not used)<br />
2193&#160; 128 - 80 - &amp;#8364; - euro<br /> 2193&#160; 128 - 80 - &amp;#8364; - euro<br />
2194&#160; 129 - 81 - (removed) - (not used)<br /> 2194&#160; 129 - 81 - (removed) - (not used)<br />
2195&#160; 130 - 82 - &amp;#8218; - baseline s. q<br /> 2195&#160; 130 - 82 - &amp;#8218; - baseline s. q<br />
2196&#160; 131 - 83 - &amp;#402; - florin<br /> 2196&#160; 131 - 83 - &amp;#402; - florin<br />
2197&#160; 132 - 84 - &amp;#8222; - baseline d q<br /> 2197&#160; 132 - 84 - &amp;#8222; - baseline d q<br />
2198&#160; 133 - 85 - &amp;#8230; - ellipsis<br /> 2198&#160; 133 - 85 - &amp;#8230; - ellipsis<br />
2199&#160; 134 - 86 - &amp;#8224; - dagger<br /> 2199&#160; 134 - 86 - &amp;#8224; - dagger<br />
2200&#160; 135 - 87 - &amp;#8225; - d dagger<br /> 2200&#160; 135 - 87 - &amp;#8225; - d dagger<br />
2201&#160; 136 - 88 - &amp;#710; - circumflex accent<br /> 2201&#160; 136 - 88 - &amp;#710; - circumflex accent<br />
2202&#160; 137 - 89 - &amp;#8240; - per mille<br /> 2202&#160; 137 - 89 - &amp;#8240; - per mille<br />
2203&#160; 138 - 8a - &amp;#352; - S Hacek<br /> 2203&#160; 138 - 8a - &amp;#352; - S Hacek<br />
2204&#160; 139 - 8b - &amp;#8249; - l s. guillemet<br /> 2204&#160; 139 - 8b - &amp;#8249; - l s. guillemet<br />
2205&#160; 140 - 8c - &amp;#338; - OE ligature<br /> 2205&#160; 140 - 8c - &amp;#338; - OE ligature<br />
2206&#160; 141 - 8d - (removed) - (not used)<br /> 2206&#160; 141 - 8d - (removed) - (not used)<br />
2207&#160; 142 - 8e - &amp;#381; - Z Hacek<br /> 2207&#160; 142 - 8e - &amp;#381; - Z Hacek<br />
2208&#160; 143 - 8f - (removed) - (not used)<br /> 2208&#160; 143 - 8f - (removed) - (not used)<br />
2209&#160; 144 - 90 - (removed) - (not used)<br /> 2209&#160; 144 - 90 - (removed) - (not used)<br />
2210&#160; 145 - 91 - &amp;#8216; - l s. q<br /> 2210&#160; 145 - 91 - &amp;#8216; - l s. q<br />
2211&#160; 146 - 92 - &amp;#8217; - r s. q<br /> 2211&#160; 146 - 92 - &amp;#8217; - r s. q<br />
2212&#160; 147 - 93 - &amp;#8220; - l d q<br /> 2212&#160; 147 - 93 - &amp;#8220; - l d q<br />
2213&#160; 148 - 94 - &amp;#8221; - r d q<br /> 2213&#160; 148 - 94 - &amp;#8221; - r d q<br />
2214&#160; 149 - 95 - &amp;#8226; - bullet<br /> 2214&#160; 149 - 95 - &amp;#8226; - bullet<br />
2215&#160; 150 - 96 - &amp;#8211; - en dash<br /> 2215&#160; 150 - 96 - &amp;#8211; - en dash<br />
2216&#160; 151 - 97 - &amp;#8212; - em dash<br /> 2216&#160; 151 - 97 - &amp;#8212; - em dash<br />
2217&#160; 152 - 98 - &amp;#732; - tilde accent<br /> 2217&#160; 152 - 98 - &amp;#732; - tilde accent<br />
2218&#160; 153 - 99 - &amp;#8482; - trademark<br /> 2218&#160; 153 - 99 - &amp;#8482; - trademark<br />
2219&#160; 154 - 9a - &amp;#353; - s Hacek<br /> 2219&#160; 154 - 9a - &amp;#353; - s Hacek<br />
2220&#160; 155 - 9b - &amp;#8250; - r s. guillemet<br /> 2220&#160; 155 - 9b - &amp;#8250; - r s. guillemet<br />
2221&#160; 156 - 9c - &amp;#339; - oe ligature<br /> 2221&#160; 156 - 9c - &amp;#339; - oe ligature<br />
2222&#160; 157 - 9d - (removed) - (not used)<br /> 2222&#160; 157 - 9d - (removed) - (not used)<br />
2223&#160; 158 - 9e - &amp;#382; - z Hacek<br /> 2223&#160; 158 - 9e - &amp;#382; - z Hacek<br />
2224&#160; 159 - 9f - &amp;#376; - Y dieresis<br /> 2224&#160; 159 - 9f - &amp;#376; - Y dieresis<br />
2225 2225
2226</div> 2226</div>
2227<div class="sub-section"><h3> 2227<div class="sub-section"><h3>
2228<a name="s5.5" id="s5.5"></a><span class="item-no">5.5</span>&#160; URL format 2228<a name="s5.5" id="s5.5"></a><span class="item-no">5.5</span>&#160; URL format
2229</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 2229</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
2230<br /> 2230<br />
2231&#160; An <em>absolute</em>&#160;URL has a <span class="term">protocol</span>&#160;or <span class="term">scheme</span>, a <span class="term">network location</span>&#160;or <span class="term">hostname</span>, and optional <span class="term">path</span>, <span class="term">parameters</span>, <span class="term">query</span>&#160;and <span class="term">fragment</span>&#160;segments. Thus, an absolute URL has this generic structure:<br /> 2231&#160; An <em>absolute</em>&#160;URL has a <span class="term">protocol</span>&#160;or <span class="term">scheme</span>, a <span class="term">network location</span>&#160;or <span class="term">hostname</span>, and optional <span class="term">path</span>, <span class="term">parameters</span>, <span class="term">query</span>&#160;and <span class="term">fragment</span>&#160;segments. Thus, an absolute URL has this generic structure:<br />
2232<br /> 2232<br />
2233 2233
2234<code class="code">&#160; &#160; (scheme) &#58; (//network location) /(path) ;(parameters) ?(query) #(fragment)</code> 2234<code class="code">&#160; &#160; (scheme) &#58; (//network location) /(path) ;(parameters) ?(query) #(fragment)</code>
2235<br /> 2235<br />
2236<br /> 2236<br />
2237&#160; The schemes can only contain letters, digits, <span class="term">+</span>, <span class="term">.</span>&#160;and <span class="term">-</span>. Hostname is the portion after the <span class="term">//</span>&#160;and up to the first <span class="term">/</span>&#160;(if any; else, up to the end) when <span class="term">&#58;</span>&#160;is followed by a <span class="term">//</span>&#160;(e.g., <span class="term">abc.com</span>&#160;in <span class="term">ftp&#58;//abc.com/def</span>); otherwise, it consists of everything after the <span class="term">&#58;</span>&#160;(e.g., <span class="term">def@abc.com</span>&#160;in <span class="term">mailto&#58;def@abc.com</span>).<br /> 2237&#160; The schemes can only contain letters, digits, <span class="term">+</span>, <span class="term">.</span>&#160;and <span class="term">-</span>. Hostname is the portion after the <span class="term">//</span>&#160;and up to the first <span class="term">/</span>&#160;(if any; else, up to the end) when <span class="term">&#58;</span>&#160;is followed by a <span class="term">//</span>&#160;(e.g., <span class="term">abc.com</span>&#160;in <span class="term">ftp&#58;//abc.com/def</span>); otherwise, it consists of everything after the <span class="term">&#58;</span>&#160;(e.g., <span class="term">def@abc.com</span>&#160;in <span class="term">mailto&#58;def@abc.com</span>).<br />
2238<br /> 2238<br />
2239&#160; <em>Relative</em>&#160;URLs do not have explicit schemes and network locations; such values are inherited from a <em>base</em>&#160;URL.<br /> 2239&#160; <em>Relative</em>&#160;URLs do not have explicit schemes and network locations; such values are inherited from a <em>base</em>&#160;URL.<br />
2240 2240
2241</div> 2241</div>
2242<div class="sub-section"><h3> 2242<div class="sub-section"><h3>
2243<a name="s5.6" id="s5.6"></a><span class="item-no">5.6</span>&#160; Brief on htmLawed code 2243<a name="s5.6" id="s5.6"></a><span class="item-no">5.6</span>&#160; Brief on htmLawed code
2244</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" /> 2244</h3><span class="totop"><a href="#peak">(to top)</a></span><br style="clear: both;" />
2245<br /> 2245<br />
2246&#160; Much of the code's logic and reasoning can be understood from the documentation above.<br /> 2246&#160; Much of the code's logic and reasoning can be understood from the documentation above.<br />
2247<br /> 2247<br />
2248&#160; The <strong>output</strong>&#160;of htmLawed is a text string containing the processed input. There is no custom error tracking.<br /> 2248&#160; The <strong>output</strong>&#160;of htmLawed is a text string containing the processed input. There is no custom error tracking.<br />
2249<br /> 2249<br />
2250&#160; <strong>Function arguments</strong>&#160;for htmLawed are:<br /> 2250&#160; <strong>Function arguments</strong>&#160;for htmLawed are:<br />
2251<br /> 2251<br />
2252&#160; * &#160;<span class="term">$in</span>&#160;- first argument; a text string; the <strong>input text</strong>&#160;to be processed. Any extraneous slashes added by PHP when <em>magic quotes</em>&#160;are enabled should be removed beforehand using PHP's <span class="term">stripslashes()</span>&#160;function.<br /> 2252&#160; * &#160;<span class="term">$in</span>&#160;- first argument; a text string; the <strong>input text</strong>&#160;to be processed. Any extraneous slashes added by PHP when <em>magic quotes</em>&#160;are enabled should be removed beforehand using PHP's <span class="term">stripslashes()</span>&#160;function.<br />
2253<br /> 2253<br />
2254&#160; * &#160;<span class="term">$config</span>&#160;- second argument; an associative array; optional; named <span class="term">$C</span>&#160;within htmLawed code. The array has keys with names like <span class="term">balance</span>&#160;and <span class="term">keep_bad</span>, and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the <strong>configurable parameters</strong>&#160;(indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through <span class="term">$config</span>. <em>Finalized</em>&#160;<span class="term">$config</span>&#160;is thus a filtered and possibly larger array.<br /> 2254&#160; * &#160;<span class="term">$config</span>&#160;- second argument; an associative array; optional; named <span class="term">$C</span>&#160;within htmLawed code. The array has keys with names like <span class="term">balance</span>&#160;and <span class="term">keep_bad</span>, and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the <strong>configurable parameters</strong>&#160;(indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through <span class="term">$config</span>. <em>Finalized</em>&#160;<span class="term">$config</span>&#160;is thus a filtered and possibly larger array.<br />
2255<br /> 2255<br />
2256&#160; * &#160;<span class="term">$spec</span>&#160;- third argument; a text string; optional. The string has rules, written in an htmLawed-designated format, <strong>specifying</strong>&#160;element-specific attribute and attribute value restrictions. Function <span class="term">hl_spec()</span>&#160;is used to convert the string to an associative-array, named <span class="term">$S</span>&#160;within htmLawed code, for internal use. <em>Finalized</em>&#160;<span class="term">$spec</span>&#160;is thus an array.<br /> 2256&#160; * &#160;<span class="term">$spec</span>&#160;- third argument; a text string; optional. The string has rules, written in an htmLawed-designated format, <strong>specifying</strong>&#160;element-specific attribute and attribute value restrictions. Function <span class="term">hl_spec()</span>&#160;is used to convert the string to an associative-array, named <span class="term">$S</span>&#160;within htmLawed code, for internal use. <em>Finalized</em>&#160;<span class="term">$spec</span>&#160;is thus an array.<br />
2257<br /> 2257<br />
2258&#160; <em>Finalized</em>&#160;<span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;are made <strong>global variables</strong>&#160;while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the <em>finalized</em>&#160;values, the <span class="term">show_settings</span>&#160;parameter of <span class="term">$config</span>&#160;should be used). Depending on <span class="term">$config</span>, another global variable <span class="term">hl_Ids</span>, to track <span class="term">id</span>&#160;attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing.<br /> 2258&#160; <em>Finalized</em>&#160;<span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;are made <strong>global variables</strong>&#160;while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the <em>finalized</em>&#160;values, the <span class="term">show_settings</span>&#160;parameter of <span class="term">$config</span>&#160;should be used). Depending on <span class="term">$config</span>, another global variable <span class="term">hl_Ids</span>, to track <span class="term">id</span>&#160;attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing.<br />
2259<br /> 2259<br />
2260&#160; Except for the main <span class="term">htmLawed()</span>&#160;function, htmLawed's functions are <strong>name-spaced</strong>&#160;using the <span class="term">hl_</span>&#160;prefix. The <strong>functions</strong>&#160;and their roles are:<br /> 2260&#160; Except for the main <span class="term">htmLawed()</span>&#160;function, htmLawed's functions are <strong>name-spaced</strong>&#160;using the <span class="term">hl_</span>&#160;prefix. The <strong>functions</strong>&#160;and their roles are:<br />
2261<br /> 2261<br />
2262&#160; * &#160;<span class="term">hl_attrval</span>&#160;- check attribute values against <span class="term">$spec</span><br /> 2262&#160; * &#160;<span class="term">hl_attrval</span>&#160;- check attribute values against <span class="term">$spec</span><br />
2263&#160; * &#160;<span class="term">hl_bal</span>&#160;- balance tags and ensure proper nesting<br /> 2263&#160; * &#160;<span class="term">hl_bal</span>&#160;- balance tags and ensure proper nesting<br />
2264&#160; * &#160;<span class="term">hl_cmtcd</span>&#160;- handle CDATA sections and HTML comments<br /> 2264&#160; * &#160;<span class="term">hl_cmtcd</span>&#160;- handle CDATA sections and HTML comments<br />
2265&#160; * &#160;<span class="term">hl_ent</span>&#160;- handle character entities<br /> 2265&#160; * &#160;<span class="term">hl_ent</span>&#160;- handle character entities<br />
2266&#160; * &#160;<span class="term">hl_prot</span>&#160;- check a URL scheme/protocol<br /> 2266&#160; * &#160;<span class="term">hl_prot</span>&#160;- check a URL scheme/protocol<br />
2267&#160; * &#160;<span class="term">hl_regex</span>&#160;- check syntax of a regular expression<br /> 2267&#160; * &#160;<span class="term">hl_regex</span>&#160;- check syntax of a regular expression<br />
2268&#160; * &#160;<span class="term">hl_spec</span>&#160;- convert user-supplied <span class="term">$spec</span>&#160;value to one used internally<br /> 2268&#160; * &#160;<span class="term">hl_spec</span>&#160;- convert user-supplied <span class="term">$spec</span>&#160;value to one used internally<br />
2269&#160; * &#160;<span class="term">hl_tag</span>&#160;- handle element tags and attributes<br /> 2269&#160; * &#160;<span class="term">hl_tag</span>&#160;- handle element tags and attributes<br />
2270&#160; * &#160;<span class="term">hl_tag2</span>&#160;- transform element tags<br /> 2270&#160; * &#160;<span class="term">hl_tag2</span>&#160;- transform element tags<br />
2271&#160; * &#160;<span class="term">hl_tidy</span>&#160;- compact/beautify HTML<br /> 2271&#160; * &#160;<span class="term">hl_tidy</span>&#160;- compact/beautify HTML<br />
2272&#160; * &#160;<span class="term">hl_version</span>&#160;- report htmLawed version<br /> 2272&#160; * &#160;<span class="term">hl_version</span>&#160;- report htmLawed version<br />
2273&#160; * &#160;<span class="term">htmLawed</span>&#160;- main function<br /> 2273&#160; * &#160;<span class="term">htmLawed</span>&#160;- main function<br />
2274<br /> 2274<br />
2275&#160; <span class="term">htmLawed()</span>&#160;finalizes <span class="term">$spec</span>&#160;(with the help of <span class="term">hl_spec()</span>) and <span class="term">$config</span>, and globalizes them. Finalization of <span class="term">$config</span>&#160;involves setting default values if an inappropriate or invalid one is supplied. This includes calling <span class="term">hl_regex()</span>&#160;to check well-formedness of regular expression patterns if such expressions are user-supplied through <span class="term">$config</span>. <span class="term">htmLawed()</span>&#160;then removes invalid characters like nulls and <span class="term">x01</span>&#160;and appropriately handles entities using <span class="term">hl_ent()</span>. HTML comments and CDATA sections are identified and treated as per <span class="term">$config</span>&#160;with the help of <span class="term">hl_cmtcd()</span>. When retained, the <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;characters identifying them, and the <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters inside them, are replaced with control characters (code-points <span class="term">1</span>&#160;to <span class="term">5</span>) till any tag balancing is completed.<br /> 2275&#160; <span class="term">htmLawed()</span>&#160;finalizes <span class="term">$spec</span>&#160;(with the help of <span class="term">hl_spec()</span>) and <span class="term">$config</span>, and globalizes them. Finalization of <span class="term">$config</span>&#160;involves setting default values if an inappropriate or invalid one is supplied. This includes calling <span class="term">hl_regex()</span>&#160;to check well-formedness of regular expression patterns if such expressions are user-supplied through <span class="term">$config</span>. 
<span class="term">htmLawed()</span>&#160;then removes invalid characters like nulls and <span class="term">x01</span>&#160;and appropriately handles entities using <span class="term">hl_ent()</span>. HTML comments and CDATA sections are identified and treated as per <span class="term">$config</span>&#160;with the help of <span class="term">hl_cmtcd()</span>. When retained, the <span class="term">&lt;</span>&#160;and <span class="term">&gt;</span>&#160;characters identifying them, and the <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters inside them, are replaced with control characters (code-points <span class="term">1</span>&#160;to <span class="term">5</span>) till any tag balancing is completed.<br />
2276<br /> 2276<br />
2277&#160; After this <em>initial processing</em>&#160;<span class="term">htmLawed()</span>&#160;identifies tags using regex and processes them with the help of <span class="term">hl_tag()</span>&#160;-- &#160;a large function that analyzes tag content, filtering it as per HTML standards, <span class="term">$config</span>&#160;and <span class="term">$spec</span>. Among other things, <span class="term">hl_tag()</span>&#160;transforms deprecated elements using <span class="term">hl_tag2()</span>, removes attributes from closing tags, checks attribute values as per <span class="term">$spec</span>&#160;rules using <span class="term">hl_attrval()</span>, and checks URL protocols using <span class="term">hl_prot()</span>. <span class="term">htmLawed()</span>&#160;performs tag balancing and nesting checks with a call to <span class="term">hl_bal()</span>, and optionally compacts/beautifies the output with proper white-spacing with a call to <span class="term">hl_tidy()</span>. The latter temporarily replaces white-space, and <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters inside <span class="term">pre</span>, <span class="term">script</span>&#160;and <span class="term">textarea</span>&#160;elements, and HTML comments and CDATA sections with control characters (code-points <span class="term">1</span>&#160;to <span class="term">5</span>, and <span class="term">7</span>).<br /> 2277&#160; After this <em>initial processing</em>&#160;<span class="term">htmLawed()</span>&#160;identifies tags using regex and processes them with the help of <span class="term">hl_tag()</span>&#160;-- &#160;a large function that analyzes tag content, filtering it as per HTML standards, <span class="term">$config</span>&#160;and <span class="term">$spec</span>. 
Among other things, <span class="term">hl_tag()</span>&#160;transforms deprecated elements using <span class="term">hl_tag2()</span>, removes attributes from closing tags, checks attribute values as per <span class="term">$spec</span>&#160;rules using <span class="term">hl_attrval()</span>, and checks URL protocols using <span class="term">hl_prot()</span>. <span class="term">htmLawed()</span>&#160;performs tag balancing and nesting checks with a call to <span class="term">hl_bal()</span>, and optionally compacts/beautifies the output with proper white-spacing with a call to <span class="term">hl_tidy()</span>. The latter temporarily replaces white-space, and <span class="term">&lt;</span>, <span class="term">&gt;</span>&#160;and <span class="term">&amp;</span>&#160;characters inside <span class="term">pre</span>, <span class="term">script</span>&#160;and <span class="term">textarea</span>&#160;elements, and HTML comments and CDATA sections with control characters (code-points <span class="term">1</span>&#160;to <span class="term">5</span>, and <span class="term">7</span>).<br />
2278<br /> 2278<br />
2279&#160; htmLawed permits the use of custom code or <strong>hook functions</strong>&#160;at two stages. The first, called inside <span class="term">htmLawed()</span>, allows the input text as well as the finalized <span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;values to be altered right after the initial processing (see <a href="#s3.7">section 3.7</a>). The second is called by <span class="term">hl_tag()</span>&#160;once the tag content is finalized (see <a href="#s3.4.9">section 3.4.9</a>).<br /> 2279&#160; htmLawed permits the use of custom code or <strong>hook functions</strong>&#160;at two stages. The first, called inside <span class="term">htmLawed()</span>, allows the input text as well as the finalized <span class="term">$config</span>&#160;and <span class="term">$spec</span>&#160;values to be altered right after the initial processing (see <a href="#s3.7">section 3.7</a>). The second is called by <span class="term">hl_tag()</span>&#160;once the tag content is finalized (see <a href="#s3.4.9">section 3.4.9</a>).<br />
2280<br /> 2280<br />
2281&#160; The functionality of htmLawed is dictated by the external HTML standards. The code of htmLawed is thus written for a clear-cut aim, with not much concern for tweaking by other developers. The code is only minimally annotated with comments -- it is not meant to instruct. PHP developers familiar with the HTML specifications will see the logic, and others can always refer to the htmLawed documentation. 2281&#160; The functionality of htmLawed is dictated by the external HTML standards. The code of htmLawed is thus written for a clear-cut aim, with not much concern for tweaking by other developers. The code is only minimally annotated with comments -- it is not meant to instruct. PHP developers familiar with the HTML specifications will see the logic, and others can always refer to the htmLawed documentation.
2282</div> 2282</div>
2283</div> 2283</div>
2284<br /> 2284<br />
2285<hr /><br /><br /><span class="subtle"><small>HTM version of <em><a href="htmLawed_README.txt">htmLawed_README.txt</a></em> generated on 25 Sep, 2019 using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span> 2285<hr /><br /><br /><span class="subtle"><small>HTM version of <em><a href="htmLawed_README.txt">htmLawed_README.txt</a></em> generated on 25 Sep, 2019 using <a href="http://www.bioinformatics.org/phplabware/internal_utilities">rTxt2htm</a> from PHP Labware</small></span>
2286</div><!-- ended div body --> 2286</div><!-- ended div body -->
2287</div><!-- ended div top --> 2287</div><!-- ended div top -->
2288</body> 2288</body>
2289</html> \ No newline at end of file 2289</html> \ No newline at end of file
diff --git a/lib/htmlawed/htmLawed_README.txt b/lib/htmlawed/htmLawed_README.txt
index 7950500..824fe90 100755
--- a/lib/htmlawed/htmLawed_README.txt
+++ b/lib/htmlawed/htmLawed_README.txt
@@ -1,1817 +1,1817 @@
1/* 1/*
2htmLawed_README.txt, 24 September 2019 2htmLawed_README.txt, 24 September 2019
3htmLawed 1.2.5, 24 September 2019 3htmLawed 1.2.5, 24 September 2019
4Copyright Santosh Patnaik 4Copyright Santosh Patnaik
5Dual licensed with LGPL 3 and GPL 2+ 5Dual licensed with LGPL 3 and GPL 2+
6A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed 6A PHP Labware internal utility - http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed
7*/ 7*/
8 8
9 9
10== Content ========================================================= 10== Content =========================================================
11 11
12 12
131 About htmLawed 131 About htmLawed
14 1.1 Example uses 14 1.1 Example uses
15 1.2 Features 15 1.2 Features
16 1.3 History 16 1.3 History
17 1.4 License & copyright 17 1.4 License & copyright
18 1.5 Terms used here 18 1.5 Terms used here
19 1.6 Availability 19 1.6 Availability
202 Usage 202 Usage
21 2.1 Simple 21 2.1 Simple
22 2.2 Configuring htmLawed using the '$config' argument 22 2.2 Configuring htmLawed using the '$config' argument
23 2.3 Extra HTML specifications using the '$spec' argument 23 2.3 Extra HTML specifications using the '$spec' argument
24 2.4 Performance time & memory usage 24 2.4 Performance time & memory usage
25 2.5 Some security risks to keep in mind 25 2.5 Some security risks to keep in mind
26 2.6 Use with 'kses()' code 26 2.6 Use with 'kses()' code
27 2.7 Tolerance for ill-written HTML 27 2.7 Tolerance for ill-written HTML
28 2.8 Limitations & work-arounds 28 2.8 Limitations & work-arounds
29 2.9 Examples of usage 29 2.9 Examples of usage
303 Details 303 Details
31 3.1 Invalid/dangerous characters 31 3.1 Invalid/dangerous characters
32 3.2 Character references/entities 32 3.2 Character references/entities
33 3.3 HTML elements 33 3.3 HTML elements
34 3.3.1 HTML comments & 'CDATA' sections 34 3.3.1 HTML comments & 'CDATA' sections
35 3.3.2 Tag-transformation for better compliance with standards 35 3.3.2 Tag-transformation for better compliance with standards
36 3.3.3 Tag balancing & proper nesting 36 3.3.3 Tag balancing & proper nesting
37 3.3.4 Elements requiring child elements 37 3.3.4 Elements requiring child elements
38 3.3.5 Beautify or compact HTML 38 3.3.5 Beautify or compact HTML
39 3.4 Attributes 39 3.4 Attributes
40 3.4.1 Auto-addition of XHTML-required attributes 40 3.4.1 Auto-addition of XHTML-required attributes
41 3.4.2 Duplicate/invalid 'id' values 41 3.4.2 Duplicate/invalid 'id' values
42 3.4.3 URL schemes & scripts in attribute values 42 3.4.3 URL schemes & scripts in attribute values
43 3.4.4 Absolute & relative URLs 43 3.4.4 Absolute & relative URLs
44 3.4.5 Lower-cased, standard attribute values 44 3.4.5 Lower-cased, standard attribute values
45 3.4.6 Transformation of deprecated attributes 45 3.4.6 Transformation of deprecated attributes
46 3.4.7 Anti-spam & 'href' 46 3.4.7 Anti-spam & 'href'
47 3.4.8 Inline style properties 47 3.4.8 Inline style properties
48 3.4.9 Hook function for tag content 48 3.4.9 Hook function for tag content
49 3.5 Simple configuration directive for most valid XHTML 49 3.5 Simple configuration directive for most valid XHTML
50 3.6 Simple configuration directive for most `safe` HTML 50 3.6 Simple configuration directive for most `safe` HTML
51 3.7 Using a hook function 51 3.7 Using a hook function
52 3.8 Obtaining `finalized` parameter values 52 3.8 Obtaining `finalized` parameter values
53 3.9 Retaining non-HTML tags in input with mixed markup 53 3.9 Retaining non-HTML tags in input with mixed markup
54 4 Other 54 4 Other
55 4.1 Support 55 4.1 Support
56 4.2 Known issues 56 4.2 Known issues
57 4.3 Change-log 57 4.3 Change-log
58 4.4 Testing 58 4.4 Testing
59 4.5 Upgrade, & old versions 59 4.5 Upgrade, & old versions
60 4.6 Comparison with 'HTMLPurifier' 60 4.6 Comparison with 'HTMLPurifier'
61 4.7 Use through application plug-ins/modules 61 4.7 Use through application plug-ins/modules
62 4.8 Use in non-PHP applications 62 4.8 Use in non-PHP applications
63 4.9 Donate 63 4.9 Donate
64 4.10 Acknowledgements 64 4.10 Acknowledgements
65 5 Appendices 65 5 Appendices
66 5.1 Characters discouraged in HTML 66 5.1 Characters discouraged in HTML
67 5.2 Valid attribute-element combinations 67 5.2 Valid attribute-element combinations
68 5.3 CSS 2.1 properties accepting URLs 68 5.3 CSS 2.1 properties accepting URLs
69 5.4 Microsoft Windows 1252 character replacements 69 5.4 Microsoft Windows 1252 character replacements
70 5.5 URL format 70 5.5 URL format
71 5.6 Brief on htmLawed code 71 5.6 Brief on htmLawed code
72 72
73 73
74== 1 About htmLawed ================================================ 74== 1 About htmLawed ================================================
75 75
76 76
77 htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and with administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that introduces a security vulnerability or is used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such `lawing in` of HTML code ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators. 77 htmLawed is a PHP script to process text with HTML markup to make it more compliant with HTML standards and with administrative policies. It works by making HTML well-formed with balanced and properly nested tags, neutralizing code that introduces a security vulnerability or is used for cross-site scripting (XSS) attacks, allowing only specified HTML tags and attributes, and so on. Such `lawing in` of HTML code ensures that it is in accordance with the aesthetics, safety and usability requirements set by administrators.
78 78
79 htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file. It does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML Tidy:- http://tidy.sourceforge.net application. 79 htmLawed is highly customizable, and fast with low memory usage. Its free and open-source code is in one small file. It does not require extensions or libraries, and works in older versions of PHP as well. It is a good alternative to the HTML Tidy:- http://tidy.sourceforge.net application.
80 80
81 81
82-- 1.1 Example uses ------------------------------------------------ 82-- 1.1 Example uses ------------------------------------------------
83 83
84 84
85 * Filtering of text submitted as comments on blogs to allow only certain HTML elements 85 * Filtering of text submitted as comments on blogs to allow only certain HTML elements
86 86
87 * Making RSS newsfeed items standard-compliant: often one uses an excerpt from an HTML document for the content, and with unbalanced tags, non-numerical entities, etc., such excerpts may not be XML-compliant 87 * Making RSS newsfeed items standard-compliant: often one uses an excerpt from an HTML document for the content, and with unbalanced tags, non-numerical entities, etc., such excerpts may not be XML-compliant
88 88
89 * Beautifying or pretty-printing HTML code 89 * Beautifying or pretty-printing HTML code
90 90
91 * Text processing for stricter XML standard-compliance: e.g., a lowercased 'x' in hexadecimal numeric entities becomes necessary if an HTML document with MathML content needs to be served as 'application/xml' 91 * Text processing for stricter XML standard-compliance: e.g., a lowercased 'x' in hexadecimal numeric entities becomes necessary if an HTML document with MathML content needs to be served as 'application/xml'
92 92
93 * Scraping text from web-pages 93 * Scraping text from web-pages
94 94
95 * Transforming an HTML element to another 95 * Transforming an HTML element to another
96 96
97 97
98-- 1.2 Features ---------------------------------------------------o 98-- 1.2 Features ---------------------------------------------------o
99 99
100 100
101 Key: '*' security feature, '^' standard compliance, '~' requires setting right options 101 Key: '*' security feature, '^' standard compliance, '~' requires setting right options
102 102
103 htmLawed: 103 htmLawed:
104 104
105 * makes input more *secure* and *standard-compliant* for HTML as well as generic *XML* documents ^ 105 * makes input more *secure* and *standard-compliant* for HTML as well as generic *XML* documents ^
106 * supports markup for *HTML 5* and *microdata, ARIA, Ruby, custom attributes*, etc. ^ 106 * supports markup for *HTML 5* and *microdata, ARIA, Ruby, custom attributes*, etc. ^
107 * can *beautify* or *compact* HTML ~ 107 * can *beautify* or *compact* HTML ~
108 * works with input of almost any *character encoding* and does not affect it 108 * works with input of almost any *character encoding* and does not affect it
109 * has good *tolerance for ill-written HTML* 109 * has good *tolerance for ill-written HTML*
110 110
111 * can enforce *restricted use of elements* *~ 111 * can enforce *restricted use of elements* *~
112 * ensures proper closure of empty elements like 'img' ^ 112 * ensures proper closure of empty elements like 'img' ^
113 * *transforms deprecated elements* like 'font' ^~ 113 * *transforms deprecated elements* like 'font' ^~
114 * can permit HTML *comments* and *CDATA* sections ^~ 114 * can permit HTML *comments* and *CDATA* sections ^~
115 * can permit all elements, including 'script', 'object' and 'form' ~ 115 * can permit all elements, including 'script', 'object' and 'form' ~
116 116
117 * can *restrict attributes by element* ^~ 117 * can *restrict attributes by element* ^~
118 * removes *invalid attributes* ^ 118 * removes *invalid attributes* ^
119 * lower-cases element and attribute names ^ 119 * lower-cases element and attribute names ^
120 * provides *required attributes*, like 'alt' for 'img' ^ 120 * provides *required attributes*, like 'alt' for 'img' ^
121 * *transforms deprecated attributes* ^~ 121 * *transforms deprecated attributes* ^~
122 * ensures attributes are *declared only once* ^ 122 * ensures attributes are *declared only once* ^
123 * permits *custom*, non-standard attributes as well as custom rules for standard attributes ~ 123 * permits *custom*, non-standard attributes as well as custom rules for standard attributes ~
124 124
125 * declares value for `empty` (`minimized` or `boolean`) attributes like 'checked' ^ 125 * declares value for `empty` (`minimized` or `boolean`) attributes like 'checked' ^
126 * checks for potentially dangerous attribute values *~ 126 * checks for potentially dangerous attribute values *~
127 * ensures *unique* 'id' attribute values ^~ 127 * ensures *unique* 'id' attribute values ^~
128 * *double-quotes* attribute values ^ 128 * *double-quotes* attribute values ^
129 * lower-cases *standard attribute values* like 'password' ^ 129 * lower-cases *standard attribute values* like 'password' ^
130 130
131 * can restrict *URL protocol/scheme by attribute* *~ 131 * can restrict *URL protocol/scheme by attribute* *~
132 * can disable *dynamic expressions* in 'style' values *~ 132 * can disable *dynamic expressions* in 'style' values *~
133 133
134 * neutralizes invalid named *character entities* ^ 134 * neutralizes invalid named *character entities* ^
135 * converts hexadecimal numeric entities to decimal ones, or vice versa ^~ 135 * converts hexadecimal numeric entities to decimal ones, or vice versa ^~
136 * converts named entities to numeric ones for generic XML use ^~ 136 * converts named entities to numeric ones for generic XML use ^~
137 137
138 * removes *null* characters * 138 * removes *null* characters *
139 * neutralizes potentially dangerous proprietary Netscape *Javascript entities* * 139 * neutralizes potentially dangerous proprietary Netscape *Javascript entities* *
140 * replaces potentially dangerous *soft-hyphen* character in URL-accepting attribute values with spaces * 140 * replaces potentially dangerous *soft-hyphen* character in URL-accepting attribute values with spaces *
141 141
142 * removes common *invalid characters* not allowed in HTML or XML ^ 142 * removes common *invalid characters* not allowed in HTML or XML ^
143 * replaces *characters from Microsoft applications* like 'Word' that are discouraged in HTML or XML ^~ 143 * replaces *characters from Microsoft applications* like 'Word' that are discouraged in HTML or XML ^~
144 * neutralizes entities for characters invalid or discouraged in HTML or XML ^ 144 * neutralizes entities for characters invalid or discouraged in HTML or XML ^
145 * appropriately neutralizes '<', '&', '"', and '>' characters ^* 145 * appropriately neutralizes '<', '&', '"', and '>' characters ^*
146 146
147 * understands improperly spaced tag content (e.g., spread over more than a line) and properly spaces them 147 * understands improperly spaced tag content (e.g., spread over more than a line) and properly spaces them
148 * attempts to *balance tags* for well-formedness ^~ 148 * attempts to *balance tags* for well-formedness ^~
149 * understands when *omittable closing tags* like '</p>' are missing ^~ 149 * understands when *omittable closing tags* like '</p>' are missing ^~
150 * attempts to permit only *validly nested tags* ^~ 150 * attempts to permit only *validly nested tags* ^~
151 * can *either remove or neutralize bad content* ^~ 151 * can *either remove or neutralize bad content* ^~
152 * attempts to *rectify common errors of plain-text misplacement* (e.g., directly inside 'blockquote') ^~ 152 * attempts to *rectify common errors of plain-text misplacement* (e.g., directly inside 'blockquote') ^~
153 153
154 * has optional *anti-spam* measures such as addition of 'rel="nofollow"' and link-disabling ~ 154 * has optional *anti-spam* measures such as addition of 'rel="nofollow"' and link-disabling ~
155 * optionally makes *relative URLs absolute*, and vice versa ~ 155 * optionally makes *relative URLs absolute*, and vice versa ~
156 156
157 * optionally marks '&' to identify the entities for '&', '<' and '>' introduced by it ~ 157 * optionally marks '&' to identify the entities for '&', '<' and '>' introduced by it ~
158 158
159 * allows deployment of powerful *hook functions* to *inject* HTML, *consolidate* 'style' attributes to 'class', finely check attribute values, etc. ~ 159 * allows deployment of powerful *hook functions* to *inject* HTML, *consolidate* 'style' attributes to 'class', finely check attribute values, etc. ~
160 160
161 161
162-- 1.3 History ----------------------------------------------------o 162-- 1.3 History ----------------------------------------------------o
163 163
164 164
165 htmLawed was created in 2007 for use with 'LabWiki', a wiki software developed at PHP Labware, as no suitable software could be found. Existing PHP software like 'Kses' and 'HTMLPurifier' were deemed inadequate, slow, resource-intensive, or dependent on an extension or external application like 'HTML Tidy'. The core logic of htmLawed, that of identifying HTML elements and attributes, was based on the 'Kses' (version 0.2.2) HTML filter software of Ulf Harnhammar (it can still be used with code that uses 'Kses'; see section:- #2.6). Support for HTML version 5 was added in May 2013 in a beta and in February 2017 in a production release. 165 htmLawed was created in 2007 for use with 'LabWiki', a wiki software developed at PHP Labware, as no suitable software could be found. Existing PHP software like 'Kses' and 'HTMLPurifier' were deemed inadequate, slow, resource-intensive, or dependent on an extension or external application like 'HTML Tidy'. The core logic of htmLawed, that of identifying HTML elements and attributes, was based on the 'Kses' (version 0.2.2) HTML filter software of Ulf Harnhammar (it can still be used with code that uses 'Kses'; see section:- #2.6). Support for HTML version 5 was added in May 2013 in a beta and in February 2017 in a production release.
166 166
167 See section:- #4.3 for a detailed log of changes in htmLawed over the years, and section:- #4.10 for acknowledgements. 167 See section:- #4.3 for a detailed log of changes in htmLawed over the years, and section:- #4.10 for acknowledgements.
168 168
169 169
170-- 1.4 License & copyright ----------------------------------------o 170-- 1.4 License & copyright ----------------------------------------o
171 171
172 172
173 htmLawed is free and open-source software, copyrighted by Santosh Patnaik, MD, PhD, and dual-licensed with LGPL version 3:- http://www.gnu.org/licenses/lgpl-3.0.txt, and GPL version 2:- http://www.gnu.org/licenses/gpl-2.0.txt (or later) licenses. 173 htmLawed is free and open-source software, copyrighted by Santosh Patnaik, MD, PhD, and dual-licensed with LGPL version 3:- http://www.gnu.org/licenses/lgpl-3.0.txt, and GPL version 2:- http://www.gnu.org/licenses/gpl-2.0.txt (or later) licenses.
174 174
175 175
176-- 1.5 Terms used here --------------------------------------------o 176-- 1.5 Terms used here --------------------------------------------o
177 177
178 178
179 In this document, only HTML body-level elements are considered. htmLawed does not have support for head-level elements, 'body', and the frame-level elements, 'frameset', 'frame' and 'noframes', and these elements are ignored here. 179 In this document, only HTML body-level elements are considered. htmLawed does not have support for head-level elements, 'body', and the frame-level elements, 'frameset', 'frame' and 'noframes', and these elements are ignored here.
180 180
181 * `administrator` - or admin; person setting up the code that utilizes htmLawed; also, `user` 181 * `administrator` - or admin; person setting up the code that utilizes htmLawed; also, `user`
182 * `attributes` - name-value pairs like 'href="http://x.com"' in opening tags 182 * `attributes` - name-value pairs like 'href="http://x.com"' in opening tags
183 * `author` - see `writer` 183 * `author` - see `writer`
184 * `character` - atomic unit of text; internally represented by a numeric `code-point` as specified by the `encoding` or `charset` in use 184 * `character` - atomic unit of text; internally represented by a numeric `code-point` as specified by the `encoding` or `charset` in use
185 * `entity` - markup like '&gt;' and '&#160;' used to refer to a character 185 * `entity` - markup like '&gt;' and '&#160;' used to refer to a character
186 * `element` - HTML element like 'a' and 'img' 186 * `element` - HTML element like 'a' and 'img'
187 * `element content` - content between the opening and closing tags of an element, like 'click' of the '<a href="x">click</a>' element 187 * `element content` - content between the opening and closing tags of an element, like 'click' of the '<a href="x">click</a>' element
188 * `HTML` - implies XHTML unless specified otherwise 188 * `HTML` - implies XHTML unless specified otherwise
189 * `HTML body` - content in the `body` container of an HTML document 189 * `HTML body` - content in the `body` container of an HTML document
190 * `input` - text given to htmLawed to process 190 * `input` - text given to htmLawed to process
191 * `legal` - standard-compliant; also, `valid` 191 * `legal` - standard-compliant; also, `valid`
192 * `processing` - involves filtering, correction, etc., of input 192 * `processing` - involves filtering, correction, etc., of input
193 * `safe` - absence or reduction of certain characters and HTML elements and attributes in HTML of text that can otherwise potentially, and circumstantially, expose text readers to security vulnerabilities like cross-site scripting attacks (XSS) 193 * `safe` - absence or reduction of certain characters and HTML elements and attributes in HTML of text that can otherwise potentially, and circumstantially, expose text readers to security vulnerabilities like cross-site scripting attacks (XSS)
194 * `scheme` - a URL protocol like 'http' and 'ftp' 194 * `scheme` - a URL protocol like 'http' and 'ftp'
195 * `specification` - detailed description including rules that define HTML 195 * `specification` - detailed description including rules that define HTML
196 * `standard` - widely accepted specification 196 * `standard` - widely accepted specification
197 * `style property` - terms like 'border' and 'height' for which declarations are made in values for the 'style' attribute of elements 197 * `style property` - terms like 'border' and 'height' for which declarations are made in values for the 'style' attribute of elements
198 * `tag` - markers like '<a href="x">' and '</a>' delineating element content; the opening tag can contain attributes 198 * `tag` - markers like '<a href="x">' and '</a>' delineating element content; the opening tag can contain attributes
199 * `tag content` - consists of tag markers '<' and '>', element names like 'div', and possibly attributes 199 * `tag content` - consists of tag markers '<' and '>', element names like 'div', and possibly attributes
200 * `user` - administrator 200 * `user` - administrator
201 * `valid` - see `legal` 201 * `valid` - see `legal`
202 * `writer` - end-user like a blog commenter providing the input that is to be processed; also, `author` 202 * `writer` - end-user like a blog commenter providing the input that is to be processed; also, `author`
203 * `XHTML` - XML-compliant HTML; parsing rules for XHTML are more strict than for regular HTML 203 * `XHTML` - XML-compliant HTML; parsing rules for XHTML are more strict than for regular HTML
204 204
205 205
206-- 1.6 Availability -----------------------------------------------o 206-- 1.6 Availability -----------------------------------------------o
207 207
208 208
209 htmLawed can be downloaded for free at its website:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. Besides the 'htmLawed.php' file, the download has the htmLawed documentation (this document) in plain text:- htmLawed_README.txt and HTML:- htmLawed_README.htm formats, a script for testing:- htmLawedTest.php, and a text file for test-cases:- htmLawed_TESTCASE.txt. htmLawed is also available as a PHP class (OOP code) at its website. 209 htmLawed can be downloaded for free at its website:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. Besides the 'htmLawed.php' file, the download has the htmLawed documentation (this document) in plain text:- htmLawed_README.txt and HTML:- htmLawed_README.htm formats, a script for testing:- htmLawedTest.php, and a text file for test-cases:- htmLawed_TESTCASE.txt. htmLawed is also available as a PHP class (OOP code) at its website.
210 210
211 211
212== 2 Usage =======================================================oo 212== 2 Usage =======================================================oo
213 213
214 214
215 htmLawed works in PHP version 4.4 or higher. Either 'include()' the 'htmLawed.php' file, or copy-paste the entire code. 215 htmLawed works in PHP version 4.4 or higher. Either 'include()' the 'htmLawed.php' file, or copy-paste the entire code.
216 216
217 To use with PHP 4.3, have the following code included: 217 To use with PHP 4.3, have the following code included:
218 218
219 if(!function_exists('ctype_digit')){ 219 if(!function_exists('ctype_digit')){
220 function ctype_digit($var){ 220 function ctype_digit($var){
221 return ((int) $var == $var); 221 return ((int) $var == $var);
222 } 222 }
223 } 223 }
224 224
225 225
226-- 2.1 Simple ------------------------------------------------------ 226-- 2.1 Simple ------------------------------------------------------
227 227
228 228
229 The input text to be processed, '$text', is passed as an argument of type string; 'htmLawed()' returns the processed string: 229 The input text to be processed, '$text', is passed as an argument of type string; 'htmLawed()' returns the processed string:
230 230
231 $processed = htmLawed($text); 231 $processed = htmLawed($text);
232 232
233 With the 'htmLawed class' (section:- #1.6), usage is: 233 With the 'htmLawed class' (section:- #1.6), usage is:
234 234
235 $processed = htmLawed::hl($text); 235 $processed = htmLawed::hl($text);
236 236
237 *Notes*: (1) If input is from a '$_GET' or '$_POST' value, and 'magic quotes' are enabled on the PHP setup, run 'stripslashes()' on the input before passing to htmLawed. (2) htmLawed does not have support for head-level elements, 'body', and the frame-level elements, 'frameset', 'frame' and 'noframes'. 237 *Notes*: (1) If input is from a '$_GET' or '$_POST' value, and 'magic quotes' are enabled on the PHP setup, run 'stripslashes()' on the input before passing to htmLawed. (2) htmLawed does not have support for head-level elements, 'body', and the frame-level elements, 'frameset', 'frame' and 'noframes'.
238 238
239 By default, htmLawed will process the text allowing all valid HTML elements/tags and commonly used URL schemes and CSS style properties. It will allow Javascript code, 'CDATA' sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- '$config' and '$spec': 239 By default, htmLawed will process the text allowing all valid HTML elements/tags and commonly used URL schemes and CSS style properties. It will allow Javascript code, 'CDATA' sections and HTML comments, balance tags, and ensure proper nesting of elements. Such actions can be configured using two other optional arguments -- '$config' and '$spec':
240 240
241 $processed = htmLawed($text, $config, $spec); 241 $processed = htmLawed($text, $config, $spec);
242 242
243 The '$config' and '$spec' arguments are detailed below. Some examples are shown in section:- #2.9. For maximum protection against 'XSS' and other security vulnerabilities, consider using the 'safe' parameter; see section:- #3.6. 243 The '$config' and '$spec' arguments are detailed below. Some examples are shown in section:- #2.9. For maximum protection against 'XSS' and other security vulnerabilities, consider using the 'safe' parameter; see section:- #3.6.
244 244
245 245
246-- 2.2 Configuring htmLawed using the '$config' argument ---------o 246-- 2.2 Configuring htmLawed using the '$config' argument ---------o
247 247
248 248
249 '$config' instructs htmLawed on how to tackle certain tasks. When '$config' is not specified, or not set as an array (e.g., '$config = 1'), htmLawed will take default actions. One or many of the task-action or parameter-value pairs can be specified in '$config' as array key-value pairs. If a parameter is not specified, htmLawed will use the default value for it, indicated further below. In PHP code, parameter values that are integers should not be quoted and should be used as numeric types (unless meant as string/text). Thus, for instance: 249 '$config' instructs htmLawed on how to tackle certain tasks. When '$config' is not specified, or not set as an array (e.g., '$config = 1'), htmLawed will take default actions. One or many of the task-action or parameter-value pairs can be specified in '$config' as array key-value pairs. If a parameter is not specified, htmLawed will use the default value for it, indicated further below. In PHP code, parameter values that are integers should not be quoted and should be used as numeric types (unless meant as string/text). Thus, for instance:
250 250
251 $config = array('comment'=>0, 'cdata'=>1, 'elements'=>'a, b, strong'); 251 $config = array('comment'=>0, 'cdata'=>1, 'elements'=>'a, b, strong');
252 $processed = htmLawed($text, $config); 252 $processed = htmLawed($text, $config);
253 253
254 Below are the various parameters that can be specified in '$config'. 254 Below are the various parameters that can be specified in '$config'.
255 255
256 Key: '*' default, '^' different from htmLawed versions below 1.2, '~' different default when 'valid_xhtml' is set to '1' (see section:- #3.5), '"' different default when 'safe' is set to '1' (see section:- #3.6) 256 Key: '*' default, '^' different from htmLawed versions below 1.2, '~' different default when 'valid_xhtml' is set to '1' (see section:- #3.5), '"' different default when 'safe' is set to '1' (see section:- #3.6)
257 257
258 *abs_url* 258 *abs_url*
259 Make URLs absolute or relative; '$config["base_url"]' needs to be set; see section:- #3.4.4 259 Make URLs absolute or relative; '$config["base_url"]' needs to be set; see section:- #3.4.4
260 260
261 '-1' - make relative 261 '-1' - make relative
262 '0' - no action * 262 '0' - no action *
263 '1' - make absolute 263 '1' - make absolute
264 264
265 *and_mark* 265 *and_mark*
266 Mark '&' characters in the original input; see section:- #3.2 266 Mark '&' characters in the original input; see section:- #3.2
267 267
268 *anti_link_spam* 268 *anti_link_spam*
269 Anti-link-spam measure; see section:- #3.4.7 269 Anti-link-spam measure; see section:- #3.4.7
270 270
271 '0' - no measure taken * 271 '0' - no measure taken *
272 `array("regex1", "regex2")` - will ensure a 'rel' attribute with 'nofollow' in its value in case the 'href' attribute value matches the regular expression pattern 'regex1', and/or will remove 'href' if its value matches the regular expression pattern 'regex2'. E.g., 'array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")'; see section:- #3.4.7 for more. 272 `array("regex1", "regex2")` - will ensure a 'rel' attribute with 'nofollow' in its value in case the 'href' attribute value matches the regular expression pattern 'regex1', and/or will remove 'href' if its value matches the regular expression pattern 'regex2'. E.g., 'array("/./", "/://\W*(?!(abc\.com|xyz\.org))/")'; see section:- #3.4.7 for more.
273 273
274 *anti_mail_spam* 274 *anti_mail_spam*
275 Anti-mail-spam measure; see section:- #3.4.7 275 Anti-mail-spam measure; see section:- #3.4.7
276 276
277 '0' - no measure taken * 277 '0' - no measure taken *
278 `word` - '@' in mail address in 'href' attribute value is replaced with specified `word` 278 `word` - '@' in mail address in 'href' attribute value is replaced with specified `word`
279 279
280 *balance* 280 *balance*
281 Balance tags for well-formedness and proper nesting; see section:- #3.3.3 281 Balance tags for well-formedness and proper nesting; see section:- #3.3.3
282 282
283 '0' - no 283 '0' - no
284 '1' - yes * 284 '1' - yes *
285 285
286 *base_url* 286 *base_url*
287 Base URL value that needs to be set if '$config["abs_url"]' is not '0'; see section:- #3.4.4 287 Base URL value that needs to be set if '$config["abs_url"]' is not '0'; see section:- #3.4.4
288 288
289 *cdata* 289 *cdata*
290 Handling of 'CDATA' sections; see section:- #3.3.1 290 Handling of 'CDATA' sections; see section:- #3.3.1
291 291
292 '0' - don't consider 'CDATA' sections as markup and proceed as if plain text " 292 '0' - don't consider 'CDATA' sections as markup and proceed as if plain text "
293 '1' - remove 293 '1' - remove
294 '2' - allow, but neutralize any '<', '>', and '&' inside by converting them to named entities 294 '2' - allow, but neutralize any '<', '>', and '&' inside by converting them to named entities
295 '3' - allow * 295 '3' - allow *
296 296
297 *clean_ms_char* 297 *clean_ms_char*
298 Replace `discouraged` characters introduced by Microsoft Word, etc.; see section:- #3.1 298 Replace `discouraged` characters introduced by Microsoft Word, etc.; see section:- #3.1
299 299
300 '0' - no * 300 '0' - no *
301 '1' - yes 301 '1' - yes
302 '2' - yes, but replace special single & double quotes with ordinary ones 302 '2' - yes, but replace special single & double quotes with ordinary ones
303 303
304 *comment* 304 *comment*
305 Handling of HTML comments; see section:- #3.3.1 305 Handling of HTML comments; see section:- #3.3.1
306 306
307 '0' - don't consider comments as markup and proceed as if plain text " 307 '0' - don't consider comments as markup and proceed as if plain text "
308 '1' - remove 308 '1' - remove
309 '2' - allow, but neutralize any '<', '>', and '&' inside by converting to named entities 309 '2' - allow, but neutralize any '<', '>', and '&' inside by converting to named entities
310 '3' - allow * 310 '3' - allow *
311 311
312 *css_expression* 312 *css_expression*
313 Allow dynamic CSS expression by not removing the expression from CSS property values in 'style' attributes; see section:- #3.4.8 313 Allow dynamic CSS expression by not removing the expression from CSS property values in 'style' attributes; see section:- #3.4.8
314 314
315 '0' - remove * 315 '0' - remove *
316 '1' - allow 316 '1' - allow
317 317
318 *deny_attribute* 318 *deny_attribute*
319 Denied HTML attributes; see section:- #3.4 319 Denied HTML attributes; see section:- #3.4
320 320
321 '0' - none * 321 '0' - none *
322 `string` - dictated by values in `string` 322 `string` - dictated by values in `string`
323 'on*' - on* event attributes like 'onfocus' not allowed " 323 'on*' - on* event attributes like 'onfocus' not allowed "
324 324
325 *direct_nest_list* 325 *direct_nest_list*
326 Allow direct nesting of a list within another without requiring it to be a list item; see section:- #3.3.4 326 Allow direct nesting of a list within another without requiring it to be a list item; see section:- #3.3.4
327 327
328 '0' - no * 328 '0' - no *
329 '1' - yes 329 '1' - yes
330 330
331 *elements* 331 *elements*
332 Allowed HTML elements; see section:- #3.3 332 Allowed HTML elements; see section:- #3.3
333 333
334 `all` - *^ 334 `all` - *^
335 '* -acronym -big -center -dir -font -isindex -s -strike -tt' - ~^ 335 '* -acronym -big -center -dir -font -isindex -s -strike -tt' - ~^
336 `applet, audio, canvas, embed, iframe, object, script, and video elements not allowed` - "^ 336 `applet, audio, canvas, embed, iframe, object, script, and video elements not allowed` - "^
337 337
338 *hexdec_entity* 338 *hexdec_entity*
339 Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see section:- #3.2 339 Allow hexadecimal numeric entities and do not convert to the more widely accepted decimal ones, or convert decimal to hexadecimal ones; see section:- #3.2
340 340
341 '0' - no 341 '0' - no
342 '1' - yes * 342 '1' - yes *
343 '2' - convert decimal to hexadecimal ones 343 '2' - convert decimal to hexadecimal ones
344 344
345 *hook* 345 *hook*
346 Name of an optional hook function to alter the input string, '$config' or '$spec' before htmLawed enters the main phase of its work; see section:- #3.7 346 Name of an optional hook function to alter the input string, '$config' or '$spec' before htmLawed enters the main phase of its work; see section:- #3.7
347 347
348 '0' - no hook function * 348 '0' - no hook function *
349 `name` - `name` is name of the hook function 349 `name` - `name` is name of the hook function
350 350
351 *hook_tag* 351 *hook_tag*
352 Name of an optional hook function to alter tag content finalized by htmLawed; see section:- #3.4.9 352 Name of an optional hook function to alter tag content finalized by htmLawed; see section:- #3.4.9
353 353
354 '0' - no hook function * 354 '0' - no hook function *
355 `name` - `name` is name of the hook function 355 `name` - `name` is name of the hook function
356 356
357 *keep_bad* 357 *keep_bad*
358 Neutralize `bad` tags by converting their '<' and '>' characters to entities, or remove them; see section:- #3.3.3 358 Neutralize `bad` tags by converting their '<' and '>' characters to entities, or remove them; see section:- #3.3.3
359 359
360 '0' - remove 360 '0' - remove
361 '1' - neutralize both tags and element content 361 '1' - neutralize both tags and element content
362 '2' - remove tags but neutralize element content 362 '2' - remove tags but neutralize element content
363 '3' and '4' - like '1' and '2' but remove if text ('pcdata') is invalid in parent element 363 '3' and '4' - like '1' and '2' but remove if text ('pcdata') is invalid in parent element
364 '5' and '6' * - like '3' and '4' but line-breaks, tabs and spaces are left 364 '5' and '6' * - like '3' and '4' but line-breaks, tabs and spaces are left
365 365
366 *lc_std_val* 366 *lc_std_val*
367 For XHTML compliance, predefined, standard attribute values, like 'get' for the 'method' attribute of 'form', must be lowercased; see section:- #3.4.5 367 For XHTML compliance, predefined, standard attribute values, like 'get' for the 'method' attribute of 'form', must be lowercased; see section:- #3.4.5
368 368
369 '0' - no 369 '0' - no
370 '1' - yes * 370 '1' - yes *
371 371
372 *make_tag_strict* 372 *make_tag_strict*
373 Transform or remove these deprecated HTML elements, even if they are allowed by the admin: acronym, applet, big, center, dir, font, isindex, s, strike, tt; see section:- #3.3.2 373 Transform or remove these deprecated HTML elements, even if they are allowed by the admin: acronym, applet, big, center, dir, font, isindex, s, strike, tt; see section:- #3.3.2
374 374
375 '0' - no 375 '0' - no
376 '1' - yes, but leave 'applet' and 'isindex' that currently cannot be transformed *^ 376 '1' - yes, but leave 'applet' and 'isindex' that currently cannot be transformed *^
377 '2' - yes, removing 'applet' and 'isindex' elements and their contents (nested elements remain) ~^ 377 '2' - yes, removing 'applet' and 'isindex' elements and their contents (nested elements remain) ~^
378 378
379 *named_entity* 379 *named_entity*
380 Allow non-universal named HTML entities, or convert to numeric ones; see section:- #3.2 380 Allow non-universal named HTML entities, or convert to numeric ones; see section:- #3.2
381 381
382 '0' - convert 382 '0' - convert
383 '1' - allow * 383 '1' - allow *
384 384
385 *no_deprecated_attr* 385 *no_deprecated_attr*
386 Allow deprecated attributes or transform them; see section:- #3.4.6 386 Allow deprecated attributes or transform them; see section:- #3.4.6
387 387
388 '0' - allow 388 '0' - allow
389 '1' - transform, but 'name' attributes for 'a' and 'map' are retained * 389 '1' - transform, but 'name' attributes for 'a' and 'map' are retained *
390 '2' - transform 390 '2' - transform
391 391
392 *parent* 392 *parent*
393 Name of the parent element, possibly imagined, that will hold the input; see section:- #3.3 393 Name of the parent element, possibly imagined, that will hold the input; see section:- #3.3
394 394
395 *safe* 395 *safe*
396 Magic parameter to make input the most secure against vulnerabilities like XSS without needing to specify other relevant '$config' parameters; see section:- #3.6 396 Magic parameter to make input the most secure against vulnerabilities like XSS without needing to specify other relevant '$config' parameters; see section:- #3.6
397 397
398 '0' - no * 398 '0' - no *
399 '1' - will auto-adjust other relevant '$config' parameters (indicated by '"' in this list) ^ 399 '1' - will auto-adjust other relevant '$config' parameters (indicated by '"' in this list) ^
400 400
401 *schemes* 401 *schemes*
402 Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs (or '!' to `deny` any URL); '*' covers all unspecified attributes; see section:- #3.4.3 402 Array of attribute-specific, comma-separated, lower-cased list of schemes (protocols) allowed in attributes accepting URLs (or '!' to `deny` any URL); '*' covers all unspecified attributes; see section:- #3.4.3
403 403
404 'href: aim, app, feed, file, ftp, gopher, http, https, javascript, irc, mailto, news, nntp, sftp, ssh, tel, telnet; *:data, file, http, https, javascript' *^ 404 'href: aim, app, feed, file, ftp, gopher, http, https, javascript, irc, mailto, news, nntp, sftp, ssh, tel, telnet; *:data, file, http, https, javascript' *^
405 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style: !; *:file, http, https' " 405 'href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style: !; *:file, http, https' "
406 406
407 *show_setting* 407 *show_setting*
408 Name of a PHP variable to assign the `finalized` '$config' and '$spec' values; see section:- #3.8 408 Name of a PHP variable to assign the `finalized` '$config' and '$spec' values; see section:- #3.8
409 409
410 *style_pass* 410 *style_pass*
411 Ignore 'style' attribute values, letting them through without any alteration 411 Ignore 'style' attribute values, letting them through without any alteration
412 412
413 '0' - no * 413 '0' - no *
414 '1' - htmLawed will let through any 'style' value; see section:- #3.4.8 414 '1' - htmLawed will let through any 'style' value; see section:- #3.4.8
415 415
416 *tidy* 416 *tidy*
417 Beautify or compact HTML code; see section:- #3.3.5 417 Beautify or compact HTML code; see section:- #3.3.5
418 418
419 '-1' - compact 419 '-1' - compact
420 '0' - no * 420 '0' - no *
421 '1' or `string` - beautify (custom format specified by 'string') 421 '1' or `string` - beautify (custom format specified by 'string')
422 422
423 *unique_ids* 423 *unique_ids*
424 'id' attribute value checks; see section:- #3.4.2 424 'id' attribute value checks; see section:- #3.4.2
425 425
426 '0' - no 426 '0' - no
427 '1' - remove duplicate and/or invalid ones * 427 '1' - remove duplicate and/or invalid ones *
428 `word` - remove invalid ones and replace duplicate ones with new and unique ones based on the `word`; the admin-specified `word` cannot contain a space character 428 `word` - remove invalid ones and replace duplicate ones with new and unique ones based on the `word`; the admin-specified `word` cannot contain a space character
429 429
430 *valid_xhtml* 430 *valid_xhtml*
431 Magic parameter to make input the most valid XHTML without needing to specify other relevant '$config' parameters; see section:- #3.5 431 Magic parameter to make input the most valid XHTML without needing to specify other relevant '$config' parameters; see section:- #3.5
432 432
433 '0' - no * 433 '0' - no *
434 '1' - will auto-adjust other relevant '$config' parameters (indicated by '~' in this list) 434 '1' - will auto-adjust other relevant '$config' parameters (indicated by '~' in this list)
435 435
436 *xml:lang* 436 *xml:lang*
437 Auto-add 'xml:lang' attribute; see section:- #3.4.1 437 Auto-add 'xml:lang' attribute; see section:- #3.4.1
438 438
439 '0' - no * 439 '0' - no *
440 '1' - add if 'lang' attribute is present 440 '1' - add if 'lang' attribute is present
441 '2' - add if 'lang' attribute is present, and remove 'lang' ~ 441 '2' - add if 'lang' attribute is present, and remove 'lang' ~
442 442
443 443
444-- 2.3 Extra HTML specifications using the $spec parameter --------o 444-- 2.3 Extra HTML specifications using the $spec parameter --------o
445 445
446 446
447 The '$spec' argument of htmLawed can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policies. '$spec' is specified as a string of text containing one or more `rules`, with multiple rules separated from each other by a semi-colon (';'). E.g., 447 The '$spec' argument of htmLawed can be used to disallow an otherwise legal attribute for an element, or to restrict the attribute's values. This can also be helpful as a security measure (e.g., in certain versions of browsers, certain values can cause buffer overflows and denial of service attacks), or in enforcing admin policies. '$spec' is specified as a string of text containing one or more `rules`, with multiple rules separated from each other by a semi-colon (';'). E.g.,
448 448
449 $spec = 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'; 449 $spec = 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt';
450 $processed = htmLawed($text, $config, $spec); 450 $processed = htmLawed($text, $config, $spec);
451 451
452 Or, 452 Or,
453 453
454 $processed = htmLawed($text, $config, 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt'); 454 $processed = htmLawed($text, $config, 'i=-*; td, tr=style, id, -*; a=id(match="/[a-z][a-z\d.:\-`"]*/i"/minval=2), href(maxlen=100/minlen=34); img=-width,-alt');
455 455
456 A rule begins with an HTML *element* name(s) (`rule-element`), for which the rule applies, followed by an equal-to (=) sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., 'th,td,tr='. 456 A rule begins with an HTML *element* name(s) (`rule-element`), for which the rule applies, followed by an equal-to (=) sign. A rule-element may represent multiple elements if comma (,)-separated element names are used. E.g., 'th,td,tr='.
457 457
458 Rest of the rule consists of comma-separated HTML *attribute names*. A minus (-) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., '-width'. To deny all attributes, '-*' can be used. 458 Rest of the rule consists of comma-separated HTML *attribute names*. A minus (-) character before an attribute means that the attribute is not permitted inside the rule-element. E.g., '-width'. To deny all attributes, '-*' can be used.
459 459
460 Following shows examples of rule excerpts with rule-element 'a' and the attributes that are being permitted: 460 Following shows examples of rule excerpts with rule-element 'a' and the attributes that are being permitted:
461 461
462 * 'a=' - all 462 * 'a=' - all
463 * 'a=id' - all 463 * 'a=id' - all
464 * 'a=href, title, -id, -onclick' - all except 'id' and 'onclick' 464 * 'a=href, title, -id, -onclick' - all except 'id' and 'onclick'
465 * 'a=*, id, -id' - all except 'id' 465 * 'a=*, id, -id' - all except 'id'
466 * 'a=-*' - none 466 * 'a=-*' - none
467 * 'a=-*, href, title' - none except 'href' and 'title' 467 * 'a=-*, href, title' - none except 'href' and 'title'
468 * 'a=-*, -id, href, title' - none except 'href' and 'title' 468 * 'a=-*, -id, href, title' - none except 'href' and 'title'
469 469
470 Rules regarding *attribute values* are optionally specified inside round brackets after attribute names in solidus (/)-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None or one or more of the following parameters may be specified: 470 Rules regarding *attribute values* are optionally specified inside round brackets after attribute names in solidus (/)-separated `parameter = value` pairs. E.g., 'title(maxlen=30/minlen=5)'. None or one or more of the following parameters may be specified:
471 471
472 * 'oneof' - one or more choices separated by '|' that the value should match; if only one choice is provided, then the value must match that choice; matching is case-sensitive 472 * 'oneof' - one or more choices separated by '|' that the value should match; if only one choice is provided, then the value must match that choice; matching is case-sensitive
473 473
474 * 'noneof' - one or more choices separated by '|' that the value should not match; matching is case-sensitive 474 * 'noneof' - one or more choices separated by '|' that the value should not match; matching is case-sensitive
475 475
476 * 'maxlen' and 'minlen' - upper and lower limits for the number of characters in the attribute value; specified in numbers 476 * 'maxlen' and 'minlen' - upper and lower limits for the number of characters in the attribute value; specified in numbers
477 477
478 * 'maxval' and 'minval' - upper and lower limits for the numerical value specified in the attribute value; specified in numbers 478 * 'maxval' and 'minval' - upper and lower limits for the numerical value specified in the attribute value; specified in numbers
479 479
480 * 'match' and 'nomatch' - pattern that the attribute value should or should not match; specified as PHP/PCRE-compatible regular expressions with delimiters and possibly modifiers (e.g., to specify case-sensitivity for matching) 480 * 'match' and 'nomatch' - pattern that the attribute value should or should not match; specified as PHP/PCRE-compatible regular expressions with delimiters and possibly modifiers (e.g., to specify case-sensitivity for matching)
481 481
482 * 'default' - a value to force on the attribute if the value provided by the writer does not fit any of the specified parameters 482 * 'default' - a value to force on the attribute if the value provided by the writer does not fit any of the specified parameters
483 483
484 If 'default' is not set and the attribute value does not satisfy any of the specified parameters, then the attribute is removed. The 'default' value can also be used to force all attribute declarations to take the same value (by getting the values declared illegal by setting, e.g., 'maxlen' to '-1'). 484 If 'default' is not set and the attribute value does not satisfy any of the specified parameters, then the attribute is removed. The 'default' value can also be used to force all attribute declarations to take the same value (by getting the values declared illegal by setting, e.g., 'maxlen' to '-1').
485 485
486 Examples with `input` '<input title="WIDTH" value="10em" /><input title="length" value="5" class="ic1 ic2" />' are shown below. 486 Examples with `input` '<input title="WIDTH" value="10em" /><input title="length" value="5" class="ic1 ic2" />' are shown below.
487 487
488 `Rule`: 'input=title(maxlen=60/minlen=6), value' 488 `Rule`: 'input=title(maxlen=60/minlen=6), value'
489 `Output`: '<input value="10em" /><input title="length" value="5" class="ic1 ic2" />' 489 `Output`: '<input value="10em" /><input title="length" value="5" class="ic1 ic2" />'
490 490
491 `Rule`: 'input=title(), value(maxval=8/default=6)' 491 `Rule`: 'input=title(), value(maxval=8/default=6)'
492 `Output`: '<input title="WIDTH" value="6" /><input title="length" value="5" class="ic1 ic2" />' 492 `Output`: '<input title="WIDTH" value="6" /><input title="length" value="5" class="ic1 ic2" />'
493 493
494 `Rule`: 'input=title(nomatch=%w.d%i), value(match=%em%/default=6em)' 494 `Rule`: 'input=title(nomatch=%w.d%i), value(match=%em%/default=6em)'
495 `Output`: '<input value="10em" /><input title="length" value="6em" class="ic1 ic2" />' 495 `Output`: '<input value="10em" /><input title="length" value="6em" class="ic1 ic2" />'
496 496
497 `Rule`: 'input=class(noneof=ic2|ic3/oneof=ic1|ic4), title(oneof=height|depth/default=depth), value(noneof=5|6)' 497 `Rule`: 'input=class(noneof=ic2|ic3/oneof=ic1|ic4), title(oneof=height|depth/default=depth), value(noneof=5|6)'
498 `Output`: '<input title="depth" value="10em" /><input title="depth" class="ic1" />' 498 `Output`: '<input title="depth" value="10em" /><input title="depth" class="ic1" />'
499 499
500 *Special characters*: The characters ';', ',', '/', '(', ')', '|', '~' and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be `escaped` by enclosing in pairs of double-quotes ('"'). A back-tick ('`') can be used to escape a literal '"'. An example rule illustrating this is 'input=value(maxlen=30/match="/^\w/"/default="your `"ID`"")'. 500 *Special characters*: The characters ';', ',', '/', '(', ')', '|', '~' and space have special meanings in the rules. Words in the rules that use such characters, or the characters themselves, should be `escaped` by enclosing in pairs of double-quotes ('"'). A back-tick ('`') can be used to escape a literal '"'. An example rule illustrating this is 'input=value(maxlen=30/match="/^\w/"/default="your `"ID`"")'.
501 501
502 *Attributes that accept multiple values*: If an attribute is 'accesskey', 'class', 'itemtype' or 'rel', which can have multiple, space-separated values, or 'srcset', which can have multiple, comma-separated values, htmLawed will parse the attribute value for such multiple values and will individually test each of them. 502 *Attributes that accept multiple values*: If an attribute is 'accesskey', 'class', 'itemtype' or 'rel', which can have multiple, space-separated values, or 'srcset', which can have multiple, comma-separated values, htmLawed will parse the attribute value for such multiple values and will individually test each of them.
503 503
504 *Note*: To deny an attribute for all elements for which it is legal, '$config["deny_attribute"]' (see section:- #3.4) can be used instead of '$spec'. Also, attributes can be allowed element-specifically through '$spec' while being denied globally through '$config["deny_attribute"]'. The 'hook_tag' parameter (section:- #3.4.9) can also be possibly used to implement a functionality like that achieved using '$spec' functionality. 504 *Note*: To deny an attribute for all elements for which it is legal, '$config["deny_attribute"]' (see section:- #3.4) can be used instead of '$spec'. Also, attributes can be allowed element-specifically through '$spec' while being denied globally through '$config["deny_attribute"]'. The 'hook_tag' parameter (section:- #3.4.9) can also be possibly used to implement a functionality like that achieved using '$spec' functionality.
505 505
506 *Note*: Attributes' specifications for an element may be set through multiple rules. In case of conflict, the attribute specification in the first rule will get precedence. 506 *Note*: Attributes' specifications for an element may be set through multiple rules. In case of conflict, the attribute specification in the first rule will get precedence.
507 507
508 '$spec' can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. Thus, the following value of '$spec' will permit the custom uses of the standard 'rel' attribute in 'input' (not permitted as per standards) and of a non-standard attribute, 'vFlag', in 'img'. 508 '$spec' can also be used to permit custom, non-standard attributes as well as custom rules for standard attributes. Thus, the following value of '$spec' will permit the custom uses of the standard 'rel' attribute in 'input' (not permitted as per standards) and of a non-standard attribute, 'vFlag', in 'img'.
509 509
510 $spec = 'img=vFlag; input=rel' 510 $spec = 'img=vFlag; input=rel'
511 511
512 The attribute names must begin with a letter and cannot have space, equal-to (=) and solidus (/) characters. 512 The attribute names must begin with a letter and cannot have space, equal-to (=) and solidus (/) characters.
513 513
514 514
515-- 2.4 Performance time & memory usage ----------------------------o 515-- 2.4 Performance time & memory usage ----------------------------o
516 516
517 517
518 The time and memory consumed during text processing by htmLawed depends on its configuration, the size of the input, and the amount, nestedness and well-formedness of the HTML markup within the input. In particular, tag balancing and beautification each can increase the processing time by about a quarter. 518 The time and memory consumed during text processing by htmLawed depends on its configuration, the size of the input, and the amount, nestedness and well-formedness of the HTML markup within the input. In particular, tag balancing and beautification each can increase the processing time by about a quarter.
519 519
520 The htmLawed demo:- htmLawedTest.php can be used to evaluate the performance and effects of different types of input and '$config'. 520 The htmLawed demo:- htmLawedTest.php can be used to evaluate the performance and effects of different types of input and '$config'.
521 521
522 522
523-- 2.5 Some security risks to keep in mind ------------------------o 523-- 2.5 Some security risks to keep in mind ------------------------o
524 524
525 525
526 When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially `dangerous` HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. For example, following increase security risks: 526 When setting the parameters/arguments (like those to allow certain HTML elements) for use with htmLawed, one should bear in mind that the setting may let through potentially `dangerous` HTML code which is meant to steal user-data, deface a website, render a page non-functional, etc. Unless end-users, either people or software, supplying the content are completely trusted, security issues arising from the degree of HTML usage permitted through htmLawed's setting should be considered. For example, following increase security risks:
527 527
528 * Allowing 'script', 'applet', 'embed', 'iframe', 'canvas', 'audio', 'video' or 'object' elements, or certain of their attributes like 'allowscriptaccess' 528 * Allowing 'script', 'applet', 'embed', 'iframe', 'canvas', 'audio', 'video' or 'object' elements, or certain of their attributes like 'allowscriptaccess'
529 529
530 * Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., '<!--[if gte IE 4]><script>alert("xss");</script><![endif]-->') 530 * Allowing HTML comments (some Internet Explorer versions are vulnerable with, e.g., '<!--[if gte IE 4]><script>alert("xss");</script><![endif]-->')
531 531
532 * Allowing dynamic CSS expressions (some Internet Explorer versions are vulnerable) 532 * Allowing dynamic CSS expressions (some Internet Explorer versions are vulnerable)
533 533
534 * Allowing the 'style' attribute 534 * Allowing the 'style' attribute
535 535
536 To remove `unsecure` HTML, code-developers using htmLawed must set '$config' appropriately. E.g., '$config["elements"] = "* -script"' to deny the 'script' element (section:- #3.3), '$config["safe"] = 1' to auto-configure certain htmLawed parameters for maximizing security (section:- #3.6), etc. 536 To remove `unsecure` HTML, code-developers using htmLawed must set '$config' appropriately. E.g., '$config["elements"] = "* -script"' to deny the 'script' element (section:- #3.3), '$config["safe"] = 1' to auto-configure certain htmLawed parameters for maximizing security (section:- #3.6), etc.
537 537
538 Permitting the '*style*' attribute brings in risks of `click-jacking`, `phishing`, web-page overlays, etc., `even` when the 'safe' parameter is enabled (see section:- #3.6). Except for URLs and a few other things like CSS dynamic expressions, htmLawed currently does not check every CSS style property. It does provide ways for the code-developer implementing htmLawed to do such checks through htmLawed's '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing 'style' completely and relying on CSS classes and stylesheet files is recommended. 538 Permitting the '*style*' attribute brings in risks of `click-jacking`, `phishing`, web-page overlays, etc., `even` when the 'safe' parameter is enabled (see section:- #3.6). Except for URLs and a few other things like CSS dynamic expressions, htmLawed currently does not check every CSS style property. It does provide ways for the code-developer implementing htmLawed to do such checks through htmLawed's '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing 'style' completely and relying on CSS classes and stylesheet files is recommended.
539 539
540 htmLawed does not check or correct the character *encoding* of the input it receives. In conjunction with permissive circumstances, such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can allow for an exploit (like Google's `UTF-7/XSS` vulnerability of the past). 540 htmLawed does not check or correct the character *encoding* of the input it receives. In conjunction with permissive circumstances, such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can allow for an exploit (like Google's `UTF-7/XSS` vulnerability of the past).
541 541
542 Occasionally, though very rarely, the default settings with which htmLawed runs may change between different versions of htmLawed. Admins should keep this in mind when upgrading htmLawed. Important changes in htmLawed's default behavior in new releases of the software are noted in section:- #4.5 on upgrades. 542 Occasionally, though very rarely, the default settings with which htmLawed runs may change between different versions of htmLawed. Admins should keep this in mind when upgrading htmLawed. Important changes in htmLawed's default behavior in new releases of the software are noted in section:- #4.5 on upgrades.
543 543
544 544
545-- 2.6 Use with 'kses()' code -------------------------------------o 545-- 2.6 Use with 'kses()' code -------------------------------------o
546 546
547 547
548 The 'Kses' PHP script for HTML filtering is used by many applications (like 'WordPress', as in year 2012). It is possible to have such applications use htmLawed instead, since it is compatible with code that calls the 'kses()' function declared in the 'Kses' file (usually named 'kses.php'). E.g., application code like this will continue to work after replacing 'Kses' with htmLawed: 548 The 'Kses' PHP script for HTML filtering is used by many applications (like 'WordPress', as in year 2012). It is possible to have such applications use htmLawed instead, since it is compatible with code that calls the 'kses()' function declared in the 'Kses' file (usually named 'kses.php'). E.g., application code like this will continue to work after replacing 'Kses' with htmLawed:
549 549
550 $comment_filtered = kses($comment_input, array('a'=>array(), 'b'=>array(), 'i'=>array())); 550 $comment_filtered = kses($comment_input, array('a'=>array(), 'b'=>array(), 'i'=>array()));
551 551
552 If the application uses a 'Kses' file that has the 'kses()' function declared, then, to have the application use htmLawed instead of 'Kses', rename 'htmLawed.php' (to 'kses.php', e.g.) and replace the 'Kses' file (or just replace the code in the 'Kses' file with the htmLawed code). If the 'kses()' function in the 'Kses' file had been renamed by the application developer (e.g., in 'WordPress', it is named 'wp_kses()'), then appropriately rename the 'kses()' function in the htmLawed code. Then, add the following code (which was a part of htmLawed prior to version 1.2): 552 If the application uses a 'Kses' file that has the 'kses()' function declared, then, to have the application use htmLawed instead of 'Kses', rename 'htmLawed.php' (to 'kses.php', e.g.) and replace the 'Kses' file (or just replace the code in the 'Kses' file with the htmLawed code). If the 'kses()' function in the 'Kses' file had been renamed by the application developer (e.g., in 'WordPress', it is named 'wp_kses()'), then appropriately rename the 'kses()' function in the htmLawed code. Then, add the following code (which was a part of htmLawed prior to version 1.2):
553 553
554 // kses compatibility 554 // kses compatibility
555 function kses($t, $h, $p=array('http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'gopher', 'mailto')){ 555 function kses($t, $h, $p=array('http', 'https', 'ftp', 'news', 'nntp', 'telnet', 'gopher', 'mailto')){
556 foreach($h as $k=>$v){ 556 foreach($h as $k=>$v){
557 $h[$k]['n']['*'] = 1; 557 $h[$k]['n']['*'] = 1;
558 } 558 }
559 $C['cdata'] = $C['comment'] = $C['make_tag_strict'] = $C['no_deprecated_attr'] = $C['unique_ids'] = 0; 559 $C['cdata'] = $C['comment'] = $C['make_tag_strict'] = $C['no_deprecated_attr'] = $C['unique_ids'] = 0;
560 $C['keep_bad'] = 1; 560 $C['keep_bad'] = 1;
561 $C['elements'] = count($h) ? strtolower(implode(',', array_keys($h))) : '-*'; 561 $C['elements'] = count($h) ? strtolower(implode(',', array_keys($h))) : '-*';
562 $C['hook'] = 'kses_hook'; 562 $C['hook'] = 'kses_hook';
563 $C['schemes'] = '*:'. implode(',', $p); 563 $C['schemes'] = '*:'. implode(',', $p);
564 return htmLawed($t, $C, $h); 564 return htmLawed($t, $C, $h);
565 } 565 }
566 566
567 function kses_hook($t, &$C, &$S){ 567 function kses_hook($t, &$C, &$S){
568 return $t; 568 return $t;
569 } 569 }
570 570
571 If the 'Kses' file used by the application has been significantly altered by the application developers, then one may need a different approach. E.g., with 'WordPress' (as in the year 2012), it is best to copy the htmLawed code, along with the above-mentioned additions, to 'wp-includes/kses.php', rename the newly added function 'kses()' to 'wp_kses()', and delete the code for the original 'wp_kses()' function. 571 If the 'Kses' file used by the application has been significantly altered by the application developers, then one may need a different approach. E.g., with 'WordPress' (as in the year 2012), it is best to copy the htmLawed code, along with the above-mentioned additions, to 'wp-includes/kses.php', rename the newly added function 'kses()' to 'wp_kses()', and delete the code for the original 'wp_kses()' function.
572 572
573 If the 'Kses' code has a non-empty hook function (e.g., 'wp_kses_hook()' in case of 'WordPress'), then the code for htmLawed's 'kses_hook()' function should be appropriately edited. However, the requirement of the hook function should be re-evaluated considering that htmLawed has extra capabilities. With 'WordPress', the hook function is an essential one. The following code is suggested for the htmLawed 'kses_hook()' in case of 'WordPress': 573 If the 'Kses' code has a non-empty hook function (e.g., 'wp_kses_hook()' in case of 'WordPress'), then the code for htmLawed's 'kses_hook()' function should be appropriately edited. However, the requirement of the hook function should be re-evaluated considering that htmLawed has extra capabilities. With 'WordPress', the hook function is an essential one. The following code is suggested for the htmLawed 'kses_hook()' in case of 'WordPress':
574 574
575 // kses compatibility 575 // kses compatibility
576 function kses_hook($string, &$cf, &$spec){ 576 function kses_hook($string, &$cf, &$spec){
577 $allowed_html = $spec; 577 $allowed_html = $spec;
578 $allowed_protocols = array(); 578 $allowed_protocols = array();
579 foreach($cf['schemes'] as $v){ 579 foreach($cf['schemes'] as $v){
580 foreach($v as $k2=>$v2){ 580 foreach($v as $k2=>$v2){
581 if(!in_array($k2, $allowed_protocols)){ 581 if(!in_array($k2, $allowed_protocols)){
582 $allowed_protocols[] = $k2; 582 $allowed_protocols[] = $k2;
583 } 583 }
584 } 584 }
585 } 585 }
586 return wp_kses_hook($string, $allowed_html, $allowed_protocols); 586 return wp_kses_hook($string, $allowed_html, $allowed_protocols);
587 } 587 }
588 588
589 589
590-- 2.7 Tolerance for ill-written HTML -----------------------------o 590-- 2.7 Tolerance for ill-written HTML -----------------------------o
591 591
592 592
593 htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be `read` as HTML, and may therefore get identified as mere plain text. Following statements indicate the degree of `looseness` that htmLawed can work with, and can be provided in instructions to writers: 593 htmLawed can work with ill-written HTML code in the input. However, HTML that is too ill-written may not be `read` as HTML, and may therefore get identified as mere plain text. Following statements indicate the degree of `looseness` that htmLawed can work with, and can be provided in instructions to writers:
594 594
595 * Tags must be flanked by '<' and '>' with no '>' inside -- any needed '>' should be put in as '&gt;'. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and '>', like '<div >' and '<img / >', but not after the '<'. 595 * Tags must be flanked by '<' and '>' with no '>' inside -- any needed '>' should be put in as '&gt;'. It is possible for tag content (element name and attributes) to be spread over many lines instead of being on one. A space may be present between the tag content and '>', like '<div >' and '<img / >', but not after the '<'.
596 596
597 * Element and attribute names need not be lower-cased. 597 * Element and attribute names need not be lower-cased.
598 598
599 * Attribute string of elements may be liberally spaced with tabs, line-breaks, etc. 599 * Attribute string of elements may be liberally spaced with tabs, line-breaks, etc.
600 600
601 * Attribute values may be single- and not double-quoted. 601 * Attribute values may be single- and not double-quoted.
602 602
603 * Left-padding of numeric entities (like, '&#0160;', '&#x07ff;') with '0' is okay as long as the number of characters between the '&' and the ';' does not exceed 8. All entities must end with ';' though. 603 * Left-padding of numeric entities (like, '&#0160;', '&#x07ff;') with '0' is okay as long as the number of characters between the '&' and the ';' does not exceed 8. All entities must end with ';' though.
604 604
605 * Named character entities must be properly cased. Thus, '&Lt;' or '&TILDE;' will not be recognized as entities and will be `neutralized`. 605 * Named character entities must be properly cased. Thus, '&Lt;' or '&TILDE;' will not be recognized as entities and will be `neutralized`.
606 606
607 * HTML comments should not be inside element tags (they can be between tags), and should begin with '<!--' and end with '-->'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any '-->' inside should be put in as '--&gt;'. Any '--' inside will be automatically converted to '-', and a space will be added before the '-->' comment-closing marker unless '$config["comments"]' is set to '4' (section:- #3.3.1). 607 * HTML comments should not be inside element tags (they can be between tags), and should begin with '<!--' and end with '-->'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any '-->' inside should be put in as '--&gt;'. Any '--' inside will be automatically converted to '-', and a space will be added before the '-->' comment-closing marker unless '$config["comments"]' is set to '4' (section:- #3.3.1).
608 608
609 * 'CDATA' sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with '<![CDATA[' and end with ']]>'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any ']]>' inside should be put in as ']]&gt;'. 609 * 'CDATA' sections should not be inside element tags, and can be in element content only if plain text is allowed for that element. They should begin with '<![CDATA[' and end with ']]>'. Characters like '<', '>', and '&' may be allowed inside depending on '$config', but any ']]>' inside should be put in as ']]&gt;'.
610 610
611 * For attribute values, character entities '&lt;', '&gt;' and '&amp;' should be used instead of characters '<' and '>', and '&' (when '&' is not part of a character entity). This applies even for Javascript code in values of attributes like 'onclick'. 611 * For attribute values, character entities '&lt;', '&gt;' and '&amp;' should be used instead of characters '<' and '>', and '&' (when '&' is not part of a character entity). This applies even for Javascript code in values of attributes like 'onclick'.
612 612
613 * Characters '<', '>', '&' and '"' that are part of actual Javascript, etc., code in 'script' elements should be used as such and not be put in as entities like '&gt;'. Otherwise, though the HTML will be valid, the code may fail to work. Further, if such characters have to be used, then they should be put inside 'CDATA' sections. 613 * Characters '<', '>', '&' and '"' that are part of actual Javascript, etc., code in 'script' elements should be used as such and not be put in as entities like '&gt;'. Otherwise, though the HTML will be valid, the code may fail to work. Further, if such characters have to be used, then they should be put inside 'CDATA' sections.
614 614
615 * Simple instructions like "an opening tag cannot be present between two closing tags" and "nested elements should be closed in the reverse order of how they were opened" can help authors write balanced HTML. If tags are imbalanced, htmLawed will try to balance them, but in the process, depending on '$config["keep_bad"]', some code/text may be lost. 615 * Simple instructions like "an opening tag cannot be present between two closing tags" and "nested elements should be closed in the reverse order of how they were opened" can help authors write balanced HTML. If tags are imbalanced, htmLawed will try to balance them, but in the process, depending on '$config["keep_bad"]', some code/text may be lost.
616 616
617 * Input authors should be notified of admin-specified allowed elements, attributes, configuration values (like conversion of named entities to numeric ones), etc. 617 * Input authors should be notified of admin-specified allowed elements, attributes, configuration values (like conversion of named entities to numeric ones), etc.
618 618
619 * With '$config["unique_ids"]' not '0' and the 'id' attribute being permitted, writers should carefully avoid using duplicate or invalid 'id' values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., '<a id="home"></a><input id="home" /><label for="home"></label>' is processed into 619 * With '$config["unique_ids"]' not '0' and the 'id' attribute being permitted, writers should carefully avoid using duplicate or invalid 'id' values as even though htmLawed will correct/remove the values, the final output may not be the one desired. E.g., '<a id="home"></a><input id="home" /><label for="home"></label>' is processed into
620'<a id="home"></a><input id="prefix_home" /><label for="home"></label>', so that the 'label' no longer refers to the 'input'. 620'<a id="home"></a><input id="prefix_home" /><label for="home"></label>', so that the 'label' no longer refers to the 'input'.
621 621
622 * Even if intended HTML is lost from an ill-written input, the processed output will be more secure and standard-compliant. 622 * Even if intended HTML is lost from an ill-written input, the processed output will be more secure and standard-compliant.
623 623
624 * For URLs, unless '$config["schemes"]' is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., 'htt&#112;' (which many browsers will read as the harmless 'http') may be considered bad by htmLawed. 624 * For URLs, unless '$config["schemes"]' is appropriately set, writers should avoid using escape characters or entities in schemes. E.g., 'htt&#112;' (which many browsers will read as the harmless 'http') may be considered bad by htmLawed.
625 625
626 * htmLawed will attempt to put plain text present directly inside 'blockquote', 'form', 'map' and 'noscript' elements (illegal as per the specifications) inside auto-generated 'div' elements during tag balancing (section:- #3.3.3). 626 * htmLawed will attempt to put plain text present directly inside 'blockquote', 'form', 'map' and 'noscript' elements (illegal as per the specifications) inside auto-generated 'div' elements during tag balancing (section:- #3.3.3).
627 627
628 628
629-- 2.8 Limitations & work-arounds ---------------------------------o 629-- 2.8 Limitations & work-arounds ---------------------------------o
630 630
631 631
632 htmLawed's main objective is to make the input text `more` standard-compliant, secure for readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with possible work-arounds. 632 htmLawed's main objective is to make the input text `more` standard-compliant, secure for readers, and free of HTML elements and attributes considered undesirable by the administrator. Some of its current limitations, regardless of this objective, are noted below along with possible work-arounds.
633 633
634 It should be borne in mind that no browser application is 100% standard-compliant, standard specifications continue to evolve, and many browsers accept commonly used non-standard HTML. Regarding security, note that `unsafe` HTML code is not legally invalid per se. 634 It should be borne in mind that no browser application is 100% standard-compliant, standard specifications continue to evolve, and many browsers accept commonly used non-standard HTML. Regarding security, note that `unsafe` HTML code is not legally invalid per se.
635 635
636 * By default, htmLawed will not strictly adhere to the `current` HTML standard. Admins can configure htmLawed to be more strict about standard compliance. Standard specification for HTML is continuously evolving. There are two bodies (W3C:- http://www.w3c.org and WHATWG:- http://www.whatwg.org) that specify the standard and their specifications are not identical. E.g., as of mid-2013, the 'border' attribute is valid in 'table' as per W3C but not WHATWG. Thus, htmLawed may not be fully compliant with the standard of a specific group. The HTML standards/rules that htmLawed uses in its logic are a mix of the W3C and WHATWG standards, and can be lax because of the laxity of HTML interpreters (browsers) regarding standards. 636 * By default, htmLawed will not strictly adhere to the `current` HTML standard. Admins can configure htmLawed to be more strict about standard compliance. Standard specification for HTML is continuously evolving. There are two bodies (W3C:- http://www.w3c.org and WHATWG:- http://www.whatwg.org) that specify the standard and their specifications are not identical. E.g., as of mid-2013, the 'border' attribute is valid in 'table' as per W3C but not WHATWG. Thus, htmLawed may not be fully compliant with the standard of a specific group. The HTML standards/rules that htmLawed uses in its logic are a mix of the W3C and WHATWG standards, and can be lax because of the laxity of HTML interpreters (browsers) regarding standards.
637 637
638 * In general, htmLawed processes input to generate output that is most likely to be standard-compatible in most users' browsers. Thus, for example, it does not enforce the required value of '0' on 'border' attribute of 'img' (an HTML version 5 specification). 638 * In general, htmLawed processes input to generate output that is most likely to be standard-compatible in most users' browsers. Thus, for example, it does not enforce the required value of '0' on 'border' attribute of 'img' (an HTML version 5 specification).
639 639
640 * htmLawed is meant for input that goes into the 'body' of HTML documents. HTML's head-level elements are not supported, nor are the frame-specific elements 'frameset', 'frame' and 'noframes'. However, content of the latter elements can be individually filtered through htmLawed. 640 * htmLawed is meant for input that goes into the 'body' of HTML documents. HTML's head-level elements are not supported, nor are the frame-specific elements 'frameset', 'frame' and 'noframes'. However, content of the latter elements can be individually filtered through htmLawed.
641 641
642 * It cannot handle input that has non-HTML code like 'SVG' and 'MathML'. One way around is to break the input into pieces and pass only those without non-HTML code to htmLawed. Another is described in section:- #3.9. A third way may be to somehow take advantage of the '$config["and_mark"]' parameter (see section:- #3.2). 642 * It cannot handle input that has non-HTML code like 'SVG' and 'MathML'. One way around is to break the input into pieces and pass only those without non-HTML code to htmLawed. Another is described in section:- #3.9. A third way may be to somehow take advantage of the '$config["and_mark"]' parameter (see section:- #3.2).
643 643
644 * By default, htmLawed won't check many attribute values for standard compliance. E.g., 'width="20m"' with the dimension in non-standard 'm' is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the 'hook_tag' parameter (section:- #3.4.9) or '$spec' to enforce finer checks on attribute values. 644 * By default, htmLawed won't check many attribute values for standard compliance. E.g., 'width="20m"' with the dimension in non-standard 'm' is let through. Implementing universal and strict attribute value checks can make htmLawed slow and resource-intensive. Admins should look at the 'hook_tag' parameter (section:- #3.4.9) or '$spec' to enforce finer checks on attribute values.
645 645
646 * By default, htmLawed considers all ARIA, data-*, event and microdata attributes as global attributes and permits them in all elements. This is not strictly standard-compliant. E.g., the 'itemtype' microdata attribute is permitted only in elements that also have the 'itemscope' attribute. Admins can configure htmLawed to be more strict about this (section:- #2.3). 646 * By default, htmLawed considers all ARIA, data-*, event and microdata attributes as global attributes and permits them in all elements. This is not strictly standard-compliant. E.g., the 'itemtype' microdata attribute is permitted only in elements that also have the 'itemscope' attribute. Admins can configure htmLawed to be more strict about this (section:- #2.3).
647 647
648 * The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported. However, '$spec' can be used to allow custom attributes (section:- #2.3). 648 * The attributes, deprecated (which can be transformed too) or not, that it supports are largely those that are in the specifications. Only a few of the proprietary attributes are supported. However, '$spec' can be used to allow custom attributes (section:- #2.3).
649 649
650 * Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the 'hook_tag' parameter (section:- #3.4.9) or '$spec' for finer checks. Perhaps the best option is to disallow 'style' but allow 'class' attributes with the right 'oneof' or 'match' values for 'class', and have the various class style properties in '.css' CSS stylesheet files. 650 * Except for contained URLs and dynamic expressions (also optional), htmLawed does not check CSS style property values. Admins should look at using the 'hook_tag' parameter (section:- #3.4.9) or '$spec' for finer checks. Perhaps the best option is to disallow 'style' but allow 'class' attributes with the right 'oneof' or 'match' values for 'class', and have the various class style properties in '.css' CSS stylesheet files.
651 651
652 * htmLawed does not parse emoticons, decode `BBcode`, or `wikify`, auto-converting text to proper HTML. Similarly, it won't convert line-breaks to 'br' elements. Such functions are beyond its purview. Admins should use other code to pre- or post-process the input for such purposes. 652 * htmLawed does not parse emoticons, decode `BBcode`, or `wikify`, auto-converting text to proper HTML. Similarly, it won't convert line-breaks to 'br' elements. Such functions are beyond its purview. Admins should use other code to pre- or post-process the input for such purposes.
653 653
654 * htmLawed cannot be used to have links force-opened in new windows (by auto-adding appropriate 'target' and 'onclick' attributes to 'a'). Admins should look at Javascript-based DOM-modifying solutions for this. Admins may also be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). 654 * htmLawed cannot be used to have links force-opened in new windows (by auto-adding appropriate 'target' and 'onclick' attributes to 'a'). Admins should look at Javascript-based DOM-modifying solutions for this. Admins may also be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
655 655
656 * Nesting-based checks are not possible. E.g., one cannot disallow 'p' elements specifically inside 'td' while permitting it elsewhere. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). 656 * Nesting-based checks are not possible. E.g., one cannot disallow 'p' elements specifically inside 'td' while permitting it elsewhere. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
657 657
658 * Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert 'http' to 'https'). Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). 658 * Except for optionally converting absolute or relative URLs to the other type, htmLawed will not alter URLs (e.g., to change the value of query strings or to convert 'http' to 'https'). Having absolute URLs may be a standard-requirement, e.g., when HTML is embedded in email messages, whereas altering URLs for other purposes is beyond htmLawed's goals. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
659 659
660 * Pairs of opening and closing tags that do not enclose any content (like '<em></em>') are not removed. This may be against the standard specification for certain elements (e.g., 'table'). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code. 660 * Pairs of opening and closing tags that do not enclose any content (like '<em></em>') are not removed. This may be against the standard specification for certain elements (e.g., 'table'). However, presence of such standard-incompliant code will not break the display or layout of content. Admins can also use simple regex-based code to filter out such code.
661 661
662 * htmLawed does not check for certain element orderings described in the standard specifications (e.g., in a 'table', 'tbody' is allowed before 'tfoot'). Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). 662 * htmLawed does not check for certain element orderings described in the standard specifications (e.g., in a 'table', 'tbody' is allowed before 'tfoot'). Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
663 663
664 * htmLawed does not check the number of nested elements. E.g., it will allow two 'caption' elements in a 'table' element, illegal as per standard specifications. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9). 664 * htmLawed does not check the number of nested elements. E.g., it will allow two 'caption' elements in a 'table' element, illegal as per standard specifications. Admins may be able to use a custom hook function to enforce such checks ('hook_tag' parameter; see section:- #3.4.9).
665 665
666 * There are multiple ways to interpret ill-written HTML. E.g., in '<small><small>text</small>', is it that the second closing tag for 'small' is missing or is it that the second opening tag for 'small' was put in by mistake? htmLawed corrects the HTML in the string assuming the former, while the user may have intended the string for the latter. This is an issue that is impossible to address perfectly. 666 * There are multiple ways to interpret ill-written HTML. E.g., in '<small><small>text</small>', is it that the second closing tag for 'small' is missing or is it that the second opening tag for 'small' was put in by mistake? htmLawed corrects the HTML in the string assuming the former, while the user may have intended the string for the latter. This is an issue that is impossible to address perfectly.
667 667
668 * htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers ('/*') in 'style' attribute values in order to detect malicious HTML like crafted, Internet Explorer browser-specific dynamic expressions like '&#101;xpression...'. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the 'hook_tag' parameter (section:- #3.4.9) to more specifically identify CSS expressions in the 'style' attribute values. Also, using '$config["style_pass"]', it is possible to have htmLawed pass 'style' attribute values without even looking at them (section:- #3.4.8). 668 * htmLawed might convert certain entities to actual characters and remove backslashes and CSS comment-markers ('/*') in 'style' attribute values in order to detect malicious HTML like crafted, Internet Explorer browser-specific dynamic expressions like '&#101;xpression...'. If this is too harsh, admins can allow CSS expressions through htmLawed core but then use a custom function through the 'hook_tag' parameter (section:- #3.4.9) to more specifically identify CSS expressions in the 'style' attribute values. Also, using '$config["style_pass"]', it is possible to have htmLawed pass 'style' attribute values without even looking at them (section:- #3.4.8).
669 669
670 * htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., '<a href="http://x%22+style=%22background-image:xss">x</a>'). These arise when browsers mis-identify markup in `escaped` text, defeating the very purpose of escaping text (a bad browser will read the given example as '<a href="http://x" style="background-image:xss">x</a>'). 670 * htmLawed does not correct certain possible attribute-based security vulnerabilities (e.g., '<a href="http://x%22+style=%22background-image:xss">x</a>'). These arise when browsers mis-identify markup in `escaped` text, defeating the very purpose of escaping text (a bad browser will read the given example as '<a href="http://x" style="background-image:xss">x</a>').
671 671
672 * Because of poor Unicode support in PHP, htmLawed does not remove the `high value` HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. (see section:- #3.1). 672 * Because of poor Unicode support in PHP, htmLawed does not remove the `high value` HTML-invalid characters with multi-byte code-points. Such characters however are extremely unlikely to be in the input. (see section:- #3.1).
673 673
674 * htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can permit an exploit (like Google's `UTF-7/XSS` vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect. 674 * htmLawed does not check or correct the character encoding of the input it receives. In conjunction with permitting circumstances such as when the character encoding is left undefined through HTTP headers or HTML 'meta' tags, this can permit an exploit (like Google's `UTF-7/XSS` vulnerability of the past). Also, htmLawed can mangle input text if it is not well-formed in terms of character encoding. Administrators can consider using code available elsewhere to check well-formedness of input text characters to correct any defect.
675 675
676 * htmLawed is expected to work with input texts in ASCII standard-compatible single-byte encodings such as national variants of ASCII (like ISO-646-DE/German of the ISO 646 standard), extended ASCII variants (like ISO 8859-10/Turkish of the ISO 8859/ISO Latin standard), ISO 8859-based Windows variants (like Windows 1252), EBCDIC, Shift JIS (Japanese), GB-Roman (Chinese), and KS-Roman (Korean). It should also properly handle texts with variable-byte encodings like UTF-7 (Unicode) and UTF-8 (Unicode). However, htmLawed may mangle input texts with double-byte encodings like UTF-16 (Unicode), JIS X 0208:1997 (Japanese) and K SX 1001:1992 (Korean), or the UTF-32 (Unicode) quadruple-byte encoding. If an input text has such an encoding, administrators can use PHP's iconv:- http://php.net/manual/en/book.iconv.php functions, or some other means, to convert text to UTF-8 before passing it to htmLawed. 676 * htmLawed is expected to work with input texts in ASCII standard-compatible single-byte encodings such as national variants of ASCII (like ISO-646-DE/German of the ISO 646 standard), extended ASCII variants (like ISO 8859-10/Turkish of the ISO 8859/ISO Latin standard), ISO 8859-based Windows variants (like Windows 1252), EBCDIC, Shift JIS (Japanese), GB-Roman (Chinese), and KS-Roman (Korean). It should also properly handle texts with variable-byte encodings like UTF-7 (Unicode) and UTF-8 (Unicode). However, htmLawed may mangle input texts with double-byte encodings like UTF-16 (Unicode), JIS X 0208:1997 (Japanese) and K SX 1001:1992 (Korean), or the UTF-32 (Unicode) quadruple-byte encoding. If an input text has such an encoding, administrators can use PHP's iconv:- http://php.net/manual/en/book.iconv.php functions, or some other means, to convert text to UTF-8 before passing it to htmLawed.
677 677
678 * Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts. 678 * Like any script using PHP's PCRE regex functions, PHP setup-specific low PCRE limit values can cause htmLawed to at least partially fail with very long input texts.
679 679
680 680
681-- 2.9 Examples of usage ------------------------------------------o 681-- 2.9 Examples of usage ------------------------------------------o
682 682
683 683
684 Safest, allowing only `safe` HTML markup -- 684 Safest, allowing only `safe` HTML markup --
685 685
686 $config = array('safe'=>1); 686 $config = array('safe'=>1);
687 $out = htmLawed($in, $config); 687 $out = htmLawed($in, $config);
688 688
689 Simplest, allowing all valid HTML markup including Javascript -- 689 Simplest, allowing all valid HTML markup including Javascript --
690 690
691 $out = htmLawed($in); 691 $out = htmLawed($in);
692 692
693 Allowing all valid HTML markup but restricting URL schemes in 'src' attribute values to 'http' and 'https' -- 693 Allowing all valid HTML markup but restricting URL schemes in 'src' attribute values to 'http' and 'https' --
694 694
695 $config = array('schemes'=>'*:*; src:http, https'); 695 $config = array('schemes'=>'*:*; src:http, https');
696 $out = htmLawed($in, $config); 696 $out = htmLawed($in, $config);
697 697
698 Allowing only 'safe' HTML and the elements 'a', 'em', and 'strong' -- 698 Allowing only 'safe' HTML and the elements 'a', 'em', and 'strong' --
699 699
700 $config = array('safe'=>1, 'elements'=>'a, em, strong'); 700 $config = array('safe'=>1, 'elements'=>'a, em, strong');
701 $out = htmLawed($in, $config); 701 $out = htmLawed($in, $config);
702 702
703 Not allowing elements 'script' and 'object' -- 703 Not allowing elements 'script' and 'object' --
704 704
705 $config = array('elements'=>'* -script -object'); 705 $config = array('elements'=>'* -script -object');
706 $out = htmLawed($in, $config); 706 $out = htmLawed($in, $config);
707 707
708 Not allowing attributes 'id' and 'style' -- 708 Not allowing attributes 'id' and 'style' --
709 709
710 $config = array('deny_attribute'=>'id, style'); 710 $config = array('deny_attribute'=>'id, style');
711 $out = htmLawed($in, $config); 711 $out = htmLawed($in, $config);
712 712
713 Permitting only attributes 'title' and 'href' -- 713 Permitting only attributes 'title' and 'href' --
714 714
715 $config = array('deny_attribute'=>'* -title -href'); 715 $config = array('deny_attribute'=>'* -title -href');
716 $out = htmLawed($in, $config); 716 $out = htmLawed($in, $config);
717 717
718 Remove bad/disallowed tags altogether instead of converting them to entities -- 718 Remove bad/disallowed tags altogether instead of converting them to entities --
719 719
720 $config = array('keep_bad'=>0); 720 $config = array('keep_bad'=>0);
721 $out = htmLawed($in, $config); 721 $out = htmLawed($in, $config);
722 722
723 Allowing attribute 'title' only in 'a' and not allowing attributes 'id', 'style', or scriptable `on*` attributes like 'onclick' -- 723 Allowing attribute 'title' only in 'a' and not allowing attributes 'id', 'style', or scriptable `on*` attributes like 'onclick' --
724 724
725 $config = array('deny_attribute'=>'title, id, style, on*'); 725 $config = array('deny_attribute'=>'title, id, style, on*');
726 $spec = 'a=title'; 726 $spec = 'a=title';
727 $out = htmLawed($in, $config, $spec); 727 $out = htmLawed($in, $config, $spec);
728 728
729 Allowing a custom attribute, 'vFlag', in 'img' and permitting custom use of the standard attribute, 'rel', in 'input' -- 729 Allowing a custom attribute, 'vFlag', in 'img' and permitting custom use of the standard attribute, 'rel', in 'input' --
730 730
731 $spec = 'img=vFlag; input=rel'; 731 $spec = 'img=vFlag; input=rel';
732 $out = htmLawed($in, $config, $spec); 732 $out = htmLawed($in, $config, $spec);
733 733
734 Some case-studies are presented below. 734 Some case-studies are presented below.
735 735
736 *1.* A blog administrator wants to allow only 'a', 'em', 'strike', 'strong' and 'u' in comments, but needs 'strike' and 'u' transformed to 'span' for better XHTML 1-strict compliance, and, he wants the 'a' links to point only to 'http' or 'https' resources: 736 *1.* A blog administrator wants to allow only 'a', 'em', 'strike', 'strong' and 'u' in comments, but needs 'strike' and 'u' transformed to 'span' for better XHTML 1-strict compliance, and, he wants the 'a' links to point only to 'http' or 'https' resources:
737 737
738 $processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href'); 738 $processed = htmLawed($in, array('elements'=>'a, em, strike, strong, u', 'make_tag_strict'=>1, 'safe'=>1, 'schemes'=>'*:http, https'), 'a=href');
739 739
740 *2.* An author uses a custom-made web application to load content on his website. He is the only one using that application and the content he generates has all types of HTML, including scripts. The web application uses htmLawed primarily as a tool to correct errors that creep in while writing HTML and to take care of the occasional `bad` characters in copy-paste text introduced by Microsoft Office. The web application provides a preview before submitted input is added to the content. For the previewing process, htmLawed is set up as follows: 740 *2.* An author uses a custom-made web application to load content on his website. He is the only one using that application and the content he generates has all types of HTML, including scripts. The web application uses htmLawed primarily as a tool to correct errors that creep in while writing HTML and to take care of the occasional `bad` characters in copy-paste text introduced by Microsoft Office. The web application provides a preview before submitted input is added to the content. For the previewing process, htmLawed is set up as follows:
741 741
742 $processed = htmLawed($in, array('css_expression'=>1, 'keep_bad'=>1, 'make_tag_strict'=>1, 'schemes'=>'*:*', 'valid_xhtml'=>1)); 742 $processed = htmLawed($in, array('css_expression'=>1, 'keep_bad'=>1, 'make_tag_strict'=>1, 'schemes'=>'*:*', 'valid_xhtml'=>1));
743 743
744 For the final submission process, 'keep_bad' is set to '6'. A value of '1' for the preview process allows the author to note and correct any HTML mistake without losing any of the typed text. 744 For the final submission process, 'keep_bad' is set to '6'. A value of '1' for the preview process allows the author to note and correct any HTML mistake without losing any of the typed text.
745 745
746 *3.* A data-miner is scraping information in a specific table of similar web-pages and is collating the data rows, and uses htmLawed to reduce unnecessary markup and white-spaces: 746 *3.* A data-miner is scraping information in a specific table of similar web-pages and is collating the data rows, and uses htmLawed to reduce unnecessary markup and white-spaces:
747 747
748 $processed = htmLawed($in, array('elements'=>'tr, td', 'tidy'=>-1), 'tr, td ='); 748 $processed = htmLawed($in, array('elements'=>'tr, td', 'tidy'=>-1), 'tr, td =');
749 749
750 750
751== 3 Details =====================================================oo 751== 3 Details =====================================================oo
752 752
753 753
754-- 3.1 Invalid/dangerous characters -------------------------------- 754-- 3.1 Invalid/dangerous characters --------------------------------
755 755
756 756
757 Valid characters (more correctly, their code-points) in HTML or XML are, hexadecimally, '9', 'a', 'd', '20' to 'd7ff', and 'e000' to '10ffff', except 'fffe' and 'ffff' (decimally, '9', '10', '13', '32' to '55295', and '57344' to '1114111', except '65534' and '65535'). htmLawed removes the invalid characters '0' to '8', 'b', 'c', and 'e' to '1f'. 757 Valid characters (more correctly, their code-points) in HTML or XML are, hexadecimally, '9', 'a', 'd', '20' to 'd7ff', and 'e000' to '10ffff', except 'fffe' and 'ffff' (decimally, '9', '10', '13', '32' to '55295', and '57344' to '1114111', except '65534' and '65535'). htmLawed removes the invalid characters '0' to '8', 'b', 'c', and 'e' to '1f'.
758 758
759 Because of PHP's poor native support for multi-byte characters, htmLawed cannot check for the remaining invalid code-points. However, for various reasons, it is very unlikely for any of those characters to be in the input. 759 Because of PHP's poor native support for multi-byte characters, htmLawed cannot check for the remaining invalid code-points. However, for various reasons, it is very unlikely for any of those characters to be in the input.
760 760
761 Characters that are discouraged (see section:- #5.1) but not invalid are not removed by htmLawed. 761 Characters that are discouraged (see section:- #5.1) but not invalid are not removed by htmLawed.
762 762
763 It (function 'hl_tag()') also replaces the potentially dangerous (in some Mozilla [Firefox] and Opera browsers) soft-hyphen character (code-point, hexadecimally, 'ad', or decimally, '173') in attribute values with spaces. Where required, the characters '<', '>', '&', and '"' are converted to entities. 763 It (function 'hl_tag()') also replaces the potentially dangerous (in some Mozilla [Firefox] and Opera browsers) soft-hyphen character (code-point, hexadecimally, 'ad', or decimally, '173') in attribute values with spaces. Where required, the characters '<', '>', '&', and '"' are converted to entities.
764 764
765 With '$config["clean_ms_char"]' set as '1' or '2', many of the discouraged characters (decimal code-points '127' to '159' except '133') that many Microsoft applications incorrectly use (as per the 'Windows 1252' ['Cp-1252'] or a similar encoding system), and the character for decimal code-point '133', are converted to appropriate decimal numerical entities (or removed for a few cases)-- see appendix in section:- #5.4. This can help avoid some display issues arising from copying-pasting of content. 765 With '$config["clean_ms_char"]' set as '1' or '2', many of the discouraged characters (decimal code-points '127' to '159' except '133') that many Microsoft applications incorrectly use (as per the 'Windows 1252' ['Cp-1252'] or a similar encoding system), and the character for decimal code-point '133', are converted to appropriate decimal numerical entities (or removed for a few cases)-- see appendix in section:- #5.4. This can help avoid some display issues arising from copying-pasting of content.
766 766
767 With '$config["clean_ms_char"]' set as '2', characters for the hexadecimal code-points '82', '91', and '92' (for special single-quotes), and '84', '93', and '94' (for special double-quotes) are converted to ordinary single and double quotes respectively and not to entities. 767 With '$config["clean_ms_char"]' set as '2', characters for the hexadecimal code-points '82', '91', and '92' (for special single-quotes), and '84', '93', and '94' (for special double-quotes) are converted to ordinary single and double quotes respectively and not to entities.
768 768
769 The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text. 769 The character values are replaced with entities/characters and not character values referred to by the entities/characters to keep this task independent of the character-encoding of input text.
770 770
771 The '$config["clean_ms_char"]' parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the 'Windows 1252' ('Cp-1252') or a similar encoding like 'Cp-1251' (otherwise, for example when UTF-8 encoding is in use, Japanese or Korean characters can get mangled). Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up. 771 The '$config["clean_ms_char"]' parameter should not be used if authors do not copy-paste Microsoft-created text, or if the input text is not believed to use the 'Windows 1252' ('Cp-1252') or a similar encoding like 'Cp-1251' (otherwise, for example when UTF-8 encoding is in use, Japanese or Korean characters can get mangled). Further, the input form and the web-pages displaying it or its content should have the character encoding appropriately marked-up.
772 772
773 773
774-- 3.2 Character references/entities ------------------------------o 774-- 3.2 Character references/entities ------------------------------o
775 775
776 776
777 Valid character entities take the form '&*;' where '*' is '#x' followed by a hexadecimal number (hexadecimal numeric entity; like '&#xA0;' for non-breaking space), or alphanumeric like 'gt' (external or named entity; like '&nbsp;' for non-breaking space), or '#' followed by a number (decimal numeric entity; like '&#160;' for non-breaking space). Character entities referring to the soft-hyphen character (the '&shy;' or '\xad' character; hexadecimal code-point 'ad' [decimal '173']) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers. 777 Valid character entities take the form '&*;' where '*' is '#x' followed by a hexadecimal number (hexadecimal numeric entity; like '&#xA0;' for non-breaking space), or alphanumeric like 'gt' (external or named entity; like '&nbsp;' for non-breaking space), or '#' followed by a number (decimal numeric entity; like '&#160;' for non-breaking space). Character entities referring to the soft-hyphen character (the '&shy;' or '\xad' character; hexadecimal code-point 'ad' [decimal '173']) in URL-accepting attribute values are always replaced with spaces; soft-hyphens in attribute values introduce vulnerabilities in some older versions of the Opera and Mozilla [Firefox] browsers.
778 778
779 htmLawed (function 'hl_ent()'): 779 htmLawed (function 'hl_ent()'):
780 780
781 * Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous) 781 * Neutralizes entities with multiple leading zeroes or missing semi-colons (potentially dangerous)
782 782
783 * Lowercases the 'X' (for XML-compliance) and 'A-F' of hexadecimal numeric entities 783 * Lowercases the 'X' (for XML-compliance) and 'A-F' of hexadecimal numeric entities
784 784
785 * Neutralizes entities referring to characters that are HTML-invalid (see section:- #3.1) 785 * Neutralizes entities referring to characters that are HTML-invalid (see section:- #3.1)
786 786
787 * Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, '7f' to '84', '86' to '9f', and 'fdd0' to 'fddf', or decimally, '127' to '132', '134' to '159', and '64976' to '64991'). Entities referring to the remaining discouraged characters (see section:- #5.1 for a full list) are let through. 787 * Neutralizes entities referring to characters that are HTML-discouraged (code-points, hexadecimally, '7f' to '84', '86' to '9f', and 'fdd0' to 'fddf', or decimally, '127' to '132', '134' to '159', and '64976' to '64991'). Entities referring to the remaining discouraged characters (see section:- #5.1 for a full list) are let through.
788 788
789 * Neutralizes named entities that are not in the specifications 789 * Neutralizes named entities that are not in the specifications
790 790
791 * Optionally converts valid HTML-specific named entities except '&gt;', '&lt;', '&quot;', and '&amp;' to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is '2') for generic XML-compliance. For this, '$config["named_entity"]' should be '1'. 791 * Optionally converts valid HTML-specific named entities except '&gt;', '&lt;', '&quot;', and '&amp;' to decimal numeric ones (hexadecimal if $config["hexdec_entity"] is '2') for generic XML-compliance. For this, '$config["named_entity"]' should be '1'.
792 792
793 * Optionally converts hexadecimal numeric entities to the more widely supported decimal ones. For this, '$config["hexdec_entity"]' should be '0'. 793 * Optionally converts hexadecimal numeric entities to the more widely supported decimal ones. For this, '$config["hexdec_entity"]' should be '0'.
794 794
795 * Optionally converts decimal numeric entities to the hexadecimal ones. For this, '$config["hexdec_entity"]' should be '2'. 795 * Optionally converts decimal numeric entities to the hexadecimal ones. For this, '$config["hexdec_entity"]' should be '2'.
796 796
797 `Neutralization` refers to the `entitification` of '&' to '&amp;'. 797 `Neutralization` refers to the `entitification` of '&' to '&amp;'.
798 798
799 *Note*: htmLawed does not convert entities to the actual characters represented by them; one can pass the htmLawed output through PHP's 'html_entity_decode' function:- http://www.php.net/html_entity_decode for that. 799 *Note*: htmLawed does not convert entities to the actual characters represented by them; one can pass the htmLawed output through PHP's 'html_entity_decode' function:- http://www.php.net/html_entity_decode for that.
800 800
801 *Note*: If '$config["and_mark"]' is set, and set to a value other than '0', then the '&' characters in the original input are replaced with the control character for the hexadecimal code-point '6' ('\x06'; '&' characters introduced by htmLawed, e.g., after converting '<' to '&lt;', are not affected). This allows one to distinguish, say, an '&gt;' introduced by htmLawed and an '&gt;' put in by the input writer, and can be helpful in further processing of the htmLawed-processed text (e.g., to identify the character sequence 'o(><)o' to generate an emoticon image). When this feature is active, admins should ensure that the htmLawed output is not directly used in web pages or XML documents as the presence of the '\x06' can break documents. Before use in such documents, and preferably before any storage, any remaining '\x06' should be changed back to '&', e.g., with: 801 *Note*: If '$config["and_mark"]' is set, and set to a value other than '0', then the '&' characters in the original input are replaced with the control character for the hexadecimal code-point '6' ('\x06'; '&' characters introduced by htmLawed, e.g., after converting '<' to '&lt;', are not affected). This allows one to distinguish, say, an '&gt;' introduced by htmLawed and an '&gt;' put in by the input writer, and can be helpful in further processing of the htmLawed-processed text (e.g., to identify the character sequence 'o(><)o' to generate an emoticon image). When this feature is active, admins should ensure that the htmLawed output is not directly used in web pages or XML documents as the presence of the '\x06' can break documents. Before use in such documents, and preferably before any storage, any remaining '\x06' should be changed back to '&', e.g., with:
802 802
803 $final = str_replace("\x06", '&', $prelim); 803 $final = str_replace("\x06", '&', $prelim);
804 804
805 Also, see section:- #3.9. 805 Also, see section:- #3.9.
806 806
807 807
808-- 3.3 HTML elements ----------------------------------------------o 808-- 3.3 HTML elements ----------------------------------------------o
809 809
810 810
811 htmLawed can be configured to allow only certain HTML elements (tags) in the input. Disallowed elements (just tag-content, and not element-content), based on '$config["keep_bad"]', are either `neutralized` (converted to plain text by entitification of '<' and '>') or removed. 811 htmLawed can be configured to allow only certain HTML elements (tags) in the input. Disallowed elements (just tag-content, and not element-content), based on '$config["keep_bad"]', are either `neutralized` (converted to plain text by entitification of '<' and '>') or removed.
812 812
813 E.g., with only 'em' permitted: 813 E.g., with only 'em' permitted:
814 814
815 Input: 815 Input:
816 816
817 <em>My</em> website is <a href="http://a.com>a.com</a>. 817 <em>My</em> website is <a href="http://a.com>a.com</a>.
818 818
819 Output, with '$config["keep_bad"] = 0': 819 Output, with '$config["keep_bad"] = 0':
820 820
821 <em>My</em> website is a.com. 821 <em>My</em> website is a.com.
822 822
823 Output, with '$config["keep_bad"]' not '0': 823 Output, with '$config["keep_bad"]' not '0':
824 824
825 <em>My</em> website is &lt;a href=""&gt;a.com&lt;/a&gt;. 825 <em>My</em> website is &lt;a href=""&gt;a.com&lt;/a&gt;.
826 826
827 See section:- #3.3.3 for differences between the various non-zero '$config["keep_bad"]' values. 827 See section:- #3.3.3 for differences between the various non-zero '$config["keep_bad"]' values.
828 828
829 htmLawed by default permits these 118 HTML elements: 829 htmLawed by default permits these 118 HTML elements:
830 830
831 a, abbr, acronym, address, applet, area, article, aside, audio, b, bdi, bdo, big, blockquote, br, button, canvas, caption, center, cite, code, col, colgroup, command, data, datalist, dd, del, details, dfn, dir, div, dl, dt, em, embed, fieldset, figcaption, figure, font, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, i, iframe, img, input, ins, isindex, kbd, keygen, label, legend, li, link, main, map, mark, menu, meta, meter, nav, noscript, object, ol, optgroup, option, output, p, param, pre, progress, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, section, select, small, source, span, strike, strong, style, sub, summary, sup, table, tbody, td, textarea, tfoot, th, thead, time, tr, track, tt, u, ul, var, video, wbr 831 a, abbr, acronym, address, applet, area, article, aside, audio, b, bdi, bdo, big, blockquote, br, button, canvas, caption, center, cite, code, col, colgroup, command, data, datalist, dd, del, details, dfn, dir, div, dl, dt, em, embed, fieldset, figcaption, figure, font, footer, form, h1, h2, h3, h4, h5, h6, header, hgroup, hr, i, iframe, img, input, ins, isindex, kbd, keygen, label, legend, li, link, main, map, mark, menu, meta, meter, nav, noscript, object, ol, optgroup, option, output, p, param, pre, progress, q, rb, rbc, rp, rt, rtc, ruby, s, samp, script, section, select, small, source, span, strike, strong, style, sub, summary, sup, table, tbody, td, textarea, tfoot, th, thead, time, tr, track, tt, u, ul, var, video, wbr
832 832
833 The HTML version 4 elements 'acronym', 'applet', 'big', 'center', 'dir', 'font', 'strike', and 'tt' are obsolete/deprecated in HTML version 5. On the other hand, the obsolete/deprecated HTML 4 elements 'embed', 'menu' and 'u' are no longer so in HTML 5. Elements new to HTML 5 are 'article', 'aside', 'audio', 'bdi', 'canvas', 'command', 'data', 'datalist', 'details', 'figure', 'figcaption', 'footer', 'header', 'hgroup', 'keygen', 'link', 'main', 'mark', 'meta', 'meter', 'nav', 'output', 'progress', 'section', 'source', 'style', 'summary', 'time', 'track', 'video', and 'wbr'. The 'link', 'meta' and 'style' elements exist in HTML 4 but are not allowed in the HTML body. These 16 elements are `empty` elements that have an opening tag with possible content but no element content (thus, no closing tag): 'area', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', and 'wbr'. 833 The HTML version 4 elements 'acronym', 'applet', 'big', 'center', 'dir', 'font', 'strike', and 'tt' are obsolete/deprecated in HTML version 5. On the other hand, the obsolete/deprecated HTML 4 elements 'embed', 'menu' and 'u' are no longer so in HTML 5. Elements new to HTML 5 are 'article', 'aside', 'audio', 'bdi', 'canvas', 'command', 'data', 'datalist', 'details', 'figure', 'figcaption', 'footer', 'header', 'hgroup', 'keygen', 'link', 'main', 'mark', 'meta', 'meter', 'nav', 'output', 'progress', 'section', 'source', 'style', 'summary', 'time', 'track', 'video', and 'wbr'. The 'link', 'meta' and 'style' elements exist in HTML 4 but are not allowed in the HTML body. These 16 elements are `empty` elements that have an opening tag with possible content but no element content (thus, no closing tag): 'area', 'br', 'col', 'command', 'embed', 'hr', 'img', 'input', 'isindex', 'keygen', 'link', 'meta', 'param', 'source', 'track', and 'wbr'.
834 834
835 With '$config["safe"] = 1', the default set will exclude 'applet', 'audio', 'canvas', 'embed', 'iframe', 'object', 'script' and 'video'; see section:- #3.6. 835 With '$config["safe"] = 1', the default set will exclude 'applet', 'audio', 'canvas', 'embed', 'iframe', 'object', 'script' and 'video'; see section:- #3.6.
836 836
837 When '$config["elements"]', which specifies allowed elements, is `properly` defined, and neither empty nor set to '0' or '*', the default set is not used. To have elements added to or removed from the default set, a '+/-' notation is used. E.g., '*-script-object' implies that only 'script' and 'object' are disallowed, whereas '*+noembed' means that 'noembed' is also allowed. Elements can also be specified as comma separated names. E.g., 'a, b, i' means only 'a', 'b' and 'i' are permitted. In this notation, '*', '+' and '-' have no significance and can actually cause a mis-reading. 837 When '$config["elements"]', which specifies allowed elements, is `properly` defined, and neither empty nor set to '0' or '*', the default set is not used. To have elements added to or removed from the default set, a '+/-' notation is used. E.g., '*-script-object' implies that only 'script' and 'object' are disallowed, whereas '*+noembed' means that 'noembed' is also allowed. Elements can also be specified as comma separated names. E.g., 'a, b, i' means only 'a', 'b' and 'i' are permitted. In this notation, '*', '+' and '-' have no significance and can actually cause a mis-reading.
838 838
839 Some more examples of '$config["elements"]' values indicating permitted elements (note that empty spaces are liberally allowed for clarity): 839 Some more examples of '$config["elements"]' values indicating permitted elements (note that empty spaces are liberally allowed for clarity):
840 840
841 * 'a, blockquote, code, em, strong' -- only 'a', 'blockquote', 'code', 'em', and 'strong' 841 * 'a, blockquote, code, em, strong' -- only 'a', 'blockquote', 'code', 'em', and 'strong'
842 * '*-script' -- all excluding 'script' 842 * '*-script' -- all excluding 'script'
843 * '* -acronym -big -center -dir -font -isindex -s -strike -tt' -- only non-obsolete/deprecated elements of HTML5 843 * '* -acronym -big -center -dir -font -isindex -s -strike -tt' -- only non-obsolete/deprecated elements of HTML5
844 * '*+noembed-script' -- all including 'noembed' excluding 'script' 844 * '*+noembed-script' -- all including 'noembed' excluding 'script'
845 845
846 Some mis-usages (and the resulting permitted elements) that can be avoided: 846 Some mis-usages (and the resulting permitted elements) that can be avoided:
847 847
848 * '-*' -- none; instead of htmLawed, one might just use, e.g., the 'htmlspecialchars()' PHP function 848 * '-*' -- none; instead of htmLawed, one might just use, e.g., the 'htmlspecialchars()' PHP function
849 * '*, -script' -- all except 'script'; admin probably meant '*-script' 849 * '*, -script' -- all except 'script'; admin probably meant '*-script'
850 * '-*, a, em, strong' -- all; admin probably meant 'a, em, strong' 850 * '-*, a, em, strong' -- all; admin probably meant 'a, em, strong'
851 * '*' -- all; admin need not have set 'elements' 851 * '*' -- all; admin need not have set 'elements'
852 * '*-form+form' -- all; a '+' will always over-ride any '-' 852 * '*-form+form' -- all; a '+' will always over-ride any '-'
853 * '*, noembed' -- only 'noembed'; admin probably meant '*+noembed' 853 * '*, noembed' -- only 'noembed'; admin probably meant '*+noembed'
854 * 'a, +b, i' -- only 'a' and 'i'; admin probably meant 'a, b, i' 854 * 'a, +b, i' -- only 'a' and 'i'; admin probably meant 'a, b, i'
855 855
856 Basically, when using the '+/-' notation, commas (',') should not be used, and vice versa, and '*' should be used with the former but not the latter. 856 Basically, when using the '+/-' notation, commas (',') should not be used, and vice versa, and '*' should be used with the former but not the latter.
857 857
858 *Note*: Even if an element that is not in the default set is allowed through '$config["elements"]', like 'noembed' in the last example, it will eventually be removed during tag balancing unless such balancing is turned off ('$config["balance"]' set to '0'). Currently, the only way around this, which actually is simple, is to edit htmLawed's PHP code which defines various arrays in the function 'hl_bal()' to accommodate the element and its nesting properties. 858 *Note*: Even if an element that is not in the default set is allowed through '$config["elements"]', like 'noembed' in the last example, it will eventually be removed during tag balancing unless such balancing is turned off ('$config["balance"]' set to '0'). Currently, the only way around this, which actually is simple, is to edit htmLawed's PHP code which defines various arrays in the function 'hl_bal()' to accommodate the element and its nesting properties.
859 859
860 A possible second way to specify allowed elements is to set '$config["parent"]' to an element name that supposedly will hold the input, and to set '$config["balance"]' to '1'. During tag balancing (see section:- #3.3.3), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to 'div' if '$config["parent"]' is empty, 'body', or an element not in htmLawed's default set of 118 elements. 860 A possible second way to specify allowed elements is to set '$config["parent"]' to an element name that supposedly will hold the input, and to set '$config["balance"]' to '1'. During tag balancing (see section:- #3.3.3), all elements that cannot legally nest inside the parent element will be removed. The parent element is auto-reset to 'div' if '$config["parent"]' is empty, 'body', or an element not in htmLawed's default set of 118 elements.
861 861
862 `Tag transformation` is possible for improving compliance with HTML standards -- most of the obsolete/deprecated elements of HTML version 5 are converted to valid ones; see section:- #3.3.2. 862 `Tag transformation` is possible for improving compliance with HTML standards -- most of the obsolete/deprecated elements of HTML version 5 are converted to valid ones; see section:- #3.3.2.
863 863
864 864
865.. 3.3.1 Handling of comments & CDATA sections ..................... 865.. 3.3.1 Handling of comments & CDATA sections .....................
866 866
867 867
868 'CDATA' sections have the format '<![CDATA[...anything but not "]]>"...]]>', and HTML comments, '<!--...anything but not "-->"... -->'. Neither HTML comments nor 'CDATA' sections can reside inside tags. HTML comments can exist anywhere else, but 'CDATA' sections can exist only where plain text is allowed (e.g., immediately inside 'td' element content but not immediately inside 'tr' element content). 868 'CDATA' sections have the format '<![CDATA[...anything but not "]]>"...]]>', and HTML comments, '<!--...anything but not "-->"... -->'. Neither HTML comments nor 'CDATA' sections can reside inside tags. HTML comments can exist anywhere else, but 'CDATA' sections can exist only where plain text is allowed (e.g., immediately inside 'td' element content but not immediately inside 'tr' element content).
869 869
870 htmLawed (function 'hl_cmtcd()') handles HTML comments or 'CDATA' sections depending on the values of '$config["comment"]' or '$config["cdata"]'. If '0', such markup is not looked for and the text is processed like plain text. If '1', it is removed completely. If '2', it is preserved but any '<', '>' and '&' inside are changed to entities. If '3' for '$config["cdata"]', or '3' or '4' for '$config["comment"]', they are left as such. When '$config["comment"]' is set to '4', htmLawed will not force a space character before the '-->' comment-closing marker. While such a space is required for standard-compliance, it can corrupt marker code put in HTML by some software (such as Microsoft Outlook). 870 htmLawed (function 'hl_cmtcd()') handles HTML comments or 'CDATA' sections depending on the values of '$config["comment"]' or '$config["cdata"]'. If '0', such markup is not looked for and the text is processed like plain text. If '1', it is removed completely. If '2', it is preserved but any '<', '>' and '&' inside are changed to entities. If '3' for '$config["cdata"]', or '3' or '4' for '$config["comment"]', they are left as such. When '$config["comment"]' is set to '4', htmLawed will not force a space character before the '-->' comment-closing marker. While such a space is required for standard-compliance, it can corrupt marker code put in HTML by some software (such as Microsoft Outlook).
871 871
872 Note that for the last two cases, HTML comments and 'CDATA' sections will always be removed from tag content (function 'hl_tag()'). 872 Note that for the last two cases, HTML comments and 'CDATA' sections will always be removed from tag content (function 'hl_tag()').
873 873
874 Examples: 874 Examples:
875 875
876 Input: 876 Input:
877 <!-- home link--><a href="home.htm"><![CDATA[x=&y]]>Home</a> 877 <!-- home link--><a href="home.htm"><![CDATA[x=&y]]>Home</a>
878 Output ('$config["comment"] = 0, $config["cdata"] = 2'): 878 Output ('$config["comment"] = 0, $config["cdata"] = 2'):
879 &lt;!-- home link--&gt;<a href="home.htm"><![CDATA[x=&amp;y]]>Home</a> 879 &lt;!-- home link--&gt;<a href="home.htm"><![CDATA[x=&amp;y]]>Home</a>
880 Output ('$config["comment"] = 1, $config["cdata"] = 2'): 880 Output ('$config["comment"] = 1, $config["cdata"] = 2'):
881 <a href="home.htm"><![CDATA[x=&amp;y]]>Home</a> 881 <a href="home.htm"><![CDATA[x=&amp;y]]>Home</a>
882 Output ('$config["comment"] = 2, $config["cdata"] = 2'): 882 Output ('$config["comment"] = 2, $config["cdata"] = 2'):
883 <!-- home link --><a href="home.htm"><![CDATA[x=&amp;y]]>Home</a> 883 <!-- home link --><a href="home.htm"><![CDATA[x=&amp;y]]>Home</a>
884 Output ('$config["comment"] = 2, $config["cdata"] = 1'): 884 Output ('$config["comment"] = 2, $config["cdata"] = 1'):
885 <!-- home link --><a href="home.htm">Home</a> 885 <!-- home link --><a href="home.htm">Home</a>
886 Output ('$config["comment"] = 3, $config["cdata"] = 3'): 886 Output ('$config["comment"] = 3, $config["cdata"] = 3'):
887 <!-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a> 887 <!-- home link --><a href="home.htm"><![CDATA[x=&y]]>Home</a>
888 Output ('$config["comment"] = 4, $config["cdata"] = 3'): 888 Output ('$config["comment"] = 4, $config["cdata"] = 3'):
889 <!-- home link--><a href="home.htm"><![CDATA[x=&y]]>Home</a> 889 <!-- home link--><a href="home.htm"><![CDATA[x=&y]]>Home</a>
890 890
891 For standard-compliance, comments are given the form '<!--comment -->', and any '--' in the content is made '-'. When '$config["comment"]' is set to '4', htmLawed will not force a space character before the '-->' comment-closing marker. 891 For standard-compliance, comments are given the form '<!--comment -->', and any '--' in the content is made '-'. When '$config["comment"]' is set to '4', htmLawed will not force a space character before the '-->' comment-closing marker.
892 892
893 When '$config["safe"] = 1', CDATA sections and comments are considered plain text unless '$config["comment"]' or '$config["cdata"]' is explicitly specified; see section:- #3.6. 893 When '$config["safe"] = 1', CDATA sections and comments are considered plain text unless '$config["comment"]' or '$config["cdata"]' is explicitly specified; see section:- #3.6.
894 894
895 895
896.. 3.3.2 Tag-transformation for better compliance with standards ..o 896.. 3.3.2 Tag-transformation for better compliance with standards ..o
897 897
898 898
899 If '$config["make_tag_strict"]' is set and not '0', the following deprecated elements (and attributes), as per HTML 5 specification, even if admin-permitted, are mutated as indicated (element content remains intact; function 'hl_tag2()'): 899 If '$config["make_tag_strict"]' is set and not '0', the following deprecated elements (and attributes), as per HTML 5 specification, even if admin-permitted, are mutated as indicated (element content remains intact; function 'hl_tag2()'):
900 900
901 * acronym - 'abbr' 901 * acronym - 'abbr'
902 * applet - based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2') 902 * applet - based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2')
903 * big - 'span style="font-size: larger;"' 903 * big - 'span style="font-size: larger;"'
904 * center - 'div style="text-align: center;"' 904 * center - 'div style="text-align: center;"'
905 * dir - 'ul' 905 * dir - 'ul'
906 * font (face, size, color) - 'span style="font-family: ; font-size: ; color: ;"' (size transformation reference:- http://web.archive.org/web/20180201141931/http://style.cleverchimp.com/font_size_intervals/altintervals.html) 906 * font (face, size, color) - 'span style="font-family: ; font-size: ; color: ;"' (size transformation reference:- http://web.archive.org/web/20180201141931/http://style.cleverchimp.com/font_size_intervals/altintervals.html)
907 * isindex - based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2') 907 * isindex - based on '$config["make_tag_strict"]', unchanged ('1') or removed ('2')
908 * s - 'span style="text-decoration: line-through;"' 908 * s - 'span style="text-decoration: line-through;"'
909 * strike - 'span style="text-decoration: line-through;"' 909 * strike - 'span style="text-decoration: line-through;"'
910 * tt - 'code' 910 * tt - 'code'
911 911
912 For an element with a pre-existing 'style' attribute value, the extra style properties are appended. 912 For an element with a pre-existing 'style' attribute value, the extra style properties are appended.
913 913
914 Example input: 914 Example input:
915 915
916 <center> 916 <center>
917 The PHP <s>software</s> script used for this <strike>web-page</strike> web-page is <font style="font-weight: bold " face=arial size='+3' color = "red ">htmLawedTest.php</font>, from <u style= 'color:green'>PHP Labware</u>. 917 The PHP <s>software</s> script used for this <strike>web-page</strike> web-page is <font style="font-weight: bold " face=arial size='+3' color = "red ">htmLawedTest.php</font>, from <u style= 'color:green'>PHP Labware</u>.
918 </center> 918 </center>
919 919
920 The output: 920 The output:
921 921
922 <div style="text-align: center;"> 922 <div style="text-align: center;">
923 The PHP <span style="text-decoration: line-through;">software</span> script used for this <span style="text-decoration: line-through;">web-page</span> web-page is <span style="font-weight: bold; font-size: 200%; color: red; font-family: arial;">htmLawedTest.php</span>, from <u style="color:green">PHP Labware</u>. 923 The PHP <span style="text-decoration: line-through;">software</span> script used for this <span style="text-decoration: line-through;">web-page</span> web-page is <span style="font-weight: bold; font-size: 200%; color: red; font-family: arial;">htmLawedTest.php</span>, from <u style="color:green">PHP Labware</u>.
924 </div> 924 </div>
925 925
926 926
927.. 3.3.3 Tag balancing & proper nesting ...........................o 927.. 3.3.3 Tag balancing & proper nesting ...........................o
928 928
929 929
930 If '$config["balance"]' is set to '1', htmLawed (function 'hl_bal()') checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them). 930 If '$config["balance"]' is set to '1', htmLawed (function 'hl_bal()') checks and corrects the input to have properly balanced tags and legal element content (i.e., any element nesting should be valid, and plain text may be present only in the content of elements that allow them).
931 931
932 Depending on the value of '$config["keep_bad"]' (see section:- #2.2 and section:- #3.3), illegal content may be removed or neutralized to plain text by converting < and > to entities: 932 Depending on the value of '$config["keep_bad"]' (see section:- #2.2 and section:- #3.3), illegal content may be removed or neutralized to plain text by converting < and > to entities:
933 933
934 '0' - remove; this option is available only to maintain Kses-compatibility and should not be used otherwise (see section:- #2.6) 934 '0' - remove; this option is available only to maintain Kses-compatibility and should not be used otherwise (see section:- #2.6)
935 '1' - neutralize tags and keep element content 935 '1' - neutralize tags and keep element content
936 '2' - remove tags but keep element content 936 '2' - remove tags but keep element content
937 '3' and '4' - like '1' and '2', but keep element content only if text ('pcdata') is valid in parent element as per specs 937 '3' and '4' - like '1' and '2', but keep element content only if text ('pcdata') is valid in parent element as per specs
938 '5' and '6' - like '3' and '4', but line-breaks, tabs and spaces are left 938 '5' and '6' - like '3' and '4', but line-breaks, tabs and spaces are left
939 939
940 Example input (disallowing the 'p' element): 940 Example input (disallowing the 'p' element):
941 941
942 <*> Pseudo-tags <*> 942 <*> Pseudo-tags <*>
943 <xml>Non-HTML tag xml</xml> 943 <xml>Non-HTML tag xml</xml>
944 <p> 944 <p>
945 Disallowed tag p 945 Disallowed tag p
946 </p> 946 </p>
947 <ul>Bad<li>OK</li></ul> 947 <ul>Bad<li>OK</li></ul>
948 948
949 The output with '$config["keep_bad"] = 1': 949 The output with '$config["keep_bad"] = 1':
950 950
951 &lt;*&gt; Pseudo-tags &lt;*&gt; 951 &lt;*&gt; Pseudo-tags &lt;*&gt;
952 &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt; 952 &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt;
953 &lt;p&gt; 953 &lt;p&gt;
954 Disallowed tag p 954 Disallowed tag p
955 &lt;/p&gt; 955 &lt;/p&gt;
956 <ul>Bad<li>OK</li></ul> 956 <ul>Bad<li>OK</li></ul>
957 957
958 The output with '$config["keep_bad"] = 3': 958 The output with '$config["keep_bad"] = 3':
959 959
960 &lt;*&gt; Pseudo-tags &lt;*&gt; 960 &lt;*&gt; Pseudo-tags &lt;*&gt;
961 &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt; 961 &lt;xml&gt;Non-HTML tag xml&lt;/xml&gt;
962 &lt;p&gt; 962 &lt;p&gt;
963 Disallowed tag p 963 Disallowed tag p
964 &lt;/p&gt; 964 &lt;/p&gt;
965 <ul><li>OK</li></ul> 965 <ul><li>OK</li></ul>
966 966
967 The output with '$config["keep_bad"] = 6': 967 The output with '$config["keep_bad"] = 6':
968 968
969 &lt;*&gt; Pseudo-tags &lt;*&gt; 969 &lt;*&gt; Pseudo-tags &lt;*&gt;
970 Non-HTML tag xml 970 Non-HTML tag xml
971 971
972 Disallowed tag p 972 Disallowed tag p
973 973
974 <ul><li>OK</li></ul> 974 <ul><li>OK</li></ul>
975 975
976 An option like '1' is useful, e.g., when a writer previews his submission, whereas one like '3' is useful before content is finalized and made available to all. 976 An option like '1' is useful, e.g., when a writer previews his submission, whereas one like '3' is useful before content is finalized and made available to all.
977 977
978 *Note:* In the example above, unlike '<*>', '<xml>' gets considered as a tag (even though there is no HTML element named 'xml'). Thus, the 'keep_bad' parameter's value affects '<xml>' but not '<*>'. In general, text matching the regular expression pattern '<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>' is considered a tag (a phrase enclosed by the angle brackets '<' and '>', and starting [with an optional slash preceding] with an alphanumeric word that starts with a letter...), and is subjected to the 'keep_bad' value. 978 *Note:* In the example above, unlike '<*>', '<xml>' gets considered as a tag (even though there is no HTML element named 'xml'). Thus, the 'keep_bad' parameter's value affects '<xml>' but not '<*>'. In general, text matching the regular expression pattern '<(/?)([a-zA-Z][a-zA-Z1-6]*)([^>]*?)\s?>' is considered a tag (a phrase enclosed by the angle brackets '<' and '>', and starting [with an optional slash preceding] with an alphanumeric word that starts with a letter...), and is subjected to the 'keep_bad' value.
979 979
980 Nesting/content rules for each of the 118 elements in htmLawed's default set (see section:- #3.3) are defined in function 'hl_bal()'. This means that if a non-standard element besides 'embed' is being permitted through '$config["elements"]', the element's tag content will end up getting removed if '$config["balance"]' is set to '1'. 980 Nesting/content rules for each of the 118 elements in htmLawed's default set (see section:- #3.3) are defined in function 'hl_bal()'. This means that if a non-standard element besides 'embed' is being permitted through '$config["elements"]', the element's tag content will end up getting removed if '$config["balance"]' is set to '1'.
981 981
982 Plain text and/or certain elements nested inside 'blockquote', 'form', 'map' and 'noscript' need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. E.g., if the parent container is set as 'form', the input 'B:<input type="text" value="b" />C:<input type="text" value="c" />' is converted to '<div>B:<input type="text" value="b" />C:<input type="text" value="c" /></div>'. 982 Plain text and/or certain elements nested inside 'blockquote', 'form', 'map' and 'noscript' need to be in block-level elements. This point is often missed during manual writing of HTML code. htmLawed attempts to address this during balancing. E.g., if the parent container is set as 'form', the input 'B:<input type="text" value="b" />C:<input type="text" value="c" />' is converted to '<div>B:<input type="text" value="b" />C:<input type="text" value="c" /></div>'.
983 983
984 984
985.. 3.3.4 Elements requiring child elements ........................o 985.. 3.3.4 Elements requiring child elements ........................o
986 986
987 987
988 As per HTML specifications, elements such as those below require legal child elements nested inside them: 988 As per HTML specifications, elements such as those below require legal child elements nested inside them:
989 989
990 blockquote, dir, dl, form, map, menu, noscript, ol, optgroup, rbc, rtc, ruby, select, table, tbody, tfoot, thead, tr, ul 990 blockquote, dir, dl, form, map, menu, noscript, ol, optgroup, rbc, rtc, ruby, select, table, tbody, tfoot, thead, tr, ul
991 991
992 In some cases, the specifications stipulate the number and/or the ordering of the child elements. A 'table' can have 0 or 1 'caption', 'tbody', 'tfoot', and 'thead', but they must be in this order: 'caption', 'thead', 'tfoot', 'tbody'. 992 In some cases, the specifications stipulate the number and/or the ordering of the child elements. A 'table' can have 0 or 1 'caption', 'tbody', 'tfoot', and 'thead', but they must be in this order: 'caption', 'thead', 'tfoot', 'tbody'.
993 993
994 htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages. 994 htmLawed currently does not check for conformance to these rules. Note that any non-compliance in this regard will not introduce security vulnerabilities, crash browser applications, or affect the rendering of web-pages.
995 995
996 With '$config["direct_list_nest"]' set to '1', htmLawed will allow direct nesting of 'ol', 'ul', or 'menu' list within another 'ol', 'ul', or 'menu' without requiring the child list to be within an 'li' of the parent list. While this may not be standard-compliant, directly nested lists are rendered properly by almost all browsers. The parameter '$config["direct_list_nest"]' has no effect if tag balancing (section:- #3.3.3) is turned off. 996 With '$config["direct_list_nest"]' set to '1', htmLawed will allow direct nesting of 'ol', 'ul', or 'menu' list within another 'ol', 'ul', or 'menu' without requiring the child list to be within an 'li' of the parent list. While this may not be standard-compliant, directly nested lists are rendered properly by almost all browsers. The parameter '$config["direct_list_nest"]' has no effect if tag balancing (section:- #3.3.3) is turned off.
997 997
998 998
999.. 3.3.5 Beautify or compact HTML .................................o 999.. 3.3.5 Beautify or compact HTML .................................o
1000 1000
1001 1001
1002 By default, htmLawed will neither `beautify` HTML code by formatting it with indentations, etc., nor will it make it compact by removing un-needed white-space. (It does always properly white-space tag content.) 1002 By default, htmLawed will neither `beautify` HTML code by formatting it with indentations, etc., nor will it make it compact by removing un-needed white-space. (It does always properly white-space tag content.)
1003 1003
1004 As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside 'pre' elements) are all considered equivalent, and referred to as `white-spaces`. Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space `normalization` allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such `pretty` HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome. 1004 As per the HTML standards, spaces, tabs and line-breaks in web-pages (except those inside 'pre' elements) are all considered equivalent, and referred to as `white-spaces`. Browser applications are supposed to consider contiguous white-spaces as just a single space, and to disregard white-spaces trailing opening tags or preceding closing tags. This white-space `normalization` allows the use of text/code beautifully formatted with indentations and line-spacings for readability. Such `pretty` HTML can, however, increase the size of web-pages, or make the extraction or scraping of plain text cumbersome.
1005 1005
1006 With the '$config' parameter 'tidy', htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. Besides 'pre', the 'script' and 'textarea' elements, CDATA sections, and HTML comments are not subjected to the tidying process. 1006 With the '$config' parameter 'tidy', htmLawed can be used to beautify or compact the input text. Input with just plain text and no HTML markup is also subject to this. Besides 'pre', the 'script' and 'textarea' elements, CDATA sections, and HTML comments are not subjected to the tidying process.
1007 1007
1008 To `compact`, use '$config["tidy"] = -1'; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed. 1008 To `compact`, use '$config["tidy"] = -1'; single instances or runs of white-spaces are replaced with a single space, and white-spaces trailing and leading open and closing tags, respectively, are removed.
1009 1009
1010 To `beautify`, '$config["tidy"]' is set as '1', or for customized tidying, as a string like '2s2n'. The 's' or 't' character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The 'r' and 'n' characters are used to specify line-break characters: 'n' for '\n' (Unix/Mac OS X line-breaks), 'rn' or 'nr' for '\r\n' (Windows/DOS line-breaks), or 'r' for '\r'. 1010 To `beautify`, '$config["tidy"]' is set as '1', or for customized tidying, as a string like '2s2n'. The 's' or 't' character specifies the use of spaces or tabs for indentation. The first and third characters, any of the digits 0-9, specify the number of spaces or tabs per indentation, and any parental lead spacing (extra indenting of the whole block of input text). The 'r' and 'n' characters are used to specify line-break characters: 'n' for '\n' (Unix/Mac OS X line-breaks), 'rn' or 'nr' for '\r\n' (Windows/DOS line-breaks), or 'r' for '\r'.
1011 1011
1012 The '$config["tidy"]' value of '1' is equivalent to '2s0n'. Other '$config["tidy"]' values are read loosely: a value of '4' is equivalent to '4s0n'; 't2', to '1t2n'; 's', to '2s0n'; '2TR', to '2t0r'; 'T1', to '1t1n'; 'nr3', to '3s0nr', and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification. 1012 The '$config["tidy"]' value of '1' is equivalent to '2s0n'. Other '$config["tidy"]' values are read loosely: a value of '4' is equivalent to '4s0n'; 't2', to '1t2n'; 's', to '2s0n'; '2TR', to '2t0r'; 'T1', to '1t1n'; 'nr3', to '3s0nr', and so on. Except in the indentations and line-spacings, runs of white-spaces are replaced with a single space during beautification.
1013 1013
1014 Input formatting using '$config["tidy"]' is not recommended when input text has mixed markup (like HTML + PHP). 1014 Input formatting using '$config["tidy"]' is not recommended when input text has mixed markup (like HTML + PHP).
1015 1015
1016 1016
1017-- 3.4 Attributes -------------------------------------------------o 1017-- 3.4 Attributes -------------------------------------------------o
1018 1018
1019 1019
1020 In its default setting, htmLawed will only permit attributes described in the HTML specifications (including deprecated ones). A list of the attributes and the elements they are allowed in is in section:- #5.2. Using the '$spec' argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (section:- #2.3). 1020 In its default setting, htmLawed will only permit attributes described in the HTML specifications (including deprecated ones). A list of the attributes and the elements they are allowed in is in section:- #5.2. Using the '$spec' argument, htmLawed can be forced to permit custom, non-standard attributes as well as custom rules for standard attributes (section:- #2.3).
1021 1021
1022 Custom `data-*` (`data-star`) attributes, where the first three characters of the value of `star` (*) after lower-casing do not equal 'xml', and the value of `star` does not have a colon (:), equal-to (=), newline, solidus (/), space or tab character, or any upper-case A-Z character are allowed in all elements. ARIA, event and microdata attributes like 'aria-live', 'onclick' and 'itemid' are also considered global attributes (section:- #5.2). 1022 Custom `data-*` (`data-star`) attributes, where the first three characters of the value of `star` (*) after lower-casing do not equal 'xml', and the value of `star` does not have a colon (:), equal-to (=), newline, solidus (/), space or tab character, or any upper-case A-Z character are allowed in all elements. ARIA, event and microdata attributes like 'aria-live', 'onclick' and 'itemid' are also considered global attributes (section:- #5.2).
1023 1023
1024 When '$config["deny_attribute"]' is not set, or set to '0', or empty ('""'), all attributes are permitted. Otherwise, '$config["deny_attribute"]' can be set as a list of comma-separated names of the denied attributes. 'on*' can be used to refer to the group of potentially dangerous, script-accepting event attributes like 'onblur' and 'onchange' that have 'on' at the beginning of their names. Similarly, 'aria*' and 'data*' can be used to respectively refer to the set of all ARIA and data-* attributes. 1024 When '$config["deny_attribute"]' is not set, or set to '0', or empty ('""'), all attributes are permitted. Otherwise, '$config["deny_attribute"]' can be set as a list of comma-separated names of the denied attributes. 'on*' can be used to refer to the group of potentially dangerous, script-accepting event attributes like 'onblur' and 'onchange' that have 'on' at the beginning of their names. Similarly, 'aria*' and 'data*' can be used to respectively refer to the set of all ARIA and data-* attributes.
1025 1025
1026 With '$config["safe"] = 1' (section:- #3.6), the 'on*' event attributes are automatically disallowed even if a value for '$config["deny_attribute"]' has been manually provided. 1026 With '$config["safe"] = 1' (section:- #3.6), the 'on*' event attributes are automatically disallowed even if a value for '$config["deny_attribute"]' has been manually provided.
1027 1027
1028 Note that attributes specified in '$config["deny_attribute"]' are denied globally, for all elements. To deny attributes for only specific elements, '$spec' (see section:- #2.3) can be used. '$spec' can also be used to element-specifically permit an attribute otherwise denied through '$config["deny_attribute"]'. 1028 Note that attributes specified in '$config["deny_attribute"]' are denied globally, for all elements. To deny attributes for only specific elements, '$spec' (see section:- #2.3) can be used. '$spec' can also be used to element-specifically permit an attribute otherwise denied through '$config["deny_attribute"]'.
1029 1029
1030 Finer restrictions on attributes can also be put into effect through '$config["hook_tag"]' (section:- #3.4.9). 1030 Finer restrictions on attributes can also be put into effect through '$config["hook_tag"]' (section:- #3.4.9).
1031 1031
1032 *Note*: To deny all but a few attributes globally, a simpler way to specify '$config["deny_attribute"]' would be to use the notation '* -attribute1 -attribute2 ...'. Thus, a value of '* -title -href' implies that except 'href' and 'title' (where allowed as per standards) all other attributes are to be removed. With this notation, the value for the parameter 'safe' (section:- #3.6) will have no effect on 'deny_attribute'. Values of 'aria*', 'data*', and 'on*' cannot be used in this notation to refer to the sets of all ARIA, data-*, and on* attributes respectively. 1032 *Note*: To deny all but a few attributes globally, a simpler way to specify '$config["deny_attribute"]' would be to use the notation '* -attribute1 -attribute2 ...'. Thus, a value of '* -title -href' implies that except 'href' and 'title' (where allowed as per standards) all other attributes are to be removed. With this notation, the value for the parameter 'safe' (section:- #3.6) will have no effect on 'deny_attribute'. Values of 'aria*', 'data*', and 'on*' cannot be used in this notation to refer to the sets of all ARIA, data-*, and on* attributes respectively.
1033 1033
1034 htmLawed (function 'hl_tag()') also: 1034 htmLawed (function 'hl_tag()') also:
1035 1035
1036 * Lower-cases attribute names 1036 * Lower-cases attribute names
1037 * Removes duplicate attributes (last one stays) 1037 * Removes duplicate attributes (last one stays)
1038 * Gives attributes the form 'name="value"' and single-spaces them, removing unnecessary white-spacing 1038 * Gives attributes the form 'name="value"' and single-spaces them, removing unnecessary white-spacing
1039 * Provides `required` attributes (see section:- #3.4.1) 1039 * Provides `required` attributes (see section:- #3.4.1)
1040 * Double-quotes values and escapes any '"' inside them 1040 * Double-quotes values and escapes any '"' inside them
1041 * Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point 'ad') in the values with spaces 1041 * Replaces the possibly dangerous soft-hyphen characters (hexadecimal code-point 'ad') in the values with spaces
1042 * Allows custom function to additionally filter/modify attribute values (see section:- #3.4.9) 1042 * Allows custom function to additionally filter/modify attribute values (see section:- #3.4.9)
1043 1043
1044 1044
1045.. 3.4.1 Auto-addition of XHTML-required attributes ................ 1045.. 3.4.1 Auto-addition of XHTML-required attributes ................
1046 1046
1047 1047
1048 If indicated attributes for the following elements are found missing, htmLawed (function 'hl_tag()') will add them (with values same as attribute names unless indicated otherwise below): 1048 If indicated attributes for the following elements are found missing, htmLawed (function 'hl_tag()') will add them (with values same as attribute names unless indicated otherwise below):
1049 1049
1050 * area - alt ('area') 1050 * area - alt ('area')
1051 * area, img - src, alt ('image') 1051 * area, img - src, alt ('image')
1052 * bdo - dir ('ltr') 1052 * bdo - dir ('ltr')
1053 * form - action 1053 * form - action
1054 * label - command 1054 * label - command
1055 * map - name 1055 * map - name
1056 * optgroup - label 1056 * optgroup - label
1057 * param - name 1057 * param - name
1058 * style - scoped 1058 * style - scoped
1059 * textarea - rows ('10'), cols ('50') 1059 * textarea - rows ('10'), cols ('50')
1060 1060
1061 Additionally, with '$config["xml:lang"]' set to '1' or '2', if the 'lang' but not the 'xml:lang' attribute is declared, then the latter is added too, with a value copied from that of 'lang'. This is for better standard-compliance. With '$config["xml:lang"]' set to '2', the 'lang' attribute is removed (XHTML specification). 1061 Additionally, with '$config["xml:lang"]' set to '1' or '2', if the 'lang' but not the 'xml:lang' attribute is declared, then the latter is added too, with a value copied from that of 'lang'. This is for better standard-compliance. With '$config["xml:lang"]' set to '2', the 'lang' attribute is removed (XHTML specification).
1062 1062
1063 Note that the 'name' attribute for 'map', invalid in XHTML, is also transformed if required -- see section:- #3.4.6. 1063 Note that the 'name' attribute for 'map', invalid in XHTML, is also transformed if required -- see section:- #3.4.6.
1064 1064
1065 1065
1066.. 3.4.2 Duplicate/invalid 'id' values ............................o 1066.. 3.4.2 Duplicate/invalid 'id' values ............................o
1067 1067
1068 1068
1069 If '$config["unique_ids"]' is '1', htmLawed (function 'hl_tag()') removes 'id' attributes with values that are not standards-compliant (must not have a space character) or duplicate. If '$config["unique_ids"]' is a word (without a non-word character like space), any duplicate but otherwise valid value will be appropriately prefixed with the word to ensure its uniqueness. 1069 If '$config["unique_ids"]' is '1', htmLawed (function 'hl_tag()') removes 'id' attributes with values that are not standards-compliant (must not have a space character) or duplicate. If '$config["unique_ids"]' is a word (without a non-word character like space), any duplicate but otherwise valid value will be appropriately prefixed with the word to ensure its uniqueness.
1070 1070
1071 Even if multiple inputs need to be filtered (through multiple calls to htmLawed), htmLawed ensures uniqueness of 'id' values as it uses a global variable ('$GLOBALS["hl_Ids"]' array). Further, an admin can restrict the use of certain 'id' values by presetting this variable before htmLawed is called into use. E.g.: 1071 Even if multiple inputs need to be filtered (through multiple calls to htmLawed), htmLawed ensures uniqueness of 'id' values as it uses a global variable ('$GLOBALS["hl_Ids"]' array). Further, an admin can restrict the use of certain 'id' values by presetting this variable before htmLawed is called into use. E.g.:
1072 1072
1073 $GLOBALS['hl_Ids'] = array('top'=>1, 'bottom'=>1, 'myform'=>1); // id values not allowed in input 1073 $GLOBALS['hl_Ids'] = array('top'=>1, 'bottom'=>1, 'myform'=>1); // id values not allowed in input
1074 $processed = htmLawed($text); // filter input 1074 $processed = htmLawed($text); // filter input
1075 1075
1076 1076
1077.. 3.4.3 URL schemes & scripts in attribute values ................o 1077.. 3.4.3 URL schemes & scripts in attribute values ................o
1078 1078
1079 1079
1080 htmLawed edits attributes that take URLs as values if they are found to contain un-permitted schemes. E.g., if the 'afp' scheme is not permitted, then '<a href="afp://domain.org">' becomes '<a href="denied:afp://domain.org">', and if Javascript is not permitted '<a onclick="javascript:xss();">' becomes '<a onclick="denied:javascript:xss();">'. 1080 htmLawed edits attributes that take URLs as values if they are found to contain un-permitted schemes. E.g., if the 'afp' scheme is not permitted, then '<a href="afp://domain.org">' becomes '<a href="denied:afp://domain.org">', and if Javascript is not permitted '<a onclick="javascript:xss();">' becomes '<a onclick="denied:javascript:xss();">'.
1081 1081
1082 By default htmLawed permits these schemes in URLs for the 'href' attribute: 1082 By default htmLawed permits these schemes in URLs for the 'href' attribute:
1083 1083
1084 aim, app, feed, file, ftp, gopher, http, https, javascript, irc, mailto, news, nntp, sftp, ssh, tel, telnet 1084 aim, app, feed, file, ftp, gopher, http, https, javascript, irc, mailto, news, nntp, sftp, ssh, tel, telnet
1085 1085
1086 Also, only 'data', 'file', 'http', 'https' and 'javascript' are permitted in these attributes that accept URLs: 1086 Also, only 'data', 'file', 'http', 'https' and 'javascript' are permitted in these attributes that accept URLs:
1087 1087
1088 action, cite, classid, codebase, data, itemtype, longdesc, model, pluginspage, pluginurl, src, srcset, style, usemap, and event attributes like onclick 1088 action, cite, classid, codebase, data, itemtype, longdesc, model, pluginspage, pluginurl, src, srcset, style, usemap, and event attributes like onclick
1089 1089
1090 With '$config["safe"] = 1' (section:- #3.6), the above is changed to disallow 'app', 'data' and 'javascript'. 1090 With '$config["safe"] = 1' (section:- #3.6), the above is changed to disallow 'app', 'data' and 'javascript'.
1091 1091
1092 These default sets are used when '$config["schemes"]' is not set (see section:- #2.2). To over-ride the defaults, '$config["schemes"]' is defined as a string of semi-colon-separated sub-strings of type 'attribute: comma-separated schemes'. E.g., 'href: mailto, http, https; onclick: javascript; src: http, https'. For unspecified attributes, 'data', 'file', 'http', 'https' and 'javascript' are permitted. This can be changed by passing schemes for '*' in '$config["schemes"]'. E.g., 'href: mailto, http, https; *: http, https'. 1092 These default sets are used when '$config["schemes"]' is not set (see section:- #2.2). To over-ride the defaults, '$config["schemes"]' is defined as a string of semi-colon-separated sub-strings of type 'attribute: comma-separated schemes'. E.g., 'href: mailto, http, https; onclick: javascript; src: http, https'. For unspecified attributes, 'data', 'file', 'http', 'https' and 'javascript' are permitted. This can be changed by passing schemes for '*' in '$config["schemes"]'. E.g., 'href: mailto, http, https; *: http, https'.
1093 1093
1094 '*' (asterisk) can be put in the list of schemes to permit all protocols. E.g., 'style: *; img: http, https' results in protocols not being checked in 'style' attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (section:- #3.4.4) is not done. When an attribute is explicitly listed in '$config["schemes"]', then filtering is dictated by the setting for the attribute, with no effect of the setting for asterisk. That is, the set of attributes that asterisk refers to no longer includes the listed attribute. 1094 '*' (asterisk) can be put in the list of schemes to permit all protocols. E.g., 'style: *; img: http, https' results in protocols not being checked in 'style' attribute values. However, in such cases, any relative-to-absolute URL conversion, or vice versa, (section:- #3.4.4) is not done. When an attribute is explicitly listed in '$config["schemes"]', then filtering is dictated by the setting for the attribute, with no effect of the setting for asterisk. That is, the set of attributes that asterisk refers to no longer includes the listed attribute.
1095 1095
1096 Thus, `to allow the xmpp scheme`, one can set '$config["schemes"]' as 'href: mailto, http, https; *: http, https, xmpp', or 'href: mailto, http, https, xmpp; *: http, https, xmpp', or '*: *', and so on. The consequence of each of these example values will be different (e.g., only the last two but not the first will allow 'xmpp' in 'href'). 1096 Thus, `to allow the xmpp scheme`, one can set '$config["schemes"]' as 'href: mailto, http, https; *: http, https, xmpp', or 'href: mailto, http, https, xmpp; *: http, https, xmpp', or '*: *', and so on. The consequence of each of these example values will be different (e.g., only the last two but not the first will allow 'xmpp' in 'href').
1097 1097
1098 As a side-note, one may find 'style: *' useful as URLs in 'style' attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text. 1098 As a side-note, one may find 'style: *' useful as URLs in 'style' attributes can be specified in a variety of ways, and the patterns that htmLawed uses to identify URLs may mistakenly identify non-URL text.
1099 1099
1100 '!' can be put in the list of schemes to disallow all protocols as well as `local` URLs. Thus, with 'href: http; style: !', '<a href="http://cnn.com" style="background-image: url(local.jpg);">CNN</a>' will become '<a href="http://cnn.com" style="background-image: url(denied:local.jpg);">CNN</a>'. 1100 '!' can be put in the list of schemes to disallow all protocols as well as `local` URLs. Thus, with 'href: http; style: !', '<a href="http://cnn.com" style="background-image: url(local.jpg);">CNN</a>' will become '<a href="http://cnn.com" style="background-image: url(denied:local.jpg);">CNN</a>'.
1101 1101
1102 *Note*: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string 'src' (e.g., 'dynsrc') or starts with 'o' (e.g., 'onbeforecopy'). 1102 *Note*: If URL-accepting attributes other than those listed above are being allowed, then the scheme will not be checked unless the attribute name contains the string 'src' (e.g., 'dynsrc') or starts with 'o' (e.g., 'onbeforecopy').
1103 1103
1104 With '$config["safe"] = 1', all URLs are disallowed in the 'style' attribute values. 1104 With '$config["safe"] = 1', all URLs are disallowed in the 'style' attribute values.
1105 1105
1106 1106
1107.. 3.4.4 Absolute & relative URLs in attribute values ............o 1107.. 3.4.4 Absolute & relative URLs in attribute values ............o
1108 1108
1109 1109
1110 htmLawed can make absolute URLs in attributes like 'href' relative ('$config["abs_url"]' is '-1'), and vice versa ('$config["abs_url"]' is '1'). URLs in scripts are not considered for this, and neither are URLs like '#section_6' (fragment), '?name=Tim#show' (starting with query string), and ';var=1?name=Tim#show' (starting with parameters). Further, this requires that '$config["base_url"]' be set properly, with the '://' and a trailing slash ('/'), with no query string, etc. E.g., 'file:///D:/page/', 'https://abc.com/x/y/', or 'http://localhost/demo/' are okay, but 'file:///D:/page/?help=1', 'abc.com/x/y/' and 'http://localhost/demo/index.htm' are not. 1110 htmLawed can make absolute URLs in attributes like 'href' relative ('$config["abs_url"]' is '-1'), and vice versa ('$config["abs_url"]' is '1'). URLs in scripts are not considered for this, and neither are URLs like '#section_6' (fragment), '?name=Tim#show' (starting with query string), and ';var=1?name=Tim#show' (starting with parameters). Further, this requires that '$config["base_url"]' be set properly, with the '://' and a trailing slash ('/'), with no query string, etc. E.g., 'file:///D:/page/', 'https://abc.com/x/y/', or 'http://localhost/demo/' are okay, but 'file:///D:/page/?help=1', 'abc.com/x/y/' and 'http://localhost/demo/index.htm' are not.
1111 1111
1112 For making absolute URLs relative, only those URLs that have the '$config["base_url"]' string at the beginning are converted. E.g., with '$config["base_url"] = "https://abc.com/x/y/"', 'https://abc.com/x/y/a.gif' and 'https://abc.com/x/y/z/b.gif' become 'a.gif' and 'z/b.gif' respectively, while 'https://abc.com/x/c.gif' is not changed. 1112 For making absolute URLs relative, only those URLs that have the '$config["base_url"]' string at the beginning are converted. E.g., with '$config["base_url"] = "https://abc.com/x/y/"', 'https://abc.com/x/y/a.gif' and 'https://abc.com/x/y/z/b.gif' become 'a.gif' and 'z/b.gif' respectively, while 'https://abc.com/x/c.gif' is not changed.
1113 1113
1114 When making relative URLs absolute, only the scheme, network location (host-name) and path values in the base URL are inherited. See section:- #5.5 for more about the URL specification as per RFC 1808:- http://www.ietf.org/rfc/rfc1808.txt. 1114 When making relative URLs absolute, only the scheme, network location (host-name) and path values in the base URL are inherited. See section:- #5.5 for more about the URL specification as per RFC 1808:- http://www.ietf.org/rfc/rfc1808.txt.
1115 1115
1116 1116
1117.. 3.4.5 Lower-cased, standard attribute values ...................o 1117.. 3.4.5 Lower-cased, standard attribute values ...................o
1118 1118
1119 1119
1120 Optionally, for standard-compliance, htmLawed (function 'hl_tag()') lower-cases standard attribute values to give, e.g., 'input type="password"' instead of 'input type="Password"', if '$config["lc_std_val"]' is '1'. Attribute values matching those listed below for any of the elements listed further below (plus those for the 'type' attribute of 'button' or 'input') are lower-cased: 1120 Optionally, for standard-compliance, htmLawed (function 'hl_tag()') lower-cases standard attribute values to give, e.g., 'input type="password"' instead of 'input type="Password"', if '$config["lc_std_val"]' is '1'. Attribute values matching those listed below for any of the elements listed further below (plus those for the 'type' attribute of 'button' or 'input') are lower-cased:
1121 1121
1122 all, auto, baseline, bottom, button, captions, center, chapters, char, checkbox, circle, col, colgroup, color, cols, data, date, datetime, datetime-local, default, descriptions, email, file, get, groups, hidden, image, justify, left, ltr, metadata, middle, month, none, number, object, password, poly, post, preserve, radio, range, rect, ref, reset, right, row, rowgroup, rows, rtl, search, submit, subtitles, tel, text, time, top, url, week 1122 all, auto, baseline, bottom, button, captions, center, chapters, char, checkbox, circle, col, colgroup, color, cols, data, date, datetime, datetime-local, default, descriptions, email, file, get, groups, hidden, image, justify, left, ltr, metadata, middle, month, none, number, object, password, poly, post, preserve, radio, range, rect, ref, reset, right, row, rowgroup, rows, rtl, search, submit, subtitles, tel, text, time, top, url, week
1123 1123
1124 a, area, bdo, button, col, fieldset, form, img, input, object, ol, optgroup, option, param, script, select, table, td, textarea, tfoot, th, thead, tr, track, xml:space 1124 a, area, bdo, button, col, fieldset, form, img, input, object, ol, optgroup, option, param, script, select, table, td, textarea, tfoot, th, thead, tr, track, xml:space
1125 1125
1126 The following `empty` (`minimized`) attributes are always assigned lower-cased values (same as the attribute names): 1126 The following `empty` (`minimized`) attributes are always assigned lower-cased values (same as the attribute names):
1127 1127
1128 checkbox, checked, command, compact, declare, defer, default, disabled, hidden, inert, ismap, itemscope, multiple, nohref, noresize, noshade, nowrap, open, radio, readonly, required, reversed, selected 1128 checkbox, checked, command, compact, declare, defer, default, disabled, hidden, inert, ismap, itemscope, multiple, nohref, noresize, noshade, nowrap, open, radio, readonly, required, reversed, selected
1129 1129
1130 1130
1131.. 3.4.6 Transformation of deprecated attributes ..................o 1131.. 3.4.6 Transformation of deprecated attributes ..................o
1132 1132
1133 1133
1134 If '$config["no_deprecated_attr"]' is '1' or '2' (i.e., not '0'), then deprecated attributes are removed and, in most cases, their values are transformed to CSS style properties and added to the 'style' attributes (function 'hl_tag()'). Except for 'bordercolor' for 'table', 'tr' and 'td', the scores of proprietary attributes that were never part of any cross-browser standard are not supported in this functionality. 1134 If '$config["no_deprecated_attr"]' is '1' or '2' (i.e., not '0'), then deprecated attributes are removed and, in most cases, their values are transformed to CSS style properties and added to the 'style' attributes (function 'hl_tag()'). Except for 'bordercolor' for 'table', 'tr' and 'td', the scores of proprietary attributes that were never part of any cross-browser standard are not supported in this functionality.
1135 1135
1136 * align in caption, div, h1, h2, h3, h4, h5, h6, hr, img, input, legend, object, p, table - for 'img' with value of 'left' or 'right', becomes, e.g., 'float: left'; for 'div' and 'table' with value 'center', becomes 'margin: auto'; all others become, e.g., 'text-align: right' 1136 * align in caption, div, h1, h2, h3, h4, h5, h6, hr, img, input, legend, object, p, table - for 'img' with value of 'left' or 'right', becomes, e.g., 'float: left'; for 'div' and 'table' with value 'center', becomes 'margin: auto'; all others become, e.g., 'text-align: right'
1137 * bgcolor in table, td, th and tr - E.g., 'bgcolor="#ffffff"' becomes 'background-color: #ffffff' 1137 * bgcolor in table, td, th and tr - E.g., 'bgcolor="#ffffff"' becomes 'background-color: #ffffff'
1138 * border in object - E.g., 'border="10"' becomes 'border: 10px' 1138 * border in object - E.g., 'border="10"' becomes 'border: 10px'
1139 * bordercolor in table, td and tr - E.g., 'bordercolor="#999999"' becomes 'border-color: #999999' 1139 * bordercolor in table, td and tr - E.g., 'bordercolor="#999999"' becomes 'border-color: #999999'
1140 * compact in dl, ol and ul - 'font-size: 85%' 1140 * compact in dl, ol and ul - 'font-size: 85%'
1141 * cellspacing in table - 'cellspacing="10"' becomes 'border-spacing: 10px' 1141 * cellspacing in table - 'cellspacing="10"' becomes 'border-spacing: 10px'
1142 * clear in br - E.g., 'clear="all"' becomes 'clear: both' 1142 * clear in br - E.g., 'clear="all"' becomes 'clear: both'
1143 * height in td and th - E.g., 'height="10"' becomes 'height: 10px' and 'height="*"' becomes 'height: auto' 1143 * height in td and th - E.g., 'height="10"' becomes 'height: 10px' and 'height="*"' becomes 'height: auto'
1144 * hspace in img and object - E.g., 'hspace="10"' becomes 'margin-left: 10px; margin-right: 10px' 1144 * hspace in img and object - E.g., 'hspace="10"' becomes 'margin-left: 10px; margin-right: 10px'
1145 * language in script - 'language="VBScript"' becomes 'type="text/vbscript"' 1145 * language in script - 'language="VBScript"' becomes 'type="text/vbscript"'
1146 * name in a, form, iframe, img and map - E.g., 'name="xx"' becomes 'id="xx"' 1146 * name in a, form, iframe, img and map - E.g., 'name="xx"' becomes 'id="xx"'
1147 * noshade in hr - 'border-style: none; border: 0; background-color: gray; color: gray' 1147 * noshade in hr - 'border-style: none; border: 0; background-color: gray; color: gray'
1148 * nowrap in td and th - 'white-space: nowrap' 1148 * nowrap in td and th - 'white-space: nowrap'
1149 * size in hr - E.g., 'size="10"' becomes 'height: 10px' 1149 * size in hr - E.g., 'size="10"' becomes 'height: 10px'
1150 * vspace in img and object - E.g., 'vspace="10"' becomes 'margin-top: 10px; margin-bottom: 10px' 1150 * vspace in img and object - E.g., 'vspace="10"' becomes 'margin-top: 10px; margin-bottom: 10px'
1151 * width in hr, pre, table, td and th - like 'height' 1151 * width in hr, pre, table, td and th - like 'height'
1152 1152
1153 Example input: 1153 Example input:
1154 1154
1155 <img src="j.gif" alt="image" name="dad's" /><img src="k.gif" alt="image" id="dad_off" name="dad" /> 1155 <img src="j.gif" alt="image" name="dad's" /><img src="k.gif" alt="image" id="dad_off" name="dad" />
1156 <br clear="left" /> 1156 <br clear="left" />
1157 <hr noshade size="1" /> 1157 <hr noshade size="1" />
1158 <img name="img" src="i.gif" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding:5px;" /> 1158 <img name="img" src="i.gif" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding:5px;" />
1159 <table width="50em" align="center" bgcolor="red"> 1159 <table width="50em" align="center" bgcolor="red">
1160 <tr> 1160 <tr>
1161 <td width="20%"> 1161 <td width="20%">
1162 <div align="center"> 1162 <div align="center">
1163 <h3 align="right">Section</h3> 1163 <h3 align="right">Section</h3>
1164 <p align="right">Para</p> 1164 <p align="right">Para</p>
1165 </div> 1165 </div>
1166 </td> 1166 </td>
1167 <td width="*"> 1167 <td width="*">
1168 </td> 1168 </td>
1169 </tr> 1169 </tr>
1170 </table> 1170 </table>
1171 <br clear="all" /> 1171 <br clear="all" />
1172 1172
1173 And the output with '$config["no_deprecated_attr"] = 1': 1173 And the output with '$config["no_deprecated_attr"] = 1':
1174 1174
1175 <img src="j.gif" alt="image" id="dad's" /><img src="k.gif" alt="image" id="dad_off" /> 1175 <img src="j.gif" alt="image" id="dad's" /><img src="k.gif" alt="image" id="dad_off" />
1176 <br style="clear: left;" /> 1176 <br style="clear: left;" />
1177 <hr style="border-style: none; border: 0; background-color: gray; color: gray; size: 1px;" /> 1177 <hr style="border-style: none; border: 0; background-color: gray; color: gray; size: 1px;" />
1178 <img src="i.gif" alt="image" width="10em" height="20" style="padding:5px; float: left; margin-left: 10px; margin-right: 10px; margin-top: 10px; margin-bottom: 10px; border: 1px;" id="img" /> 1178 <img src="i.gif" alt="image" width="10em" height="20" style="padding:5px; float: left; margin-left: 10px; margin-right: 10px; margin-top: 10px; margin-bottom: 10px; border: 1px;" id="img" />
1179 <table width="50em" style="margin: auto; background-color: red;"> 1179 <table width="50em" style="margin: auto; background-color: red;">
1180 <tr> 1180 <tr>
1181 <td style="width: 20%;"> 1181 <td style="width: 20%;">
1182 <div style="margin: auto;"> 1182 <div style="margin: auto;">
1183 <h3 style="text-align: right;">Section</h3> 1183 <h3 style="text-align: right;">Section</h3>
1184 <p style="text-align: right;">Para</p> 1184 <p style="text-align: right;">Para</p>
1185 </div> 1185 </div>
1186 </td> 1186 </td>
1187 <td style="width: auto;"> 1187 <td style="width: auto;">
1188 </td> 1188 </td>
1189 </tr> 1189 </tr>
1190 </table> 1190 </table>
1191 <br style="clear: both;" /> 1191 <br style="clear: both;" />
1192 1192
1193 For 'lang', deprecated in XHTML 1.1, transformation is taken care of through '$config["xml:lang"]'; see section:- #3.4.1. 1193 For 'lang', deprecated in XHTML 1.1, transformation is taken care of through '$config["xml:lang"]'; see section:- #3.4.1.
1194 1194
1195 The attribute 'name' is deprecated in 'form', 'iframe', and 'img', and is replaced with 'id' if an 'id' attribute doesn't exist and if the 'name' value is appropriate for 'id' (i.e., doesn't have a non-word character like space). For such replacements for 'a' and 'map', for which the 'name' attribute is deprecated in XHTML 1.1, '$config["no_deprecated_attr"]' should be set to '2' (when set to '1', for these two elements, the 'name' attribute is retained). 1195 The attribute 'name' is deprecated in 'form', 'iframe', and 'img', and is replaced with 'id' if an 'id' attribute doesn't exist and if the 'name' value is appropriate for 'id' (i.e., doesn't have a non-word character like space). For such replacements for 'a' and 'map', for which the 'name' attribute is deprecated in XHTML 1.1, '$config["no_deprecated_attr"]' should be set to '2' (when set to '1', for these two elements, the 'name' attribute is retained).
1196 1196
1197 1197
1198.. 3.4.7 Anti-spam & 'href' .......................................o 1198.. 3.4.7 Anti-spam & 'href' .......................................o
1199 1199
1200 1200
1201 htmLawed (function 'hl_tag()') can check the 'href' attribute values (link addresses) as an anti-spam (email or link spam) measure. 1201 htmLawed (function 'hl_tag()') can check the 'href' attribute values (link addresses) as an anti-spam (email or link spam) measure.
1202 1202
1203 If '$config["anti_mail_spam"]' is not '0', the '@' of email addresses in 'href' values like 'mailto:a@b.com' is replaced with text specified by '$config["anti_mail_spam"]'. The text should be of a form that makes it clear to others that the address needs to be edited before a mail is sent; e.g., '<remove_this_antispam>@' (makes the example address 'a<remove_this_antispam>@b.com'). 1203 If '$config["anti_mail_spam"]' is not '0', the '@' of email addresses in 'href' values like 'mailto:a@b.com' is replaced with text specified by '$config["anti_mail_spam"]'. The text should be of a form that makes it clear to others that the address needs to be edited before a mail is sent; e.g., '<remove_this_antispam>@' (makes the example address 'a<remove_this_antispam>@b.com').
1204 1204
1205 For regular links, one can choose to have a 'rel' attribute with 'nofollow' in its value (which tells some search engines to not follow a link). This can discourage link spammers. Additionally, or as an alternative, one can choose to empty the 'href' value altogether (disable the link). 1205 For regular links, one can choose to have a 'rel' attribute with 'nofollow' in its value (which tells some search engines to not follow a link). This can discourage link spammers. Additionally, or as an alternative, one can choose to empty the 'href' value altogether (disable the link).
1206 1206
1207 For use of these options, '$config["anti_link_spam"]' should be set as an array with values 'regex1' and 'regex2', both or one of which can be empty (like 'array("", "regex2")') to indicate that that option is not to be used. Otherwise, 'regex1' or 'regex2' should be PHP- and PCRE-compatible regular expression patterns: 'href' values will be matched against them and those matching the pattern will accordingly be treated. 1207 For use of these options, '$config["anti_link_spam"]' should be set as an array with values 'regex1' and 'regex2', both or one of which can be empty (like 'array("", "regex2")') to indicate that that option is not to be used. Otherwise, 'regex1' or 'regex2' should be PHP- and PCRE-compatible regular expression patterns: 'href' values will be matched against them and those matching the pattern will accordingly be treated.
1208 1208
1209 Note that the regular expressions should have `delimiters`, and be well-formed and preferably fast. Absolute efficiency/accuracy is often not needed. 1209 Note that the regular expressions should have `delimiters`, and be well-formed and preferably fast. Absolute efficiency/accuracy is often not needed.
1210 1210
1211 An example, to have a 'rel' attribute with 'nofollow' for all links, and to disable links that do not point to domains 'abc.com' and 'xyz.org': 1211 An example, to have a 'rel' attribute with 'nofollow' for all links, and to disable links that do not point to domains 'abc.com' and 'xyz.org':
1212 1212
1213 $config["anti_link_spam"] = array('`.`', '`://\W*(?!(abc\.com|xyz\.org))`'); 1213 $config["anti_link_spam"] = array('`.`', '`://\W*(?!(abc\.com|xyz\.org))`');
1214 1214
1215 1215
1216.. 3.4.8 Inline style properties ..................................o 1216.. 3.4.8 Inline style properties ..................................o
1217 1217
1218 1218
1219 htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the 'style' attributes. (CSS properties like 'background-image' that accept URLs in their values are noted in section:- #5.3.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting '$config["css_expression"]' to '1' (default setting). Note that when '$config["css_expression"]' is set to '1', htmLawed will remove '/*' from the 'style' values. 1219 htmLawed can check URL schemes and dynamic expressions (to guard against Javascript, etc., script-based insecurities) in inline CSS style property values in the 'style' attributes. (CSS properties like 'background-image' that accept URLs in their values are noted in section:- #5.3.) Dynamic CSS expressions that allow scripting in the IE browser, and can be a vulnerability, can be removed from property values by setting '$config["css_expression"]' to '1' (default setting). Note that when '$config["css_expression"]' is set to '1', htmLawed will remove '/*' from the 'style' values.
1220 1220
1221 *Note*: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter the values of the 'style' attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off ('$config["schemes"] = "...style:*..."', see section:- #3.4.3, and '$config["css_expression"] = 0'). Alternately, admins can use their own custom function for finer handling of 'style' values through the 'hook_tag' parameter (see section:- #3.4.9). 1221 *Note*: Because of the various ways of representing characters in attribute values (URL-escapement, entitification, etc.), htmLawed might alter the values of the 'style' attribute values, and may even falsely identify dynamic CSS expressions and URL schemes in them. If this is an important issue, checking of URLs and dynamic expressions can be turned off ('$config["schemes"] = "...style:*..."', see section:- #3.4.3, and '$config["css_expression"] = 0'). Alternately, admins can use their own custom function for finer handling of 'style' values through the 'hook_tag' parameter (see section:- #3.4.9).
1222 1222
1223 It is also possible to have htmLawed let through any 'style' value by setting '$config["style_pass"]' to '1'. 1223 It is also possible to have htmLawed let through any 'style' value by setting '$config["style_pass"]' to '1'.
1224 1224
1225 As such, it is better to set up a CSS file with class declarations, disallow the 'style' attribute, set a '$spec' rule (see section:- #2.3) for 'class' for the 'oneof' or 'match' parameter, and ask writers to make use of the 'class' attribute. 1225 As such, it is better to set up a CSS file with class declarations, disallow the 'style' attribute, set a '$spec' rule (see section:- #2.3) for 'class' for the 'oneof' or 'match' parameter, and ask writers to make use of the 'class' attribute.
1226 1226
1227 1227
1228.. 3.4.9 Hook function for tag content ............................o 1228.. 3.4.9 Hook function for tag content ............................o
1229 1229
1230 1230
1231 It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.). 1231 It is possible to utilize a custom hook function to alter the tag content htmLawed has finalized (i.e., after it has checked/corrected for required attributes, transformed attributes, lower-cased attribute names, etc.).
1232 1232
1233 When '$config' parameter 'hook_tag' is set to the name of a function, htmLawed (function 'hl_tag()') will pass on the element name, and the `finalized` attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like '<element_name attribute_1_name="attribute_1_value"...>' (for empty elements like 'img' and 'input', the element-closing slash '/' should also be included), etc. 1233 When '$config' parameter 'hook_tag' is set to the name of a function, htmLawed (function 'hl_tag()') will pass on the element name, and the `finalized` attribute name-value pairs as array elements to the function. The function, after completing a task such as filtering or tag transformation, will typically return an empty string, the full opening tag string like '<element_name attribute_1_name="attribute_1_value"...>' (for empty elements like 'img' and 'input', the element-closing slash '/' should also be included), etc.
1234 1234
1235 Any 'hook_tag' function, since htmLawed version 1.1.11, also receives names of elements in closing tags, such as 'a' in the closing '</a>' tag of the element '<a href="http://cnn.com">CNN</a>'. No other value is passed to the function since a closing tag contains only element names. Typically, the function will return an empty string or a full closing tag (like '</a>'). 1235 Any 'hook_tag' function, since htmLawed version 1.1.11, also receives names of elements in closing tags, such as 'a' in the closing '</a>' tag of the element '<a href="http://cnn.com">CNN</a>'. No other value is passed to the function since a closing tag contains only element names. Typically, the function will return an empty string or a full closing tag (like '</a>').
1236 1236
1237 This is a *powerful functionality* that can be exploited for various objectives: consolidate-and-convert inline 'style' attributes to 'class', convert 'embed' elements to 'object', permit only one 'caption' element in a 'table' element, disallow embedding of certain types of media, *inject HTML*, use CSSTidy:- http://csstidy.sourceforge.net to sanitize 'style' attribute values, etc. 1237 This is a *powerful functionality* that can be exploited for various objectives: consolidate-and-convert inline 'style' attributes to 'class', convert 'embed' elements to 'object', permit only one 'caption' element in a 'table' element, disallow embedding of certain types of media, *inject HTML*, use CSSTidy:- http://csstidy.sourceforge.net to sanitize 'style' attribute values, etc.
1238 1238
1239 As an example, the custom hook code below can be used to force a series of specifically ordered 'id' attributes on all elements, and a specific 'param' element inside all 'object' elements: 1239 As an example, the custom hook code below can be used to force a series of specifically ordered 'id' attributes on all elements, and a specific 'param' element inside all 'object' elements:
1240 1240
1241 function my_tag_function($element, $attribute_array=0){ 1241 function my_tag_function($element, $attribute_array=0){
1242 1242
1243 // If second argument is not received, it means a closing tag is being handled 1243 // If second argument is not received, it means a closing tag is being handled
1244 if(is_numeric($attribute_array)){ 1244 if(is_numeric($attribute_array)){
1245 return "</$element>"; 1245 return "</$element>";
1246 } 1246 }
1247 1247
1248 static $id = 0; 1248 static $id = 0;
1249 // Remove any duplicate element 1249 // Remove any duplicate element
1250 if($element == 'param' && isset($attribute_array['allowscriptaccess'])){ 1250 if($element == 'param' && isset($attribute_array['allowscriptaccess'])){
1251 return ''; 1251 return '';
1252 } 1252 }
1253 1253
1254 $new_element = ''; 1254 $new_element = '';
1255 1255
1256 // Force a serialized ID number 1256 // Force a serialized ID number
1257 $attribute_array['id'] = 'my_'. $id; 1257 $attribute_array['id'] = 'my_'. $id;
1258 ++$id; 1258 ++$id;
1259 1259
1260 // Inject param for allowscriptaccess 1260 // Inject param for allowscriptaccess
1261 if($element == 'object'){ 1261 if($element == 'object'){
1262 $new_element = '<param id="my_'. $id. '" allowscriptaccess="never" />'; 1262 $new_element = '<param id="my_'. $id. '" allowscriptaccess="never" />';
1263 ++$id; 1263 ++$id;
1264 } 1264 }
1265 1265
1266 $string = ''; 1266 $string = '';
1267 foreach($attribute_array as $k=>$v){ 1267 foreach($attribute_array as $k=>$v){
1268 $string .= " {$k}=\"{$v}\""; 1268 $string .= " {$k}=\"{$v}\"";
1269 } 1269 }
1270 1270
1271 static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1); 1271 static $empty_elements = array('area'=>1, 'br'=>1, 'col'=>1, 'command'=>1, 'embed'=>1, 'hr'=>1, 'img'=>1, 'input'=>1, 'isindex'=>1, 'keygen'=>1, 'link'=>1, 'meta'=>1, 'param'=>1, 'source'=>1, 'track'=>1, 'wbr'=>1);
1272 1272
1273 return "<{$element}{$string}". (array_key_exists($element, $empty_elements) ? ' /' : ''). '>'. $new_element; 1273 return "<{$element}{$string}". (array_key_exists($element, $empty_elements) ? ' /' : ''). '>'. $new_element;
1274 } 1274 }
1275 1275
1276 The 'hook_tag' parameter is different from the 'hook' parameter (section:- #3.7). 1276 The 'hook_tag' parameter is different from the 'hook' parameter (section:- #3.7).
1277 1277
1278 Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website. 1278 Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website.
1279 1279
1280 1280
1281-- 3.5 Simple configuration directive for most valid XHTML --------o 1281-- 3.5 Simple configuration directive for most valid XHTML --------o
1282 1282
1283 1283
1284 If '$config["valid_xhtml"]' is set to '1', some relevant '$config' parameters (indicated by '~' in section:- #2.2) are auto-adjusted. This allows one to pass the '$config' argument with a simpler value. If a value for a parameter auto-set through 'valid_xhtml' is still manually provided, then that value will over-ride the auto-set value. 1284 If '$config["valid_xhtml"]' is set to '1', some relevant '$config' parameters (indicated by '~' in section:- #2.2) are auto-adjusted. This allows one to pass the '$config' argument with a simpler value. If a value for a parameter auto-set through 'valid_xhtml' is still manually provided, then that value will over-ride the auto-set value.
1285 1285
1286 1286
1287-- 3.6 Simple configuration directive for most `safe` HTML --------o 1287-- 3.6 Simple configuration directive for most `safe` HTML --------o
1288 1288
1289 1289
1290 `Safe` HTML refers to HTML that is restricted to reduce the vulnerability for scripting attacks (such as XSS) based on HTML code which otherwise may still be legal and compliant with the HTML standard specifications. When elements such as 'script' and 'object', and attributes such as 'onmouseover' and 'style' are allowed in the input text, an input writer can introduce malevolent HTML code. Note that what is considered 'safe' depends on the nature of the web application and the trust-level accorded to its users. 1290 `Safe` HTML refers to HTML that is restricted to reduce the vulnerability for scripting attacks (such as XSS) based on HTML code which otherwise may still be legal and compliant with the HTML standard specifications. When elements such as 'script' and 'object', and attributes such as 'onmouseover' and 'style' are allowed in the input text, an input writer can introduce malevolent HTML code. Note that what is considered 'safe' depends on the nature of the web application and the trust-level accorded to its users.
1291 1291
1292 htmLawed allows an admin to use '$config["safe"]' to auto-adjust multiple '$config' parameters (such as 'elements' which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by '"' in section:- #2.2. Thus, one can pass the '$config' argument with a simpler value. Having the 'safe' parameter set to '1' is equivalent to setting the following '$config' parameters to the noted values: 1292 htmLawed allows an admin to use '$config["safe"]' to auto-adjust multiple '$config' parameters (such as 'elements' which declares the allowed element-set), which otherwise would have to be manually set. The relevant parameters are indicated by '"' in section:- #2.2. Thus, one can pass the '$config' argument with a simpler value. Having the 'safe' parameter set to '1' is equivalent to setting the following '$config' parameters to the noted values:
1293 1293
1294 cdata - 0 1294 cdata - 0
1295 comment - 0 1295 comment - 0
1296 deny_attribute - on* 1296 deny_attribute - on*
1297 elements - * -applet -audio -canvas -embed -iframe -object -script -video 1297 elements - * -applet -audio -canvas -embed -iframe -object -script -video
1298 schemes - href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style: !; *:file, http, https 1298 schemes - href: aim, feed, file, ftp, gopher, http, https, irc, mailto, news, nntp, sftp, ssh, tel, telnet; style: !; *:file, http, https
1299 1299
1300 With 'safe' set to '1', htmLawed considers 'CDATA' sections and HTML comments as plain text, and prohibits the 'applet', 'audio', 'canvas', 'embed', 'iframe', 'object', 'script' and 'video' elements, and the 'on*' attributes like 'onclick'. (There are '$config' parameters like 'css_expression' that are not affected by the value set for 'safe' but whose default values still contribute towards a more `safe` output.) Further, unless overridden by the value for parameter 'schemes' (see section:- #3.4.3), the schemes 'app', 'data' and 'javascript' are not permitted, and URLs with schemes are neutralized so that, e.g., 'style="moz-binding:url(http://danger)"' becomes 'style="moz-binding:url(denied:http://danger)"'. 1300 With 'safe' set to '1', htmLawed considers 'CDATA' sections and HTML comments as plain text, and prohibits the 'applet', 'audio', 'canvas', 'embed', 'iframe', 'object', 'script' and 'video' elements, and the 'on*' attributes like 'onclick'. (There are '$config' parameters like 'css_expression' that are not affected by the value set for 'safe' but whose default values still contribute towards a more `safe` output.) Further, unless overridden by the value for parameter 'schemes' (see section:- #3.4.3), the schemes 'app', 'data' and 'javascript' are not permitted, and URLs with schemes are neutralized so that, e.g., 'style="moz-binding:url(http://danger)"' becomes 'style="moz-binding:url(denied:http://danger)"'.
1301 1301
1302 Admins, however, may still want to completely deny the 'style' attribute, e.g., with code like 1302 Admins, however, may still want to completely deny the 'style' attribute, e.g., with code like
1303 1303
1304 $processed = htmLawed($text, array('safe'=>1, 'deny_attribute'=>'style')); 1304 $processed = htmLawed($text, array('safe'=>1, 'deny_attribute'=>'style'));
1305 1305
1306 Permitting the 'style' attribute brings in risks of `click-jacking`, etc. CSS property values can render a page non-functional or be used to deface it. Except for URLs, dynamic expressions, and some other things, htmLawed does not completely check 'style' values. It does provide ways for the code-developer implementing htmLawed to do such checks through the '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing style completely and relying on CSS classes and stylesheet files is recommended. 1306 Permitting the 'style' attribute brings in risks of `click-jacking`, etc. CSS property values can render a page non-functional or be used to deface it. Except for URLs, dynamic expressions, and some other things, htmLawed does not completely check 'style' values. It does provide ways for the code-developer implementing htmLawed to do such checks through the '$spec' argument, and through the 'hook_tag' parameter (see section:- #3.4.8 for more). Disallowing style completely and relying on CSS classes and stylesheet files is recommended.
1307 1307
1308 If a value for a parameter auto-set through 'safe' is still manually provided, then that value can over-ride the auto-set value. E.g., with '$config["safe"] = 1' and '$config["elements"] = "* +script"', 'script', but not 'applet', is allowed. Such over-ride does not occur for 'deny_attribute' (for legacy reasons) when comma-separated attribute names are provided as the value for this parameter (section:- #3.4); instead htmLawed will add 'on*' to the value provided for 'deny_attribute'. 1308 If a value for a parameter auto-set through 'safe' is still manually provided, then that value can over-ride the auto-set value. E.g., with '$config["safe"] = 1' and '$config["elements"] = "* +script"', 'script', but not 'applet', is allowed. Such over-ride does not occur for 'deny_attribute' (for legacy reasons) when comma-separated attribute names are provided as the value for this parameter (section:- #3.4); instead htmLawed will add 'on*' to the value provided for 'deny_attribute'.
1309 1309
1310 A page illustrating the efficacy of htmLawed's anti-XSS abilities with 'safe' set to '1' against XSS vectors listed by RSnake:- http://ha.ckers.org/xss.html may be available here:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm. 1310 A page illustrating the efficacy of htmLawed's anti-XSS abilities with 'safe' set to '1' against XSS vectors listed by RSnake:- http://ha.ckers.org/xss.html may be available here:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed/rsnake/RSnakeXSSTest.htm.
1311 1311
1312 1312
1313-- 3.7 Using a hook function --------------------------------------o 1313-- 3.7 Using a hook function --------------------------------------o
1314 1314
1315 1315
1316 If '$config["hook"]' is not set to '0', then htmLawed will allow preliminarily processed input to be altered by a hook function named by '$config["hook"]' before starting the main work (but after handling of characters, entities, HTML comments and 'CDATA' sections -- see code for function 'htmLawed()'). 1316 If '$config["hook"]' is not set to '0', then htmLawed will allow preliminarily processed input to be altered by a hook function named by '$config["hook"]' before starting the main work (but after handling of characters, entities, HTML comments and 'CDATA' sections -- see code for function 'htmLawed()').
1317 1317
1318 The hook function also allows one to alter the `finalized` values of '$config' and '$spec'. 1318 The hook function also allows one to alter the `finalized` values of '$config' and '$spec'.
1319 1319
1320 Note that the 'hook' parameter is different from the 'hook_tag' parameter (section:- #3.4.9). 1320 Note that the 'hook' parameter is different from the 'hook_tag' parameter (section:- #3.4.9).
1321 1321
1322 Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website. 1322 Snippets of hook function code developed by others may be available on the htmLawed:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed website.
1323 1323
1324 1324
1325-- 3.8 Obtaining `finalized` parameter values ---------------------o 1325-- 3.8 Obtaining `finalized` parameter values ---------------------o
1326 1326
1327 1327
1328 htmLawed can assign the `finalized` '$config' and '$spec' values to a variable named by '$config["show_setting"]'. The variable, made global by htmLawed, is set as an array with three keys: 'config', with the '$config' value, 'spec', with the '$spec' value, and 'time', with a value that is the Unix time (the output of PHP's 'microtime()' function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code. 1328 htmLawed can assign the `finalized` '$config' and '$spec' values to a variable named by '$config["show_setting"]'. The variable, made global by htmLawed, is set as an array with three keys: 'config', with the '$config' value, 'spec', with the '$spec' value, and 'time', with a value that is the Unix time (the output of PHP's 'microtime()' function) when the value was assigned. Admins should use a PHP-compliant variable name (e.g., one that does not begin with a numerical digit) that does not conflict with variable names in their non-htmLawed code.
1329 1329
1330 The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers. 1330 The values, which are also post-hook function (if any), can be used to auto-generate information (on, e.g., the elements that are permitted) for input writers.
1331 1331
1332 1332
1333-- 3.9 Retaining non-HTML tags in input with mixed markup ---------o 1333-- 3.9 Retaining non-HTML tags in input with mixed markup ---------o
1334 1334
1335 1335
1336 htmLawed does not remove certain characters that, though valid, are nevertheless `discouraged` in HTML documents as per the specifications (see section:- #5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the '<', '>' and '&' characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code). 1336 htmLawed does not remove certain characters that, though valid, are nevertheless `discouraged` in HTML documents as per the specifications (see section:- #5.1). This can be utilized to deal with input that contains mixed markup. Input that may have HTML markup as well as some other markup that is based on the '<', '>' and '&' characters is considered to have mixed markup. The non-HTML markup can be rather proprietary (like markup for emoticons/smileys), or standard (like MathML or SVG). Or it can be programming code meant for execution/evaluation (such as embedded PHP code).
1337 1337
1338 To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the '<', '>' and '&' characters with some of the HTML-discouraged characters (see section:- #3.1.2). Post-htmLawed processing, the replacements are reverted. 1338 To deal with such mixed markup, the input text can be pre-processed to hide the non-HTML markup by specifically replacing the '<', '>' and '&' characters with some of the HTML-discouraged characters (see section:- #3.1.2). Post-htmLawed processing, the replacements are reverted.
1339 1339
1340 An example (mixed HTML and PHP code in input text): 1340 An example (mixed HTML and PHP code in input text):
1341 1341
1342 $text = preg_replace('`<\?php(.+?)\?>`sm', "\x83?php\\1?\x84", $text); 1342 $text = preg_replace('`<\?php(.+?)\?>`sm', "\x83?php\\1?\x84", $text);
1343 $processed = htmLawed($text); 1343 $processed = htmLawed($text);
1344 $processed = preg_replace('`\x83\?php(.+?)\?\x84`sm', '<?php$1?>', $processed); 1344 $processed = preg_replace('`\x83\?php(.+?)\?\x84`sm', '<?php$1?>', $processed);
1345 1345
1346 This code will not work if '$config["clean_ms_char"]' is set to '1' (section:- #3.1), in which case one should instead deploy a hook function (section:- #3.7). (htmLawed internally uses certain control characters, code-points '1' to '7', and use of these characters as markers in the logic of hook functions may cause issues.) 1346 This code will not work if '$config["clean_ms_char"]' is set to '1' (section:- #3.1), in which case one should instead deploy a hook function (section:- #3.7). (htmLawed internally uses certain control characters, code-points '1' to '7', and use of these characters as markers in the logic of hook functions may cause issues.)
1347 1347
1348 Admins may also be able to use '$config["and_mark"]' to deal with such mixed markup; see section:- #3.2. 1348 Admins may also be able to use '$config["and_mark"]' to deal with such mixed markup; see section:- #3.2.
1349 1349
1350 1350
1351== 4 Other =======================================================oo 1351== 4 Other =======================================================oo
1352 1352
1353 1353
1354-- 4.1 Support ----------------------------------------------------- 1354-- 4.1 Support -----------------------------------------------------
1355 1355
1356 1356
1357 Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net. 1357 Software updates and forum-based community-support may be found at http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. For general PHP issues (not htmLawed-specific), support may be found through internet searches and at http://php.net.
1358 1358
1359 1359
1360-- 4.2 Known issues -----------------------------------------------o 1360-- 4.2 Known issues -----------------------------------------------o
1361 1361
1362 1362
1363 See section:- #2.8. 1363 See section:- #2.8.
1364 1364
1365 1365
1366-- 4.3 Change-log -------------------------------------------------o 1366-- 4.3 Change-log -------------------------------------------------o
1367 1367
1368 1368
1369 (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the 'htmLawed.php' file, may be updated without a change-log entry if the secondary files, but not htmLawed per se, are revised.) 1369 (The release date for the downloadable package of files containing documentation, demo script, test-cases, etc., besides the 'htmLawed.php' file, may be updated without a change-log entry if the secondary files, but not htmLawed per se, are revised.)
1370 1370
1371 `Version number - Release date. Notes` 1371 `Version number - Release date. Notes`
1372 1372
1373 1.2.5 - 24 September 2019. Fixes two bugs in 'font' tag transformation 1373 1.2.5 - 24 September 2019. Fixes two bugs in 'font' tag transformation
1374 1374
1375 1.2.4.2 - 16 May 2019. Corrects a PHP notice if a semi-colon is present in '$config["schemes"]' 1375 1.2.4.2 - 16 May 2019. Corrects a PHP notice if a semi-colon is present in '$config["schemes"]'
1376 1376
1377 1.2.4.1 - 12 September 2017. Corrects a function re-declaration bug introduced in version 1.2.4 1377 1.2.4.1 - 12 September 2017. Corrects a function re-declaration bug introduced in version 1.2.4
1378 1378
1379 1.2.4 - 31 August 2017. Removes use of PHP 'create_function' function and '$php_errormsg' reserved variable (deprecated in PHP 7.2) 1379 1.2.4 - 31 August 2017. Removes use of PHP 'create_function' function and '$php_errormsg' reserved variable (deprecated in PHP 7.2)
1380 1380
1381 1.2.3 - 5 July 2017. New option value of '4' for '$config["comments"]' to stop enforcing a space character before the '-->' comment-closing marker 1381 1.2.3 - 5 July 2017. New option value of '4' for '$config["comments"]' to stop enforcing a space character before the '-->' comment-closing marker
1382 1382
1383 1.2.2 - 25 May 2017. Fix for a bug in parsing '$spec' that got introduced in version 1.2; also, '$spec' is now parsed to accommodate specifications for an HTML element when they are specified in multiple rules 1383 1.2.2 - 25 May 2017. Fix for a bug in parsing '$spec' that got introduced in version 1.2; also, '$spec' is now parsed to accommodate specifications for an HTML element when they are specified in multiple rules
1384 1384
1385 1.2.1.1 - 17 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes 1385 1.2.1.1 - 17 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes
1386 1386
1387 1.2.1 - 15 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes 1387 1.2.1 - 15 May 2017. Fix for a potential security vulnerability in transformation of deprecated attributes
1388 1388
1389 1.2 - 11 February 2017. (First beta release on 26 May 2013). Added support for HTML version 5; ARIA, data-* and microdata attributes; 'app', 'data', 'javascript' and 'tel' URL schemes (thus, 'javascript:' is not filtered in default mode). Removed support for code using Kses functions (see section:- #2.6). Changes in revisions to the beta releases are not noted here. 1389 1.2 - 11 February 2017. (First beta release on 26 May 2013). Added support for HTML version 5; ARIA, data-* and microdata attributes; 'app', 'data', 'javascript' and 'tel' URL schemes (thus, 'javascript:' is not filtered in default mode). Removed support for code using Kses functions (see section:- #2.6). Changes in revisions to the beta releases are not noted here.
1390 1390
1391 1.1.22 - 5 March 2016. Improved testing of attribute value rules specified in '$spec' 1391 1.1.22 - 5 March 2016. Improved testing of attribute value rules specified in '$spec'
1392 1392
1393 1.1.21 - 27 February 2016. Improvement and security fix in transforming 'font' element 1393 1.1.21 - 27 February 2016. Improvement and security fix in transforming 'font' element
1394 1394
1395 1.1.20 - 9 June 2015. Fix for a potential security vulnerability arising from unescaped double-quote character in single-quoted attribute value of some deprecated elements when tag transformation is enabled; recognition for non-(HTML 4) standard 'allowfullscreen' attribute of 'iframe' 1395 1.1.20 - 9 June 2015. Fix for a potential security vulnerability arising from unescaped double-quote character in single-quoted attribute value of some deprecated elements when tag transformation is enabled; recognition for non-(HTML 4) standard 'allowfullscreen' attribute of 'iframe'
1396 1396
1397 1.1.19 - 19 January 2015. Fix for a bug in cleaning of soft-hyphens in URL values, etc 1397 1.1.19 - 19 January 2015. Fix for a bug in cleaning of soft-hyphens in URL values, etc
1398 1398
1399 1.1.18 - 2 August 2014. Fix for a potential security vulnerability arising from specially encoded text with serial opening tags 1399 1.1.18 - 2 August 2014. Fix for a potential security vulnerability arising from specially encoded text with serial opening tags
1400 1400
1401 1.1.17 - 11 March 2014. Removed use of PHP function preg_replace with 'e' modifier for compatibility with PHP 5.5. 1401 1.1.17 - 11 March 2014. Removed use of PHP function preg_replace with 'e' modifier for compatibility with PHP 5.5.
1402 1402
1403 1.1.16 - 29 August 2013. Fix for a potential security vulnerability arising from specially encoded space characters in URL schemes/protocols 1403 1.1.16 - 29 August 2013. Fix for a potential security vulnerability arising from specially encoded space characters in URL schemes/protocols
1404 1404
1405 1.1.15 - 11 August 2013. Improved tidying/prettifying functionality 1405 1.1.15 - 11 August 2013. Improved tidying/prettifying functionality
1406 1406
1407 1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during 'tidying' when 'balance' is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like 'blockquote' 1407 1.1.14 - 8 August 2012. Fix for possible segmental loss of incremental indentation during 'tidying' when 'balance' is disabled; fix for non-effectuation under some circumstances of a corrective behavior to preserve plain text within elements like 'blockquote'
1408 1408
1409 1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes 1409 1.1.13 - 22 July 2012. Added feature allowing use of custom, non-standard attributes or custom rules for standard attributes
1410 1410
1411 1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the 'face' attribute 1411 1.1.12 - 5 July 2012. Fix for a bug in identifying an unquoted value of the 'face' attribute
1412 1412
1413 1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload environment. '$config["hook_tag"]', if specified, now receives names of elements in closing tags. 1413 1.1.11 - 5 June 2012. Fix for possible problem with handling of multi-byte characters in attribute values in an mbstring.func_overload environment. '$config["hook_tag"]', if specified, now receives names of elements in closing tags.
1414 1414
1415 1.1.10 - 22 October 2011. Fix for a bug in the 'tidy' functionality that caused the entire input to be replaced with a single space; new parameter, '$config["direct_list_nest"]' to allow direct descendance of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.) 1415 1.1.10 - 22 October 2011. Fix for a bug in the 'tidy' functionality that caused the entire input to be replaced with a single space; new parameter, '$config["direct_list_nest"]' to allow direct descendance of a list in a list. (5 April 2012. Dual licensing from LGPLv3 to LGPLv3 and GPLv2+.)
1416 1416
1417 1.1.9.5 - 6 July 2011. Minor correction of a rule for nesting of 'li' within 'dir' 1417 1.1.9.5 - 6 July 2011. Minor correction of a rule for nesting of 'li' within 'dir'
1418 1418
1419 1.1.9.4 - 3 July 2010. Parameter 'schemes' now accepts '!' so any URL, even a local one, can be `denied`. An issue in which a second URL value in 'style' properties was not checked was fixed. 1419 1.1.9.4 - 3 July 2010. Parameter 'schemes' now accepts '!' so any URL, even a local one, can be `denied`. An issue in which a second URL value in 'style' properties was not checked was fixed.
1420 1420
1421 1.1.9.3 - 17 May 2010. Checks for correct nesting of 'param' 1421 1.1.9.3 - 17 May 2010. Checks for correct nesting of 'param'
1422 1422
1423 1.1.9.2 - 26 April 2010. Minor fix regarding rendering of denied URL schemes 1423 1.1.9.2 - 26 April 2010. Minor fix regarding rendering of denied URL schemes
1424 1424
1425 1.1.9.1 - 26 February 2010. htmLawed now uses the LGPL version 3 license; support for 'flashvars' attribute for 'embed' 1425 1.1.9.1 - 26 February 2010. htmLawed now uses the LGPL version 3 license; support for 'flashvars' attribute for 'embed'
1426 1426
1427 1.1.9 - 22 December 2009. Soft-hyphens are now removed only from URL-accepting attribute values 1427 1.1.9 - 22 December 2009. Soft-hyphens are now removed only from URL-accepting attribute values
1428 1428
1429 1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice 1429 1.1.8.1 - 16 July 2009. Minor code-change to fix a PHP error notice
1430 1430
1431 1.1.8 - 23 April 2009. Parameter 'deny_attribute' now accepts the wild-card '*', making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting '$spec' 1431 1.1.8 - 23 April 2009. Parameter 'deny_attribute' now accepts the wild-card '*', making it simpler to specify its value when all but a few attributes are being denied; fixed a bug in interpreting '$spec'
1432 1432
1433 1.1.7 - 11-12 March 2009. Attributes globally denied through 'deny_attribute' can be allowed element-specifically through '$spec'; '$config["style_pass"]', which lets any 'style' value through, introduced; altered logic to catch certain types of dynamically crafted CSS expressions 1433 1.1.7 - 11-12 March 2009. Attributes globally denied through 'deny_attribute' can be allowed element-specifically through '$spec'; '$config["style_pass"]', which lets any 'style' value through, introduced; altered logic to catch certain types of dynamically crafted CSS expressions
1434 1434
1435 1.1.3-6 - 28-31 January - 4 February 2009. Altered logic to catch certain types of dynamically crafted CSS expressions 1435 1.1.3-6 - 28-31 January - 4 February 2009. Altered logic to catch certain types of dynamically crafted CSS expressions
1436 1436
1437 1.1.2 - 22 January 2009. Fixed bug in parsing of 'font' attributes during tag transformation 1437 1.1.2 - 22 January 2009. Fixed bug in parsing of 'font' attributes during tag transformation
1438 1438
1439 1.1.1 - 27 September 2008. Better nesting correction when omittable closing tags are absent 1439 1.1.1 - 27 September 2008. Better nesting correction when omittable closing tags are absent
1440 1440
1441 1.1 - 29 June 2008. '$config["hook_tag"]' and '$config["tidy"]' introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug 1441 1.1 - 29 June 2008. '$config["hook_tag"]' and '$config["tidy"]' introduced for custom tag/attribute check/modification/injection and output compaction/beautification; fixed a regex-in-$spec parsing bug
1442 1442
1443 1.0.9 - 11 June 2008. Fix for a bug in checks for invalid HTML code-point entities 1443 1.0.9 - 11 June 2008. Fix for a bug in checks for invalid HTML code-point entities
1444 1444
1445 1.0.8 - 15 May 2008. Permit 'bordercolor' attribute for 'table', 'td' and 'tr' 1445 1.0.8 - 15 May 2008. Permit 'bordercolor' attribute for 'table', 'td' and 'tr'
1446 1446
1447 1.0.7 - 1 May 2008. Support for 'wmode' attribute for 'embed'; '$config["show_setting"]' introduced; improved '$config["elements"]' evaluation 1447 1.0.7 - 1 May 2008. Support for 'wmode' attribute for 'embed'; '$config["show_setting"]' introduced; improved '$config["elements"]' evaluation
1448 1448
1449 1.0.6 - 20 April 2008. '$config["and_mark"]' introduced 1449 1.0.6 - 20 April 2008. '$config["and_mark"]' introduced
1450 1450
1451 1.0.5 - 12 March 2008. 'style' URL schemes essentially disallowed when $config 'safe' is on; improved regex for CSS expression search 1451 1.0.5 - 12 March 2008. 'style' URL schemes essentially disallowed when $config 'safe' is on; improved regex for CSS expression search
1452 1452
1453 1.0.4 - 10 March 2008. Improved corrections for 'blockquote', 'form', 'map' and 'noscript' 1453 1.0.4 - 10 March 2008. Improved corrections for 'blockquote', 'form', 'map' and 'noscript'
1454 1454
1455 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); fix for a bug allowing 'td' directly inside 'table'; '$config["safe"]' introduced 1455 1.0.3 - 3 March 2008. Character entities for soft-hyphens are now replaced with spaces (instead of being removed); fix for a bug allowing 'td' directly inside 'table'; '$config["safe"]' introduced
1456 1456
1457 1.0.2 - 13 February 2008. Improved implementation of '$config["keep_bad"]' 1457 1.0.2 - 13 February 2008. Improved implementation of '$config["keep_bad"]'
1458 1458
1459 1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions ('hl_tag()' and 'hl_prot()'); no error display with 'hl_regex()' 1459 1.0.1 - 7 November 2007. Improved regex for identifying URLs, protocols and dynamic expressions ('hl_tag()' and 'hl_prot()'); no error display with 'hl_regex()'
1460 1460
1461 1.0 - 2 November 2007. First release 1461 1.0 - 2 November 2007. First release
1462 1462
1463 1463
1464-- 4.4 Testing ----------------------------------------------------o 1464-- 4.4 Testing ----------------------------------------------------o
1465 1465
1466 1466
1467 To test htmLawed using a form interface, a demo:- htmLawedTest.php web-page is provided with the htmLawed distribution ('htmLawed.php' and 'htmLawedTest.php' should be in the same directory on the web-server). A file with test-cases:- htmLawed_TESTCASE.txt is also provided. 1467 To test htmLawed using a form interface, a demo:- htmLawedTest.php web-page is provided with the htmLawed distribution ('htmLawed.php' and 'htmLawedTest.php' should be in the same directory on the web-server). A file with test-cases:- htmLawed_TESTCASE.txt is also provided.
1468 1468
1469 1469
1470-- 4.5 Upgrade, & old versions ------------------------------------o 1470-- 4.5 Upgrade, & old versions ------------------------------------o
1471 1471
1472 1472
1473 Upgrading is as simple as replacing the previous version of 'htmLawed.php', assuming the file was not modified for customized features. As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content. 1473 Upgrading is as simple as replacing the previous version of 'htmLawed.php', assuming the file was not modified for customized features. As htmLawed output is almost always used in static documents, upgrading should not affect old, finalized content.
1474 1474
1475 *Note:* The following upgrades may affect the functionality of a specific htmLawed installation: 1475 *Note:* The following upgrades may affect the functionality of a specific htmLawed installation:
1476 1476
1477 (1) From version 1.1-1.1.10 to 1.1.11 or later, if a 'hook_tag' function is in use: In version 1.1.11 and later, elements in closing tags (and not just the opening tags) are also passed to the function. There are no attribute names/values to pass, so a 'hook_tag' function receives only the element name. The 'hook_tag' function therefore may have to be edited. See section:- #3.4.9. 1477 (1) From version 1.1-1.1.10 to 1.1.11 or later, if a 'hook_tag' function is in use: In version 1.1.11 and later, elements in closing tags (and not just the opening tags) are also passed to the function. There are no attribute names/values to pass, so a 'hook_tag' function receives only the element name. The 'hook_tag' function therefore may have to be edited. See section:- #3.4.9.
1478 1478
1479 (2) From version older than 1.2.beta to later, if htmLawed was used as Kses replacement with Kses code in use: In version 1.2.beta or later, htmLawed no longer provides direct support for code that uses Kses functions (see section:- #2.6). 1479 (2) From version older than 1.2.beta to later, if htmLawed was used as Kses replacement with Kses code in use: In version 1.2.beta or later, htmLawed no longer provides direct support for code that uses Kses functions (see section:- #2.6).
1480 1480
1481 (3) From version older than 1.2 to later, if htmLawed is used without '$config["safe"]' set to 1: Unlike previous versions, htmLawed version 1.2 and later permit 'data' and 'javascript' URL schemes by default (see section:- #3.4.3). 1481 (3) From version older than 1.2 to later, if htmLawed is used without '$config["safe"]' set to 1: Unlike previous versions, htmLawed version 1.2 and later permit 'data' and 'javascript' URL schemes by default (see section:- #3.4.3).
1482 1482
1483 Old versions of htmLawed may be available online. E.g., for version 1.0, check http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip; for 1.1.1, http://www.bioinformatics.org/phplabware/downloads/htmLawed111.zip; and for 1.1.22, http://www.bioinformatics.org/phplabware/downloads/htmLawed1122.zip. 1483 Old versions of htmLawed may be available online. E.g., for version 1.0, check http://www.bioinformatics.org/phplabware/downloads/htmLawed1.zip; for 1.1.1, http://www.bioinformatics.org/phplabware/downloads/htmLawed111.zip; and for 1.1.22, http://www.bioinformatics.org/phplabware/downloads/htmLawed1122.zip.
1484 1484
1485 1485
1486-- 4.6 Comparison with 'HTMLPurifier' -----------------------------o 1486-- 4.6 Comparison with 'HTMLPurifier' -----------------------------o
1487 1487
1488 1488
1489 The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of year 2015): 1489 The HTMLPurifier PHP library by Edward Yang is a very good HTML filtering script that uses object oriented PHP code. Compared to htmLawed, it (as of year 2015):
1490 1490
1491 * does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2) 1491 * does not support PHP versions older than 5.0 (HTMLPurifier dropped PHP 4 support after version 2)
1492 1492
1493 * is 15-20 times bigger (scores of files totalling more than 750 kb) 1493 * is 15-20 times bigger (scores of files totalling more than 750 kb)
1494 1494
1495 * consumes 10-15 times more RAM memory (just including the HTMLPurifier files without calling the filter requires a few MBs of memory) 1495 * consumes 10-15 times more RAM memory (just including the HTMLPurifier files without calling the filter requires a few MBs of memory)
1496 1496
1497 * is expectedly slower 1497 * is expectedly slower
1498 1498
1499 * lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification) 1499 * lacks many of the extra features of htmLawed (like entity conversions and code compaction/beautification)
1500 1500
1501 * has poor documentation 1501 * has poor documentation
1502 1502
1503 However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier website:- http://htmlpurifier.org for updated information. 1503 However, HTMLPurifier has finer checks for character encodings and attribute values, and can log warnings and errors. Visit the HTMLPurifier website:- http://htmlpurifier.org for updated information.
1504 1504
1505 1505
1506-- 4.7 Use through application plug-ins/modules -------------------o 1506-- 4.7 Use through application plug-ins/modules -------------------o
1507 1507
1508 1508
1509 Plug-ins/modules to implement htmLawed in applications such as Drupal may have been developed. Check the application websites and the htmLawed forum:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. 1509 Plug-ins/modules to implement htmLawed in applications such as Drupal may have been developed. Check the application websites and the htmLawed forum:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed.
1510 1510
1511 1511
1512-- 4.8 Use in non-PHP applications --------------------------------o 1512-- 4.8 Use in non-PHP applications --------------------------------o
1513 1513
1514 1514
1515 Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed site:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed. 1515 Non-PHP applications written in Python, Ruby, etc., may be able to use htmLawed through system calls to the PHP engine. Such code may have been documented on the internet. Also check the forum on the htmLawed site:- http://www.bioinformatics.org/phplabware/internal_utilities/htmLawed.
1516 1516
1517 1517
1518-- 4.9 Donate -----------------------------------------------------o 1518-- 4.9 Donate -----------------------------------------------------o
1519 1519
1520 1520
1521 A donation in any currency and amount to appreciate or support this software can be sent by PayPal:- http://paypal.com to this email address: drpatnaik at yahoo dot com. 1521 A donation in any currency and amount to appreciate or support this software can be sent by PayPal:- http://paypal.com to this email address: drpatnaik at yahoo dot com.
1522 1522
1523 1523
1524-- 4.10 Acknowledgements ------------------------------------------o 1524-- 4.10 Acknowledgements ------------------------------------------o
1525 1525
1526 1526
1527 Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Alexandre Chouinard, Ulf Harnhammer, Gareth Heyes, Hakre, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Psych0tr1a, Lincoln Russell, Tomas Sykorka, Harro Verton, Edward Yang, and many anonymous users. 1527 Nicholas Alipaz, Bryan Blakey, Pádraic Brady, Dac Chartrand, Alexandre Chouinard, Ulf Harnhammer, Gareth Heyes, Hakre, Klaus Leithoff, Lukasz Pilorz, Shelley Powers, Psych0tr1a, Lincoln Russell, Tomas Sykorka, Harro Verton, Edward Yang, and many anonymous users.
1528 1528
1529 Thank you! 1529 Thank you!
1530 1530
1531 1531
1532== 5 Appendices ==================================================oo 1532== 5 Appendices ==================================================oo
1533 1533
1534 1534
1535-- 5.1 Characters discouraged in XHTML ----------------------------- 1535-- 5.1 Characters discouraged in XHTML -----------------------------
1536 1536
1537 1537
1538 Characters represented by the following hexadecimal code-points are `not` invalid, even though some validators may issue messages stating otherwise. 1538 Characters represented by the following hexadecimal code-points are `not` invalid, even though some validators may issue messages stating otherwise.
1539 1539
1540 '7f' to '84', '86' to '9f', 'fdd0' to 'fddf', '1fffe', '1ffff', '2fffe', '2ffff', '3fffe', '3ffff', '4fffe', '4ffff', '5fffe', '5ffff', '6fffe', '6ffff', '7fffe', '7ffff', '8fffe', '8ffff', '9fffe', '9ffff', 'afffe', 'affff', 'bfffe', 'bffff', 'cfffe', 'cffff', 'dfffe', 'dffff', 'efffe', 'effff', 'ffffe', 'fffff', '10fffe' and '10ffff' 1540 '7f' to '84', '86' to '9f', 'fdd0' to 'fddf', '1fffe', '1ffff', '2fffe', '2ffff', '3fffe', '3ffff', '4fffe', '4ffff', '5fffe', '5ffff', '6fffe', '6ffff', '7fffe', '7ffff', '8fffe', '8ffff', '9fffe', '9ffff', 'afffe', 'affff', 'bfffe', 'bffff', 'cfffe', 'cffff', 'dfffe', 'dffff', 'efffe', 'effff', 'ffffe', 'fffff', '10fffe' and '10ffff'
1541 1541
1542 1542
1543-- 5.2 Valid attribute-element combinations -----------------------o 1543-- 5.2 Valid attribute-element combinations -----------------------o
1544 1544
1545 1545
1546 * includes deprecated attributes (marked '^'), attributes for microdata (marked '*'), the non-standard 'bordercolor', and new-in-HTML5 attributes (marked '~'); can have multiple comma-separated values (marked '%'); can have multiple space-separated values (marked '$') 1546 * includes deprecated attributes (marked '^'), attributes for microdata (marked '*'), the non-standard 'bordercolor', and new-in-HTML5 attributes (marked '~'); can have multiple comma-separated values (marked '%'); can have multiple space-separated values (marked '$')
1547 * only non-frameset, HTML body elements 1547 * only non-frameset, HTML body elements
1548 * 'name' for 'a' and 'map', and 'lang' are invalid in XHTML 1.1 1548 * 'name' for 'a' and 'map', and 'lang' are invalid in XHTML 1.1
1549 * 'target' is valid for 'a' in XHTML 1.1 and higher 1549 * 'target' is valid for 'a' in XHTML 1.1 and higher
1550 * 'xml:space' is only for XHTML 1.1 1550 * 'xml:space' is only for XHTML 1.1
1551 1551
1552 abbr - td, th 1552 abbr - td, th
1553 accept - form, input 1553 accept - form, input
1554 accept-charset - form 1554 accept-charset - form
1555 action - form 1555 action - form
1556 align - applet, caption^, col, colgroup, div^, embed, h1^, h2^, h3^, h4^, h5^, h6^, hr^, iframe, img^, input^, legend^, object^, p^, table^, tbody, td, tfoot, th, thead, tr 1556 align - applet, caption^, col, colgroup, div^, embed, h1^, h2^, h3^, h4^, h5^, h6^, hr^, iframe, img^, input^, legend^, object^, p^, table^, tbody, td, tfoot, th, thead, tr
1557 allowfullscreen - iframe 1557 allowfullscreen - iframe
1558 alt - applet, area, img, input 1558 alt - applet, area, img, input
1559 archive - applet, object 1559 archive - applet, object
1560 async~ - script 1560 async~ - script
1561 autocomplete~ - input 1561 autocomplete~ - input
1562 autofocus~ - button, input, keygen, select, textarea 1562 autofocus~ - button, input, keygen, select, textarea
1563 autoplay~ - audio, video 1563 autoplay~ - audio, video
1564 axis - td, th 1564 axis - td, th
1565 bgcolor - embed, table^, td^, th^, tr^ 1565 bgcolor - embed, table^, td^, th^, tr^
1566 border - img, object^, table 1566 border - img, object^, table
1567 bordercolor - table, td, tr 1567 bordercolor - table, td, tr
1568 cellpadding - table 1568 cellpadding - table
1569 cellspacing - table 1569 cellspacing - table
1570 challenge~ - keygen 1570 challenge~ - keygen
1571 char - col, colgroup, tbody, td, tfoot, th, thead, tr 1571 char - col, colgroup, tbody, td, tfoot, th, thead, tr
1572 charoff - col, colgroup, tbody, td, tfoot, th, thead, tr 1572 charoff - col, colgroup, tbody, td, tfoot, th, thead, tr
1573 charset - a, script 1573 charset - a, script
1574 checked - command, input 1574 checked - command, input
1575 cite - blockquote, del, ins, q 1575 cite - blockquote, del, ins, q
1576 classid - object 1576 classid - object
1577 clear - br^ 1577 clear - br^
1578 code - applet 1578 code - applet
1579 codebase - object, applet 1579 codebase - object, applet
1580 codetype - object 1580 codetype - object
1581 color - font 1581 color - font
1582 cols - textarea 1582 cols - textarea
1583 colspan - td, th 1583 colspan - td, th
1584 compact - dir, dl^, menu, ol^, ul^ 1584 compact - dir, dl^, menu, ol^, ul^
1585 content - meta 1585 content - meta
1586 controls~ - audio, video 1586 controls~ - audio, video
1587 coords - area, a 1587 coords - area, a
1588 crossorigin~ - img 1588 crossorigin~ - img
1589 data - object 1589 data - object
1590 datetime - del, ins, time 1590 datetime - del, ins, time
1591 declare - object 1591 declare - object
1592 default~ - track 1592 default~ - track
1593 defer - script 1593 defer - script
1594 dir - bdo 1594 dir - bdo
1595 dirname~ - input, textarea 1595 dirname~ - input, textarea
1596 disabled - button, command, fieldset, input, keygen, optgroup, option, select, textarea 1596 disabled - button, command, fieldset, input, keygen, optgroup, option, select, textarea
1597 download~ - a 1597 download~ - a
1598 enctype - form 1598 enctype - form
1599 face - font 1599 face - font
1600 flashvars** - embed 1600 flashvars** - embed
1601 for - label, output 1601 for - label, output
1602 form~ - button, fieldset, input, keygen, label, object, output, select, textarea 1602 form~ - button, fieldset, input, keygen, label, object, output, select, textarea
1603 formaction~ - button, input 1603 formaction~ - button, input
1604 formenctype~ - button, input 1604 formenctype~ - button, input
1605 formmethod~ - button, input 1605 formmethod~ - button, input
1606 formnovalidate~ - button, input 1606 formnovalidate~ - button, input
1607 formtarget~ - button, input 1607 formtarget~ - button, input
1608 frame - table 1608 frame - table
1609 frameborder - iframe 1609 frameborder - iframe
1610 headers - td, th 1610 headers - td, th
1611 height - applet, canvas, embed, iframe, img, input, object, td^, th^, video 1611 height - applet, canvas, embed, iframe, img, input, object, td^, th^, video
1612 high~ - meter 1612 high~ - meter
1613 href - a, area, link 1613 href - a, area, link
1614 hreflang - a, area, link 1614 hreflang - a, area, link
1615 hspace - applet, embed, img^, object^ 1615 hspace - applet, embed, img^, object^
1616 icon~ - command 1616 icon~ - command
1617 ismap - img, input 1617 ismap - img, input
1618 keytype~ - keygen 1618 keytype~ - keygen
1619 keyparams~ - keygen 1619 keyparams~ - keygen
1620 kind~ - track 1620 kind~ - track
1621 label - command, menu, option, optgroup, track 1621 label - command, menu, option, optgroup, track
1622 language - script^ 1622 language - script^
1623 list~ - input 1623 list~ - input
1624 longdesc - img, iframe 1624 longdesc - img, iframe
1625 loop~ - audio, video 1625 loop~ - audio, video
1626 low~ - meter 1626 low~ - meter
1627 marginheight - iframe 1627 marginheight - iframe
1628 marginwidth - iframe 1628 marginwidth - iframe
1629 max~ - input, meter, progress 1629 max~ - input, meter, progress
1630 maxlength - input, textarea 1630 maxlength - input, textarea
1631 media~ - a, area, link, source, style 1631 media~ - a, area, link, source, style
1632 mediagroup~ - audio, video 1632 mediagroup~ - audio, video
1633 method - form 1633 method - form
1634 min~ - input, meter 1634 min~ - input, meter
1635 model** - embed 1635 model** - embed
1636 multiple - input, select 1636 multiple - input, select
1637 muted~ - audio, video 1637 muted~ - audio, video
1638 name - a^, applet^, button, embed, fieldset, form^, iframe^, img^, input, keygen, map^, object, output, param, select, textarea 1638 name - a^, applet^, button, embed, fieldset, form^, iframe^, img^, input, keygen, map^, object, output, param, select, textarea
1639 nohref - area 1639 nohref - area
1640 noshade - hr^ 1640 noshade - hr^
1641 novalidate~ - form 1641 novalidate~ - form
1642 nowrap - td^, th^ 1642 nowrap - td^, th^
1643 object - applet 1643 object - applet
1644 open~ - details 1644 open~ - details
1645 optimum~ - meter 1645 optimum~ - meter
1646 pattern~ - input 1646 pattern~ - input
1647 ping~ - a, area 1647 ping~ - a, area
1648 placeholder~ - input, textarea 1648 placeholder~ - input, textarea
1649 pluginspage** - embed 1649 pluginspage** - embed
1650 pluginurl** - embed 1650 pluginurl** - embed
1651 poster~ - video 1651 poster~ - video
1652 pqg~ - keygen 1652 pqg~ - keygen
1653 preload~ - audio, video 1653 preload~ - audio, video
1654 prompt - isindex 1654 prompt - isindex
1655 pubdate~ - time 1655 pubdate~ - time
1656 radiogroup* - command 1656 radiogroup* - command
1657 readonly - input, textarea 1657 readonly - input, textarea
1658 required~ - input, select, textarea 1658 required~ - input, select, textarea
1659 rel$ - a, area, link 1659 rel$ - a, area, link
1660 rev - a 1660 rev - a
1661 reversed~ - ol 1661 reversed~ - ol
1662 rows - textarea 1662 rows - textarea
1663 rowspan - td, th 1663 rowspan - td, th
1664 rules - table 1664 rules - table
1665 sandbox~ - iframe 1665 sandbox~ - iframe
1666 scope - td, th 1666 scope - td, th
1667 scoped~ - style 1667 scoped~ - style
1668 scrolling - iframe 1668 scrolling - iframe
1669 seamless~ - iframe 1669 seamless~ - iframe
1670 selected - option 1670 selected - option
1671 shape - area, a 1671 shape - area, a
1672 size - font, hr^, input, select 1672 size - font, hr^, input, select
1673 sizes~ - link 1673 sizes~ - link
1674 span - col, colgroup 1674 span - col, colgroup
1675 src - audio, embed, iframe, img, input, script, source, track, video 1675 src - audio, embed, iframe, img, input, script, source, track, video
1676 srcdoc~ - iframe 1676 srcdoc~ - iframe
1677 srclang~ - track 1677 srclang~ - track
1678 srcset~% - img 1678 srcset~% - img
1679 standby - object 1679 standby - object
1680 start - ol 1680 start - ol
1681 step~ - input 1681 step~ - input
1682 summary - table 1682 summary - table
1683 target - a, area, form 1683 target - a, area, form
1684 type - a, area, button, command, embed, input, li, link, menu, object, ol, param, script, source, style, ul 1684 type - a, area, button, command, embed, input, li, link, menu, object, ol, param, script, source, style, ul
1685 typemustmatch~ - object 1685 typemustmatch~ - object
1686 usemap - img, input, object 1686 usemap - img, input, object
1687 valign - col, colgroup, tbody, td, tfoot, th, thead, tr 1687 valign - col, colgroup, tbody, td, tfoot, th, thead, tr
1688 value - button, data, input, li, meter, option, param, progress 1688 value - button, data, input, li, meter, option, param, progress
1689 valuetype - param 1689 valuetype - param
1690 vspace - applet, embed, img^, object^ 1690 vspace - applet, embed, img^, object^
1691 width - applet, canvas, col, colgroup, embed, hr^, iframe, img, input, object, pre^, table, td^, th^, video 1691 width - applet, canvas, col, colgroup, embed, hr^, iframe, img, input, object, pre^, table, td^, th^, video
1692 wmode - embed 1692 wmode - embed
1693 wrap~ - textarea 1693 wrap~ - textarea
1694 1694
1695 The following attributes, including event-specific ones and attributes of ARIA and microdata specifications, are considered global and allowed in all elements: 1695 The following attributes, including event-specific ones and attributes of ARIA and microdata specifications, are considered global and allowed in all elements:
1696 1696
1697 accesskey, aria-activedescendant, aria-atomic, aria-autocomplete, aria-busy, aria-checked, aria-controls, aria-describedby, aria-disabled, aria-dropeffect, aria-expanded, aria-flowto, aria-grabbed, aria-haspopup, aria-hidden, aria-invalid, aria-label, aria-labelledby, aria-level, aria-live, aria-multiline, aria-multiselectable, aria-orientation, aria-owns, aria-posinset, aria-pressed, aria-readonly, aria-relevant, aria-required, aria-selected, aria-setsize, aria-sort, aria-valuemax, aria-valuemin, aria-valuenow, aria-valuetext, class$, contenteditable, contextmenu, dir, draggable, dropzone, hidden, id, inert, itemid, itemprop, itemref, itemscope, itemtype, lang, onabort, onblur, oncanplay, oncanplaythrough, onchange, onclick, oncontextmenu, oncopy, oncuechange, oncut, ondblclick, ondrag, ondragend, ondragenter, ondragleave, ondragover, ondragstart, ondrop, ondurationchange, onemptied, onended, onerror, onfocus, onformchange, onforminput, oninput, oninvalid, onkeydown, onkeypress, onkeyup, onload, onloadeddata, onloadedmetadata, onloadstart, onlostpointercapture, onmousedown, onmousemove, onmouseout, onmouseover, onmouseup, onmousewheel, onpaste, onpause, onplay, onplaying, onpointercancel, ongotpointercapture, onpointerdown, onpointerenter, onpointerleave, onpointermove, onpointerout, onpointerover, onpointerup, onprogress, onratechange, onreadystatechange, onreset, onsearch, onscroll, onseeked, onseeking, onselect, onshow, onstalled, onsubmit, onsuspend, ontimeupdate, ontoggle, ontouchcancel, ontouchend, ontouchmove, ontouchstart, onvolumechange, onwaiting, onwheel, role, spellcheck, style, tabindex, title, translate, xmlns, xml:base, xml:lang, xml:space 1697 accesskey, aria-activedescendant, aria-atomic, aria-autocomplete, aria-busy, aria-checked, aria-controls, aria-describedby, aria-disabled, aria-dropeffect, aria-expanded, aria-flowto, aria-grabbed, aria-haspopup, aria-hidden, aria-invalid, aria-label, aria-labelledby, aria-level, aria-live, 
aria-multiline, aria-multiselectable, aria-orientation, aria-owns, aria-posinset, aria-pressed, aria-readonly, aria-relevant, aria-required, aria-selected, aria-setsize, aria-sort, aria-valuemax, aria-valuemin, aria-valuenow, aria-valuetext, class$, contenteditable, contextmenu, dir, draggable, dropzone, hidden, id, inert, itemid, itemprop, itemref, itemscope, itemtype, lang, onabort, onblur, oncanplay, oncanplaythrough, onchange, onclick, oncontextmenu, oncopy, oncuechange, oncut, ondblclick, ondrag, ondragend, ondragenter, ondragleave, ondragover, ondragstart, ondrop, ondurationchange, onemptied, onended, onerror, onfocus, onformchange, onforminput, oninput, oninvalid, onkeydown, onkeypress, onkeyup, onload, onloadeddata, onloadedmetadata, onloadstart, onlostpointercapture, onmousedown, onmousemove, onmouseout, onmouseover, onmouseup, onmousewheel, onpaste, onpause, onplay, onplaying, onpointercancel, ongotpointercapture, onpointerdown, onpointerenter, onpointerleave, onpointermove, onpointerout, onpointerover, onpointerup, onprogress, onratechange, onreadystatechange, onreset, onsearch, onscroll, onseeked, onseeking, onselect, onshow, onstalled, onsubmit, onsuspend, ontimeupdate, ontoggle, ontouchcancel, ontouchend, ontouchmove, ontouchstart, onvolumechange, onwaiting, onwheel, role, spellcheck, style, tabindex, title, translate, xmlns, xml:base, xml:lang, xml:space
1698 1698
1699 Custom `data-*` attributes, where the first three characters of the value of `star` (*) after lower-casing do not equal 'xml' and the value of `star` does not have a colon (:), equal-to (=), newline, solidus (/), space, tab, or any A-Z character, are also considered global and allowed in all elements. 1699 Custom `data-*` attributes, where the first three characters of the value of `star` (*) after lower-casing do not equal 'xml' and the value of `star` does not have a colon (:), equal-to (=), newline, solidus (/), space, tab, or any A-Z character, are also considered global and allowed in all elements.
1700 1700
1701 1701
1702-- 5.3 CSS 2.1 properties accepting URLs --------------------------o 1702-- 5.3 CSS 2.1 properties accepting URLs --------------------------o
1703 1703
1704 1704
1705 background 1705 background
1706 background-image 1706 background-image
1707 content 1707 content
1708 cue-after 1708 cue-after
1709 cue-before 1709 cue-before
1710 cursor 1710 cursor
1711 list-style 1711 list-style
1712 list-style-image 1712 list-style-image
1713 play-during 1713 play-during
1714 1714
1715 1715
1716-- 5.4 Microsoft Windows 1252 character replacements --------------o 1716-- 5.4 Microsoft Windows 1252 character replacements --------------o
1717 1717
1718 1718
1719 Key: 'd' double, 'l' left, 'q' quote, 'r' right, 's.' single 1719 Key: 'd' double, 'l' left, 'q' quote, 'r' right, 's.' single
1720 1720
1721 Code-point (decimal) - hexadecimal value - replacement entity - represented character 1721 Code-point (decimal) - hexadecimal value - replacement entity - represented character
1722 1722
1723 127 - 7f - (removed) - (not used) 1723 127 - 7f - (removed) - (not used)
1724 128 - 80 - &#8364; - euro 1724 128 - 80 - &#8364; - euro
1725 129 - 81 - (removed) - (not used) 1725 129 - 81 - (removed) - (not used)
1726 130 - 82 - &#8218; - baseline s. q 1726 130 - 82 - &#8218; - baseline s. q
1727 131 - 83 - &#402; - florin 1727 131 - 83 - &#402; - florin
1728 132 - 84 - &#8222; - baseline d q 1728 132 - 84 - &#8222; - baseline d q
1729 133 - 85 - &#8230; - ellipsis 1729 133 - 85 - &#8230; - ellipsis
1730 134 - 86 - &#8224; - dagger 1730 134 - 86 - &#8224; - dagger
1731 135 - 87 - &#8225; - d dagger 1731 135 - 87 - &#8225; - d dagger
1732 136 - 88 - &#710; - circumflex accent 1732 136 - 88 - &#710; - circumflex accent
1733 137 - 89 - &#8240; - permile 1733 137 - 89 - &#8240; - permile
1734 138 - 8a - &#352; - S Hacek 1734 138 - 8a - &#352; - S Hacek
1735 139 - 8b - &#8249; - l s. guillemet 1735 139 - 8b - &#8249; - l s. guillemet
1736 140 - 8c - &#338; - OE ligature 1736 140 - 8c - &#338; - OE ligature
1737 141 - 8d - (removed) - (not used) 1737 141 - 8d - (removed) - (not used)
1738 142 - 8e - &#381; - Z Hacek 1738 142 - 8e - &#381; - Z Hacek
1739 143 - 8f - (removed) - (not used) 1739 143 - 8f - (removed) - (not used)
1740 144 - 90 - (removed) - (not used) 1740 144 - 90 - (removed) - (not used)
1741 145 - 91 - &#8216; - l s. q 1741 145 - 91 - &#8216; - l s. q
1742 146 - 92 - &#8217; - r s. q 1742 146 - 92 - &#8217; - r s. q
1743 147 - 93 - &#8220; - l d q 1743 147 - 93 - &#8220; - l d q
1744 148 - 94 - &#8221; - r d q 1744 148 - 94 - &#8221; - r d q
1745 149 - 95 - &#8226; - bullet 1745 149 - 95 - &#8226; - bullet
1746 150 - 96 - &#8211; - en dash 1746 150 - 96 - &#8211; - en dash
1747 151 - 97 - &#8212; - em dash 1747 151 - 97 - &#8212; - em dash
1748 152 - 98 - &#732; - tilde accent 1748 152 - 98 - &#732; - tilde accent
1749 153 - 99 - &#8482; - trademark 1749 153 - 99 - &#8482; - trademark
1750 154 - 9a - &#353; - s Hacek 1750 154 - 9a - &#353; - s Hacek
1751 155 - 9b - &#8250; - r s. guillemet 1751 155 - 9b - &#8250; - r s. guillemet
1752 156 - 9c - &#339; - oe ligature 1752 156 - 9c - &#339; - oe ligature
1753 157 - 9d - (removed) - (not used) 1753 157 - 9d - (removed) - (not used)
1754 158 - 9e - &#382; - z Hacek 1754 158 - 9e - &#382; - z Hacek
1755 159 - 9f - &#376; - Y dieresis 1755 159 - 9f - &#376; - Y dieresis
1756 1756
1757 1757
1758-- 5.5 URL format -------------------------------------------------o 1758-- 5.5 URL format -------------------------------------------------o
1759 1759
1760 1760
1761 An `absolute` URL has a 'protocol' or 'scheme', a 'network location' or 'hostname', and optional 'path', 'parameters', 'query' and 'fragment' segments. Thus, an absolute URL has this generic structure: 1761 An `absolute` URL has a 'protocol' or 'scheme', a 'network location' or 'hostname', and optional 'path', 'parameters', 'query' and 'fragment' segments. Thus, an absolute URL has this generic structure:
1762 1762
1763 (scheme) : (//network location) /(path) ;(parameters) ?(query) #(fragment) 1763 (scheme) : (//network location) /(path) ;(parameters) ?(query) #(fragment)
1764 1764
1765 The schemes can only contain letters, digits, '+', '.' and '-'. Hostname is the portion after the '//' and up to the first '/' (if any; else, up to the end) when ':' is followed by a '//' (e.g., 'abc.com' in 'ftp://abc.com/def'); otherwise, it consists of everything after the ':' (e.g., 'def@abc.com' in 'mailto:def@abc.com'). 1765 The schemes can only contain letters, digits, '+', '.' and '-'. Hostname is the portion after the '//' and up to the first '/' (if any; else, up to the end) when ':' is followed by a '//' (e.g., 'abc.com' in 'ftp://abc.com/def'); otherwise, it consists of everything after the ':' (e.g., 'def@abc.com' in 'mailto:def@abc.com').
1766 1766
1767 `Relative` URLs do not have explicit schemes and network locations; such values are inherited from a `base` URL. 1767 `Relative` URLs do not have explicit schemes and network locations; such values are inherited from a `base` URL.
1768 1768
1769 1769
1770-- 5.6 Brief on htmLawed code -------------------------------------o 1770-- 5.6 Brief on htmLawed code -------------------------------------o
1771 1771
1772 1772
1773 Much of the code's logic and reasoning can be understood from the documentation above. 1773 Much of the code's logic and reasoning can be understood from the documentation above.
1774 1774
1775 The *output* of htmLawed is a text string containing the processed input. There is no custom error tracking. 1775 The *output* of htmLawed is a text string containing the processed input. There is no custom error tracking.
1776 1776
1777 *Function arguments* for htmLawed are: 1777 *Function arguments* for htmLawed are:
1778 1778
1779 * '$in' - first argument; a text string; the *input text* to be processed. Any extraneous slashes added by PHP when `magic quotes` are enabled should be removed beforehand using PHP's 'stripslashes()' function. 1779 * '$in' - first argument; a text string; the *input text* to be processed. Any extraneous slashes added by PHP when `magic quotes` are enabled should be removed beforehand using PHP's 'stripslashes()' function.
1780 1780
1781 * '$config' - second argument; an associative array; optional; named '$C' within htmLawed code. The array has keys with names like 'balance' and 'keep_bad', and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the *configurable parameters* (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through '$config'. `Finalized` '$config' is thus a filtered and possibly larger array. 1781 * '$config' - second argument; an associative array; optional; named '$C' within htmLawed code. The array has keys with names like 'balance' and 'keep_bad', and the values, which can be boolean, string, or array, depending on the key, are read to accordingly set the *configurable parameters* (indicated by the keys). All configurable parameters receive some default value if the value to be used is not specified by the user through '$config'. `Finalized` '$config' is thus a filtered and possibly larger array.
1782 1782
1783 * '$spec' - third argument; a text string; optional. The string has rules, written in an htmLawed-designated format, *specifying* element-specific attribute and attribute value restrictions. Function 'hl_spec()' is used to convert the string to an associative-array, named '$S' within htmLawed code, for internal use. `Finalized` '$spec' is thus an array. 1783 * '$spec' - third argument; a text string; optional. The string has rules, written in an htmLawed-designated format, *specifying* element-specific attribute and attribute value restrictions. Function 'hl_spec()' is used to convert the string to an associative-array, named '$S' within htmLawed code, for internal use. `Finalized` '$spec' is thus an array.
1784 1784
1785 `Finalized` '$config' and '$spec' are made *global variables* while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the `finalized` values, the 'show_settings' parameter of '$config' should be used). Depending on '$config', another global variable 'hl_Ids', to track 'id' attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing. 1785 `Finalized` '$config' and '$spec' are made *global variables* while htmLawed is at work. Values of any pre-existing global variables with same names are noted, and their values are restored after htmLawed finishes processing the input (to capture the `finalized` values, the 'show_settings' parameter of '$config' should be used). Depending on '$config', another global variable 'hl_Ids', to track 'id' attribute values for uniqueness, may be set. Unlike the other two variables, this one is not reset (or unset) post-processing.
1786 1786
1787 Except for the main 'htmLawed()' function, htmLawed's functions are *name-spaced* using the 'hl_' prefix. The *functions* and their roles are: 1787 Except for the main 'htmLawed()' function, htmLawed's functions are *name-spaced* using the 'hl_' prefix. The *functions* and their roles are:
1788 1788
1789 * 'hl_attrval' - check attribute values against '$spec' 1789 * 'hl_attrval' - check attribute values against '$spec'
1790 * 'hl_bal' - balance tags and ensure proper nesting 1790 * 'hl_bal' - balance tags and ensure proper nesting
1791 * 'hl_cmtcd' - handle CDATA sections and HTML comments 1791 * 'hl_cmtcd' - handle CDATA sections and HTML comments
1792 * 'hl_ent' - handle character entities 1792 * 'hl_ent' - handle character entities
1793 * 'hl_prot' - check a URL scheme/protocol 1793 * 'hl_prot' - check a URL scheme/protocol
1794 * 'hl_regex' - check syntax of a regular expression 1794 * 'hl_regex' - check syntax of a regular expression
1795 * 'hl_spec' - convert user-supplied '$spec' value to one used internally 1795 * 'hl_spec' - convert user-supplied '$spec' value to one used internally
1796 * 'hl_tag' - handle element tags and attributes 1796 * 'hl_tag' - handle element tags and attributes
1797 * 'hl_tag2' - transform element tags 1797 * 'hl_tag2' - transform element tags
1798 * 'hl_tidy' - compact/beautify HTML 1798 * 'hl_tidy' - compact/beautify HTML
1799 * 'hl_version' - report htmLawed version 1799 * 'hl_version' - report htmLawed version
1800 * 'htmLawed' - main function 1800 * 'htmLawed' - main function
1801 1801
1802 'htmLawed()' finalizes '$spec' (with the help of 'hl_spec()') and '$config', and globalizes them. Finalization of '$config' involves setting default values if an inappropriate or invalid one is supplied. This includes calling 'hl_regex()' to check well-formedness of regular expression patterns if such expressions are user-supplied through '$config'. 'htmLawed()' then removes invalid characters like nulls and 'x01' and appropriately handles entities using 'hl_ent()'. HTML comments and CDATA sections are identified and treated as per '$config' with the help of 'hl_cmtcd()'. When retained, the '<' and '>' characters identifying them, and the '<', '>' and '&' characters inside them, are replaced with control characters (code-points '1' to '5') till any tag balancing is completed. 1802 'htmLawed()' finalizes '$spec' (with the help of 'hl_spec()') and '$config', and globalizes them. Finalization of '$config' involves setting default values if an inappropriate or invalid one is supplied. This includes calling 'hl_regex()' to check well-formedness of regular expression patterns if such expressions are user-supplied through '$config'. 'htmLawed()' then removes invalid characters like nulls and 'x01' and appropriately handles entities using 'hl_ent()'. HTML comments and CDATA sections are identified and treated as per '$config' with the help of 'hl_cmtcd()'. When retained, the '<' and '>' characters identifying them, and the '<', '>' and '&' characters inside them, are replaced with control characters (code-points '1' to '5') till any tag balancing is completed.
1803 1803
1804 After this `initial processing` 'htmLawed()' identifies tags using regex and processes them with the help of 'hl_tag()' -- a large function that analyzes tag content, filtering it as per HTML standards, '$config' and '$spec'. Among other things, 'hl_tag()' transforms deprecated elements using 'hl_tag2()', removes attributes from closing tags, checks attribute values as per '$spec' rules using 'hl_attrval()', and checks URL protocols using 'hl_prot()'. 'htmLawed()' performs tag balancing and nesting checks with a call to 'hl_bal()', and optionally compacts/beautifies the output with proper white-spacing with a call to 'hl_tidy()'. The latter temporarily replaces white-space, and '<', '>' and '&' characters inside 'pre', 'script' and 'textarea' elements, and HTML comments and CDATA sections with control characters (code-points '1' to '5', and '7'). 1804 After this `initial processing` 'htmLawed()' identifies tags using regex and processes them with the help of 'hl_tag()' -- a large function that analyzes tag content, filtering it as per HTML standards, '$config' and '$spec'. Among other things, 'hl_tag()' transforms deprecated elements using 'hl_tag2()', removes attributes from closing tags, checks attribute values as per '$spec' rules using 'hl_attrval()', and checks URL protocols using 'hl_prot()'. 'htmLawed()' performs tag balancing and nesting checks with a call to 'hl_bal()', and optionally compacts/beautifies the output with proper white-spacing with a call to 'hl_tidy()'. The latter temporarily replaces white-space, and '<', '>' and '&' characters inside 'pre', 'script' and 'textarea' elements, and HTML comments and CDATA sections with control characters (code-points '1' to '5', and '7').
1805 1805
1806 htmLawed permits the use of custom code or *hook functions* at two stages. The first, called inside 'htmLawed()', allows the input text as well as the finalized '$config' and '$spec' values to be altered right after the initial processing (see section:- #3.7). The second is called by 'hl_tag()' once the tag content is finalized (see section:- #3.4.9). 1806 htmLawed permits the use of custom code or *hook functions* at two stages. The first, called inside 'htmLawed()', allows the input text as well as the finalized '$config' and '$spec' values to be altered right after the initial processing (see section:- #3.7). The second is called by 'hl_tag()' once the tag content is finalized (see section:- #3.4.9).
1807 1807
1808 The functionality of htmLawed is dictated by the external HTML standards. The code of htmLawed is thus written for a clear-cut aim, with not much concern for tweaking by other developers. The code is only minimally annotated with comments -- it is not meant to instruct. PHP developers familiar with the HTML specifications will see the logic, and others can always refer to the htmLawed documentation. 1808 The functionality of htmLawed is dictated by the external HTML standards. The code of htmLawed is thus written for a clear-cut aim, with not much concern for tweaking by other developers. The code is only minimally annotated with comments -- it is not meant to instruct. PHP developers familiar with the HTML specifications will see the logic, and others can always refer to the htmLawed documentation.
1809 1809
1810___________________________________________________________________oo 1810___________________________________________________________________oo
1811 1811
1812 1812
1813@@description: htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter 1813@@description: htmLawed PHP software is a free, open-source, customizable HTML input purifier and filter
1814@@encoding: utf-8 1814@@encoding: utf-8
1815@@keywords: htmLawed, HTM, HTML, HTML5, HTML 5, XHTML, XHTML5, HTML Tidy, converter, filter, formatter, purifier, sanitizer, XSS, input, PHP, software, code, script, security, cross-site scripting, hack, sanitize, remove, standards, tags, attributes, elements, Aria, Ruby, data attributes, tidy, indent, auto-indent, prettify, pretty print 1815@@keywords: htmLawed, HTM, HTML, HTML5, HTML 5, XHTML, XHTML5, HTML Tidy, converter, filter, formatter, purifier, sanitizer, XSS, input, PHP, software, code, script, security, cross-site scripting, hack, sanitize, remove, standards, tags, attributes, elements, Aria, Ruby, data attributes, tidy, indent, auto-indent, prettify, pretty print
1816@@language: en 1816@@language: en
1817@@title: htmLawed documentation 1817@@title: htmLawed documentation
diff --git a/lib/htmlawed/htmLawed_TESTCASE.txt b/lib/htmlawed/htmLawed_TESTCASE.txt
index 24b00e7..2e64421 100755
--- a/lib/htmlawed/htmLawed_TESTCASE.txt
+++ b/lib/htmlawed/htmLawed_TESTCASE.txt
@@ -1,455 +1,455 @@
1/* 1/*
2htmLawed_TESTCASE.txt, 24 September 2019 2htmLawed_TESTCASE.txt, 24 September 2019
3To test htmLawed 3To test htmLawed
4Copyright Santosh Patnaik 4Copyright Santosh Patnaik
5Dual licensed with LGPL 3 and GPL 2+ 5Dual licensed with LGPL 3 and GPL 2+
6A PHP Labware internal utility - www.bioinformatics.org/phplabware/internal_utilities/htmLawed 6A PHP Labware internal utility - www.bioinformatics.org/phplabware/internal_utilities/htmLawed
7*/ 7*/
8 8
9This file has UTF-8-encoded text with both correct and incorrect/malformed HTML/XHTML code snippets to test htmLawed (test cases/samples). The entire text may also be used as a unit. 9This file has UTF-8-encoded text with both correct and incorrect/malformed HTML/XHTML code snippets to test htmLawed (test cases/samples). The entire text may also be used as a unit.
10 10
11************************************************ 11************************************************
12when viewing this file in a web browser, set the 12when viewing this file in a web browser, set the
13character encoding to Unicode/UTF-8 13character encoding to Unicode/UTF-8
14************************************************ 14************************************************
15 15
16--------------------- start -------------------- 16--------------------- start --------------------
17 17
18<em>Try different $config and $spec values. Some text even when filtered in will not be displayed in a rendered web-page</em><br /> 18<em>Try different $config and $spec values. Some text even when filtered in will not be displayed in a rendered web-page</em><br />
19 19
20<h6>Attributes</h6> 20<h6>Attributes</h6>
21 21
22<strong>Xml:lang:</strong><a lang="en" xml:lang="en"></a>, <a lang="en"></a>, <a xml:lang="en"></a><br /> 22<strong>Xml:lang:</strong><a lang="en" xml:lang="en"></a>, <a lang="en"></a>, <a xml:lang="en"></a><br />
23<strong>Standard, predefined value, or empty attribute:</strong> <input type="text" disabled />, <input type="text" disabled="DISABLED" />, <input type="text" disabled="1" /><br /> 23<strong>Standard, predefined value, or empty attribute:</strong> <input type="text" disabled />, <input type="text" disabled="DISABLED" />, <input type="text" disabled="1" /><br />
24<strong>Required:</strong> <img />, <img alt="image" /><br /> 24<strong>Required:</strong> <img />, <img alt="image" /><br />
25<strong>Quote & space variation:</strong> <a id=id1 name=xy>a</a>, <a id='id2' name="xy">a</a>, <a id=' id3 ' name = "n" >a</a><br /> 25<strong>Quote & space variation:</strong> <a id=id1 name=xy>a</a>, <a id='id2' name="xy">a</a>, <a id=' id3 ' name = "n" >a</a><br />
26<strong>Invalid:</strong> <a id="id4" src="s">a</a><br /> 26<strong>Invalid:</strong> <a id="id4" src="s">a</a><br />
27<strong>Duplicated:</strong> <a id="id5" id="id6">a</a><br /> 27<strong>Duplicated:</strong> <a id="id5" id="id6">a</a><br />
28<strong>Deprecated:</strong> <a id="id7" target="self" name="n">a</a>, <hr noshade="noshade" /><br /> 28<strong>Deprecated:</strong> <a id="id7" target="self" name="n">a</a>, <hr noshade="noshade" /><br />
29<strong>Casing:</strong> <a HREF=""></a><br /> 29<strong>Casing:</strong> <a HREF=""></a><br />
30<strong>Custom:</strong> <img alt="image" my:data="portrait" /><br /> 30<strong>Custom:</strong> <img alt="image" my:data="portrait" /><br />
31<strong>Data-*:</strong> <a data-xml="x" data-xmnt="x" data-xmlnt="x" data-xmn:t="x" data-12="x" data-רש="x" data-xmxm="x">a</a><br /> 31<strong>Data-*:</strong> <a data-xml="x" data-xmnt="x" data-xmlnt="x" data-xmn:t="x" data-12="x" data-רש="x" data-xmxm="x">a</a><br />
32<strong>Admin-restricted?:</strong> <a href="x" onclick="alert();"></a> 32<strong>Admin-restricted?:</strong> <a href="x" onclick="alert();"></a>
33 33
34<h6>Attribute values</h6> 34<h6>Attribute values</h6>
35 35
36<strong>Duplicate ID value:</strong><a id="id8"></a>, <a id="my_id8"></a>, <a id="id8"></a><br /> 36<strong>Duplicate ID value:</strong><a id="id8"></a>, <a id="my_id8"></a>, <a id="id8"></a><br />
37(try 'my_' for prefix)<br /> 37(try 'my_' for prefix)<br />
38<strong>Double-quotes in value:</strong><a title=ab"c"></a>, <a title="ab"c"></a>, <a title='ab"c'></a><br /> 38<strong>Double-quotes in value:</strong><a title=ab"c"></a>, <a title="ab"c"></a>, <a title='ab"c'></a><br />
39(try filter for CSS expression)<br /> 39(try filter for CSS expression)<br />
40<strong>CSS expression</strong>: <div style="prop:expression();"></div><div style="prop:expression()"></div><div style="prop: expression();"></div><div style="prop : expression()"></div><div style="prop:expression(js);"></div><div style="prop:expression(js;)"></div><div style="prop: expression('js');"></div><div style="prop : expr ession('js':)"></div><div style="prop&#x3a;expression( 'js&#x40; );"></div><br /> 40<strong>CSS expression</strong>: <div style="prop:expression();"></div><div style="prop:expression()"></div><div style="prop: expression();"></div><div style="prop : expression()"></div><div style="prop:expression(js);"></div><div style="prop:expression(js;)"></div><div style="prop: expression('js');"></div><div style="prop : expr ession('js':)"></div><div style="prop&#x3a;expression( 'js&#x40; );"></div><br />
41<strong>Other:</strong> <input size="50" class="my" value="an input an input an input" />, <input size="5" class="your" value="an input" /><br /> 41<strong>Other:</strong> <input size="50" class="my" value="an input an input an input" />, <input size="5" class="your" value="an input" /><br />
42(try 'maxlen', 'maxval', etc., for 'input' in '$spec') 42(try 'maxlen', 'maxval', etc., for 'input' in '$spec')
43 43
44<h6>Blockquotes</h6> 44<h6>Blockquotes</h6>
45 45
46<blockquote>abc</blockquote><br /> 46<blockquote>abc</blockquote><br />
47<blockquote>abc<div>def</div></blockquote><br /> 47<blockquote>abc<div>def</div></blockquote><br />
48<blockquote><div>abc</div>def</blockquote><br /> 48<blockquote><div>abc</div>def</blockquote><br />
49<blockquote>abc<div>def</div>ghi</blockquote><br /> 49<blockquote>abc<div>def</div>ghi</blockquote><br />
50abc<div>def</div>ghi<br /> 50abc<div>def</div>ghi<br />
51<blockquote>QQQ<div>x</div><!-- comment --></blockquote><br /> 51<blockquote>QQQ<div>x</div><!-- comment --></blockquote><br />
52<blockquote><div>x</div><!-- comment -->QQQ</blockquote><br /> 52<blockquote><div>x</div><!-- comment -->QQQ</blockquote><br />
53<blockquote><!-- comment --><div>x</div>QQQ<div>x</div></blockquote><br /> 53<blockquote><!-- comment --><div>x</div>QQQ<div>x</div></blockquote><br />
54<blockquote><div>x<!-- comment --></div>QQQ</blockquote><p>x</p><br /> 54<blockquote><div>x<!-- comment --></div>QQQ</blockquote><p>x</p><br />
55<br /> 55<br />
56(try with blockquote parent) 56(try with blockquote parent)
57 57
58<h6>CDATA sections</h6> 58<h6>CDATA sections</h6>
59 59
60<strong>Special characters inside:</strong> <![CDATA[ ]]> ]]>, <![CDATA[ 3 < 4 > 3.5, & 4 &gt; 4 ]]><br /> 60<strong>Special characters inside:</strong> <![CDATA[ ]]> ]]>, <![CDATA[ 3 < 4 > 3.5, & 4 &gt; 4 ]]><br />
61<strong>Normal:</strong> <![CDATA[ check ]]>, <em>CDATA follows:<![CDATA[ check ]]></em><br /> 61<strong>Normal:</strong> <![CDATA[ check ]]>, <em>CDATA follows:<![CDATA[ check ]]></em><br />
62<strong>Malformed:</strong> <![cdata check ]]>, < ![CDATA check ]]>, <![CDATA check ]]>, < ![CDATA check ] ]><br /> 62<strong>Malformed:</strong> <![cdata check ]]>, < ![CDATA check ]]>, <![CDATA check ]]>, < ![CDATA check ] ]><br />
63<strong>Invalid:</strong> <em <![CDATA[ check ]]>>CDATA in tag content</em>, <table><![CDATA[ check ]]><tr><td>text not allowed</td></tr></table> 63<strong>Invalid:</strong> <em <![CDATA[ check ]]>>CDATA in tag content</em>, <table><![CDATA[ check ]]><tr><td>text not allowed</td></tr></table>
64 64
65<h6>Complex-1: deprecated elements</h6> 65<h6>Complex-1: deprecated elements</h6>
66 66
67<center> 67<center>
68The PHP <s>software</s> script used for this <strike>web-page</strike> webpage is <font style="font-weight: bold " face=arial size='+3' color = "red ">htmLawedTest.php</font>, from <u style= 'color:green'>PHP Labware</u>. 68The PHP <s>software</s> script used for this <strike>web-page</strike> webpage is <font style="font-weight: bold " face=arial size='+3' color = "red ">htmLawedTest.php</font>, from <u style= 'color:green'>PHP Labware</u>.
69</center> 69</center>
70 70
71<h6>Complex-2: deprecated attributes</h6> 71<h6>Complex-2: deprecated attributes</h6>
72 72
73<img src="s" alt="a" name="n" /><img src="s" alt="a" id="id9" name="n" /> 73<img src="s" alt="a" name="n" /><img src="s" alt="a" id="id9" name="n" />
74<br clear="left" /> 74<br clear="left" />
75<hr noshade size="1" /> 75<hr noshade size="1" />
76<img name="id10" src="s" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding:5px;" /> 76<img name="id10" src="s" align="left" alt="image" hspace="10" vspace="10" width="10em" height="20" border="1" style="padding:5px;" />
77<table width="50em" align="center" bgcolor="red"> 77<table width="50em" align="center" bgcolor="red">
78 <tr> 78 <tr>
79 <td width="20%"> 79 <td width="20%">
80 <div align="center"> 80 <div align="center">
81 <h3 align="right">Section</h3> 81 <h3 align="right">Section</h3>
82 <p align="right">Para</p> 82 <p align="right">Para</p>
83 <ol type="a" start="e"><li value="x"><a name="x">First</a> <a name="x" id="id11">item</a></li></ol> 83 <ol type="a" start="e"><li value="x"><a name="x">First</a> <a name="x" id="id11">item</a></li></ol>
84 </div> 84 </div>
85 </td> 85 </td>
86 <td width="*"> 86 <td width="*">
87 <ol type="1"><li>First item</li></ol> 87 <ol type="1"><li>First item</li></ol>
88 </td> 88 </td>
89 </tr> 89 </tr>
90 </table> 90 </table>
91<br clear="all" /> 91<br clear="all" />
92 92
93<h6>Complex-3: embed, object, area</h6> 93<h6>Complex-3: embed, object, area</h6>
94 94
95<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/ls7gi1VwdIQ"></param><embed src="http://www.youtube.com/v/ls7gi1VwdIQ" type="application/x-shockwave-flash" width="425" height="350"></embed></object><br /> 95<object width="425" height="350"><param name="movie" value="http://www.youtube.com/v/ls7gi1VwdIQ"></param><embed src="http://www.youtube.com/v/ls7gi1VwdIQ" type="application/x-shockwave-flash" width="425" height="350"></embed></object><br />
96 96
97<embed src="http://www.youtube.com/v/ls7gi1VwdIQ" type="application/x-shockwave-flash" width="425" height="350"></embed><br /> 97<embed src="http://www.youtube.com/v/ls7gi1VwdIQ" type="application/x-shockwave-flash" width="425" height="350"></embed><br />
98 98
99<object data="1.gif" type="image/gif" usemap="#map1"><map name="map1"> 99<object data="1.gif" type="image/gif" usemap="#map1"><map name="map1">
100<p>navigate the site: <a href="1" shape="REct" coOrds="0,0,118,28">1</a> | <a href="3" shape="circle" coords="184,200,60">3</a> | <a href="4" shape="poly" coords="276,0,276,28,100,200,50,50,276,0">4</a></p> 100<p>navigate the site: <a href="1" shape="REct" coOrds="0,0,118,28">1</a> | <a href="3" shape="circle" coords="184,200,60">3</a> | <a href="4" shape="poly" coords="276,0,276,28,100,200,50,50,276,0">4</a></p>
101<area href="5" shape="Rect" coords="0,0,118,28"> 101<area href="5" shape="Rect" coords="0,0,118,28">
102</map></object> 102</map></object>
103 103
104<param name="name">value</param> 104<param name="name">value</param>
105 105
106<object id="obj1"> 106<object id="obj1">
107 <param name="param1"> 107 <param name="param1">
108 <object id="obj2"> 108 <object id="obj2">
109 <param name="param2"> 109 <param name="param2">
110 </object> 110 </object>
111</object> 111</object>
112 112
113<h6>Complex-4: nested and other tables</h6> 113<h6>Complex-4: nested and other tables</h6>
114 114
115<table border="1" bgcolor="red"> <tr> <td> Cell </td> <td colspan="2" rowspan="2"> <table border="1" bgcolor="green"> <tr> <td> Cell </td> <td colspan="2" rowspan="2"> </td> </tr> <tr> <td> Cell </td> </tr> <tr> <td> Cell </td> <td> Cell </td> <td> Cell </td> </tr> </table> </td> </tr> <tr> <td> Cell </td> </tr> <tr> <td> Cell </td> <td> Cell </td> <td> Cell </td> </tr> </table><br /> 115<table border="1" bgcolor="red"> <tr> <td> Cell </td> <td colspan="2" rowspan="2"> <table border="1" bgcolor="green"> <tr> <td> Cell </td> <td colspan="2" rowspan="2"> </td> </tr> <tr> <td> Cell </td> </tr> <tr> <td> Cell </td> <td> Cell </td> <td> Cell </td> </tr> </table> </td> </tr> <tr> <td> Cell </td> </tr> <tr> <td> Cell </td> <td> Cell </td> <td> Cell </td> </tr> </table><br />
116<strong>PCDATA wrong:</strong> <table>Well<caption>Hello</caption></table><br /> 116<strong>PCDATA wrong:</strong> <table>Well<caption>Hello</caption></table><br />
117<strong>Missing tr:</strong> <table><td>Well</td></table><br /> 117<strong>Missing tr:</strong> <table><td>Well</td></table><br />
118 118
119<h6>Complex-5: pseudo, disallowed or non-HTML tags</h6> 119<h6>Complex-5: pseudo, disallowed or non-HTML tags</h6>
120 120
121(Try different 'keep_bad' values) 121(Try different 'keep_bad' values)
122<*> Pseudotags <*> 122<*> Pseudotags <*>
123<xml>Non-HTML tag xml</xml> 123<xml>Non-HTML tag xml</xml>
124<p> 124<p>
125Disallowed tag p 125Disallowed tag p
126</p> 126</p>
127<ul>Bad<li>OK</li></ul> 127<ul>Bad<li>OK</li></ul>
128 128
129<h6>Elements</h6> 129<h6>Elements</h6>
130 130
131<strong>Unbalanced:</strong> <a href="h"><em>check</a></em><br /> 131<strong>Unbalanced:</strong> <a href="h"><em>check</a></em><br />
132<strong>Non-XHTML:</strong> <div><center><dir></dir></center></div><br /> 132<strong>Non-XHTML:</strong> <div><center><dir></dir></center></div><br />
133<strong>Malformed:</strong> < a href=""></a>, <a href="" ></a>, <a href="" ></a>, <a href="" 133<strong>Malformed:</strong> < a href=""></a>, <a href="" ></a>, <a href="" ></a>, <a href=""
134></a>, <a href="">< /a>, < a href=""></a >, <img src="s" alt="a" />, <img src="s" alt="a"/ >, <imgsrc="s" alt="a" /><br /> 134></a>, <a href="">< /a>, < a href=""></a >, <img src="s" alt="a" />, <img src="s" alt="a"/ >, <imgsrc="s" alt="a" /><br />
135<strong>Invalid:</strong> <image src="s" alt="a" /><br /> 135<strong>Invalid:</strong> <image src="s" alt="a" /><br />
136<strong>Empty:</strong> <img src="s" alt="a" />, <img src="s" alt="a"></img>, <img src="s" alt="a">text</img><br /> 136<strong>Empty:</strong> <img src="s" alt="a" />, <img src="s" alt="a"></img>, <img src="s" alt="a">text</img><br />
137<strong>Content invalid:</strong> <a href="h">1<a>2</a></a><br /> 137<strong>Content invalid:</strong> <a href="h">1<a>2</a></a><br />
138<strong>Content invalid?:</strong> <form></form><br /> (try setting 'form' as parent)<br /> 138<strong>Content invalid?:</strong> <form></form><br /> (try setting 'form' as parent)<br />
139<strong>Casing:</strong> <A href=""></a><br /> 139<strong>Casing:</strong> <A href=""></a><br />
140<strong>Check for tidy:</strong> <br /><hr /></div><hr /></div><hr /></div><div>hi</div> 140<strong>Check for tidy:</strong> <br /><hr /></div><hr /></div><hr /></div><div>hi</div>
141 141
142<h6>Entities</h6> 142<h6>Entities</h6>
143 143
144<strong>Special:</strong> &amp; 3 < 2 & 5>4 and j >i >a & i<j>a<br /> 144<strong>Special:</strong> &amp; 3 < 2 & 5>4 and j >i >a & i<j>a<br />
145<strong>Padding:</strong> &#00066; &#066; &#x00066; &#x066; &#x003; &#0003;<br /> 145<strong>Padding:</strong> &#00066; &#066; &#x00066; &#x066; &#x003; &#0003;<br />
146<strong>Malformed:</strong> & #x27;, &x27;, &#x27; &TILDE;, &tilde<br /> 146<strong>Malformed:</strong> & #x27;, &x27;, &#x27; &TILDE;, &tilde<br />
147<strong>Invalid:</strong> &#x3;, &#55296;, &#03;, &#1114112;, &#xffff, &bad;<br /> 147<strong>Invalid:</strong> &#x3;, &#55296;, &#03;, &#1114112;, &#xffff, &bad;<br />
148<strong>Discouraged characters:</strong> &#x7f;, &#132;, &#64992;, &#1114110;<br /> 148<strong>Discouraged characters:</strong> &#x7f;, &#132;, &#64992;, &#1114110;<br />
149<strong>Context:</strong> '&gt;', &lt;?<br /> 149<strong>Context:</strong> '&gt;', &lt;?<br />
150<strong>Casing:</strong> &#X27;, &#x27;, &TILDE;, &tilde; 150<strong>Casing:</strong> &#X27;, &#x27;, &TILDE;, &tilde;
151<br /> 151<br />
152(also check named-to-numeric and hexdec-to-decimal, and vice versa, conversions) 152(also check named-to-numeric and hexdec-to-decimal, and vice versa, conversions)
153 153
154<h6>Format</h6> 154<h6>Format</h6>
155 155
156<strong>Valid but ill-formatted:</strong> text <!-- comment --> 156<strong>Valid but ill-formatted:</strong> text <!-- comment -->
157text <!-- 157text <!--
158A c o m m e n t --> 158A c o m m e n t -->
159<script> 159<script>
160 <![CDATA[ 160 <![CDATA[
161 code 161 code
162 ]]> 162 ]]>
163</script><!-- comment --><![CDATA[ cdata ]]> <a>text</b> text<pre id="none">p r e</pre> 163</script><!-- comment --><![CDATA[ cdata ]]> <a>text</b> text<pre id="none">p r e</pre>
164<textarea>text</textarea> <textarea> 164<textarea>text</textarea> <textarea>
165 text text 165 text text
166</textarea> text text <br /><hr /> 166</textarea> text text <br /><hr />
167text <img src="none" alt="none" /> t<em class="none">e<strong>x</strong>t</em> 167text <img src="none" alt="none" /> t<em class="none">e<strong>x</strong>t</em>
168text <img src="none" alt="none" /> <b>t<em> e <strong> x </strong> t</em></b> 168text <img src="none" alt="none" /> <b>t<em> e <strong> x </strong> t</em></b>
169 <a href="a"> text <img src="none" alt="none" /> <b>t <em> e <strong> x </strong> t</em></b> 169 <a href="a"> text <img src="none" alt="none" /> <b>t <em> e <strong> x </strong> t</em></b>
170 </a> 170 </a>
171<span style="background-color: yellow;">text <img src="none" alt="none" /> <b> <em> t e <strong> x </strong> t</em></b></span> 171<span style="background-color: yellow;">text <img src="none" alt="none" /> <b> <em> t e <strong> x </strong> t</em></b></span>
172<script>script</script> 172<script>script</script>
173<div> 173<div>
174 <pre id="none">p <a>r</a> e <!-- comment --> </pre> 174 <pre id="none">p <a>r</a> e <!-- comment --> </pre>
175 <pre> 175 <pre>
176 pre 176 pre
177 </pre> 177 </pre>
178</div> 178</div>
179<div><div><table border="1" style="background-color: red;"><tr><td>Cell</td><td colspan="2" rowspan="2"><table border="1" style="background-color: green;"><tr><td>Cell</td><td colspan="2" rowspan="2"></td></tr><tr><td>Cell</td></tr><tr><td>Cell</td><td>Cell</td><td>Cell</td></tr></table></td></tr><tr><td>Cell</td></tr><tr><td>Cell</td><td>Cell</td><td>Cell</td></tr></table></div></div> 179<div><div><table border="1" style="background-color: red;"><tr><td>Cell</td><td colspan="2" rowspan="2"><table border="1" style="background-color: green;"><tr><td>Cell</td><td colspan="2" rowspan="2"></td></tr><tr><td>Cell</td></tr><tr><td>Cell</td><td>Cell</td><td>Cell</td></tr></table></td></tr><tr><td>Cell</td></tr><tr><td>Cell</td><td>Cell</td><td>Cell</td></tr></table></div></div>
180(try to compact or beautify) 180(try to compact or beautify)
181 181
182<h6>Forms</h6> 182<h6>Forms</h6>
183 183
184(note nesting of 'form', missing required attributes, etc.)<br /> 184(note nesting of 'form', missing required attributes, etc.)<br />
185<form> 185<form>
186<script type="text/javascript">s</script> 186<script type="text/javascript">s</script>
187<fieldset><legend>p</legend>l <input name="personal_lastname" type="text" tabindex="1"></fieldset> 187<fieldset><legend>p</legend>l <input name="personal_lastname" type="text" tabindex="1"></fieldset>
188<input name="h" type="checkbox" value="h" tabindex="20"> h 188<input name="h" type="checkbox" value="h" tabindex="20"> h
189<textarea name="t">t</textarea> 189<textarea name="t">t</textarea>
190<form action="a" method="get"></form></form><br /> 190<form action="a" method="get"></form></form><br />
191<form action="b" method="get"><p><input type="text" value="i" /></form><br /> 191<form action="b" method="get"><p><input type="text" value="i" /></form><br />
192<form>B:<input type="text" value="b" />C:<input type="text" value="c" /></form><br /> 192<form>B:<input type="text" value="b" />C:<input type="text" value="c" /></form><br />
193(try each of these lines separately)<br /> 193(try each of these lines separately)<br />
194<form action="a">what<br /> 194<form action="a">what<br />
195<form action="a">what 195<form action="a">what
196(try with container as div and as form)<br /> 196(try with container as div and as form)<br />
197<form>c <a>a</a> <b>b</b><input /><script>s</script> 197<form>c <a>a</a> <b>b</b><input /><script>s</script>
198 198
199<h6>HTML comments (also CDATA)</h6> 199<h6>HTML comments (also CDATA)</h6>
200 200
201<strong>Script inside:</strong> <!--[if gte IE 4]> 201<strong>Script inside:</strong> <!--[if gte IE 4]>
202<SCRIPT>alert('XSS');</SCRIPT> 202<SCRIPT>alert('XSS');</SCRIPT>
203<![endif]--><br /> 203<![endif]--><br />
204<strong>Special characters inside: <!-- <![CDATA check ]]> -->, <!-- 3 < 4 > 3.5, & 4 &gt; 4 -->, <!-- che--ck -->, <!--[if !IE]> <--><a>c</a><!--> <![endif]--><br /> 204<strong>Special characters inside: <!-- <![CDATA check ]]> -->, <!-- 3 < 4 > 3.5, & 4 &gt; 4 -->, <!-- che--ck -->, <!--[if !IE]> <--><a>c</a><!--> <![endif]--><br />
205<strong>Normal:</strong> <!-- check -->, <!--check -->, <em>comment:<!-- check --></em><!-- check -->, <table><!-- check --><tr><td>text not allowed</td></tr></table><br /> 205<strong>Normal:</strong> <!-- check -->, <!--check -->, <em>comment:<!-- check --></em><!-- check -->, <table><!-- check --><tr><td>text not allowed</td></tr></table><br />
206<strong>Malformed:</strong> <![cdata check ]]>, < ![CDATA check ]]>, < ![CDATA check ] ]><br /> 206<strong>Malformed:</strong> <![cdata check ]]>, < ![CDATA check ]]>, < ![CDATA check ] ]><br />
207Invalid:</strong> <em <!-- check -->>comment in tag content</em>, <!--check--> 207Invalid:</strong> <em <!-- check -->>comment in tag content</em>, <!--check-->
208 208
209<h6>HTML5</h6> 209<h6>HTML5</h6>
210 210
211<strong>figure and figcaption:</strong> <figure><img src="picture.jpg" alt="picture"><figcaption>Caption for the awesome picture</figcaption></figure> 211<strong>figure and figcaption:</strong> <figure><img src="picture.jpg" alt="picture"><figcaption>Caption for the awesome picture</figcaption></figure>
212<strong>article:</strong> <h1>A</h1><p>B</p><article><h2>C</h2></article><article><h2>E</h2><p>F</p><p>G</p></article> 212<strong>article:</strong> <h1>A</h1><p>B</p><article><h2>C</h2></article><article><h2>E</h2><p>F</p><p>G</p></article>
213<strong>meter</strong>: <p>Heat <meter min="100" max="200" value="150">150</meter>.</p> 213<strong>meter</strong>: <p>Heat <meter min="100" max="200" value="150">150</meter>.</p>
214<strong>datalist</strong>: <input list="b" /><datalist id="b"><option value="c"><option value="d"></datalist> 214<strong>datalist</strong>: <input list="b" /><datalist id="b"><option value="c"><option value="d"></datalist>
215 215
216<h6>Ins-Del</h6> 216<h6>Ins-Del</h6>
217 217
218(depending on context, these elements can be of either block or inline type)<br /> 218(depending on context, these elements can be of either block or inline type)<br />
219<p><ins datetime="d" cite="c"><div>block</div></ins></p><br /> 219<p><ins datetime="d" cite="c"><div>block</div></ins></p><br />
220<p><del>d</del></p><br /> 220<p><del>d</del></p><br />
221<p><ins><del>d</del></ins></p><div><ins><p><del><div>d</div></del></p></ins></div><ins><div>d</div></ins> 221<p><ins><del>d</del></ins></p><div><ins><p><del><div>d</div></del></p></ins></div><ins><div>d</div></ins>
222 222
223<h6>Lists</h6> 223<h6>Lists</h6>
224 224
225<strong>Invalid character data</strong>: <ul><li>(item</li>)</ul><br /> 225<strong>Invalid character data</strong>: <ul><li>(item</li>)</ul><br />
226<strong>Definition list</strong>: <dl><dt>a</dt>bad<dd>first <em>one</em></dd><dt>b</dt><dd>second</dd></dl><br /> 226<strong>Definition list</strong>: <dl><dt>a</dt>bad<dd>first <em>one</em></dd><dt>b</dt><dd>second</dd></dl><br />
227<strong>Definition list, close-tags omitted</strong>: <dl><dt>a</dt>bad<dd>first <em>one</em></dd><dt>b<dd>second</dl><br /> 227<strong>Definition list, close-tags omitted</strong>: <dl><dt>a</dt>bad<dd>first <em>one</em></dd><dt>b<dd>second</dl><br />
228<strong>Definition lists, nested</strong>: <dl> 228<strong>Definition lists, nested</strong>: <dl>
229 <dt>T1</dt> 229 <dt>T1</dt>
230 <dd>D1</dd> 230 <dd>D1</dd>
231 <dt>T2</dt> 231 <dt>T2</dt>
232 <dd>D2<dl><dt>t1</dt><dd>d1</dd><dt>t2</dt><dd>d2</dd></dl></dd> 232 <dd>D2<dl><dt>t1</dt><dd>d1</dd><dt>t2</dt><dd>d2</dd></dl></dd>
233 <dt>T3</dt> 233 <dt>T3</dt>
234 <dd>D3</dd> 234 <dd>D3</dd>
235 <dt>T4</dt> 235 <dt>T4</dt>
236 <dd>D4<dl><dt>t1</dt><dd>d1</dd></dl></dd> 236 <dd>D4<dl><dt>t1</dt><dd>d1</dd></dl></dd>
237</dl><br /> 237</dl><br />
238<strong>Definition lists, nested, close-tags omitted</strong>: <dl> 238<strong>Definition lists, nested, close-tags omitted</strong>: <dl>
239 <dt>T1 239 <dt>T1
240 <dd>D1</dd> 240 <dd>D1</dd>
241 <dt>T2</dt> 241 <dt>T2</dt>
242 <dd>D2<dl><dt>t1<dd>d1<dt>t2</dt><dd>d2</dd></dl></dd> 242 <dd>D2<dl><dt>t1<dd>d1<dt>t2</dt><dd>d2</dd></dl></dd>
243 <dt>T3 243 <dt>T3
244 <dd>D3 244 <dd>D3
245 <dt>T4 245 <dt>T4
246 <dd>D4<dl><dt>t1<dd>d1</dl></dd> 246 <dd>D4<dl><dt>t1<dd>d1</dl></dd>
247</dl><br /> 247</dl><br />
248<strong>Nested</strong>: <ul> 248<strong>Nested</strong>: <ul>
249 <li>l1</li> 249 <li>l1</li>
250 <li>l2<ol><li>lo1</li><li>lo2</li></ol></li> 250 <li>l2<ol><li>lo1</li><li>lo2</li></ol></li>
251 <li>l3</li> 251 <li>l3</li>
252 <li>l4<ol><li>lo3</li><li>lo4<ol><li>lo5</li></ol></li></ol></li> 252 <li>l4<ol><li>lo3</li><li>lo4<ol><li>lo5</li></ol></li></ol></li>
253</ul><br /> 253</ul><br />
254<strong>Nested, directly</strong>: <ul> 254<strong>Nested, directly</strong>: <ul>
255 <li>l1</li> 255 <li>l1</li>
256 <ol>l2</ol> 256 <ol>l2</ol>
257 <li>l3</li> 257 <li>l3</li>
258</ul><br /> 258</ul><br />
259<strong>Nested, close-tags omitted</strong>: <ul> 259<strong>Nested, close-tags omitted</strong>: <ul>
260 <li>l1</li> 260 <li>l1</li>
261 <li>l2<ol><li>lo1<li>lo2</ol> 261 <li>l2<ol><li>lo1<li>lo2</ol>
262 <li>l3 262 <li>l3
263 <li>l4<ol><li>lo3<li>lo4<ol><li>lo5</ol></ol> 263 <li>l4<ol><li>lo3<li>lo4<ol><li>lo5</ol></ol>
264</ul><br /> 264</ul><br />
265<strong>Complex</strong>: 265<strong>Complex</strong>:
266<ol><script></script><li><table><tr><td> 266<ol><script></script><li><table><tr><td>
267<ul><li id="search" class="widget widget_search"> <form id="searchform" method="get" action="http://kohei.us"> 267<ul><li id="search" class="widget widget_search"> <form id="searchform" method="get" action="http://kohei.us">
268 <div> 268 <div>
269 269
270 <input type="text" name="s" id="s" size="15" /><br /> 270 <input type="text" name="s" id="s" size="15" /><br />
271 <input type="submit" value="Search" /> 271 <input type="submit" value="Search" />
272 </div> 272 </div>
273 </form> 273 </form>
274 </li></ul> 274 </li></ul>
275</td></tr></table></li></ol> 275</td></tr></table></li></ol>
276<strong>Menu</strong>: <menu type="toolbar"><li><menu label="File"> 276<strong>Menu</strong>: <menu type="toolbar"><li><menu label="File">
277 <button type="button" onclick="new()">New...</button> 277 <button type="button" onclick="new()">New...</button>
278 </menu></li><li><menu label="Edit"><button type="button" onclick="cut()">Cut...</button></menu></li> 278 </menu></li><li><menu label="Edit"><button type="button" onclick="cut()">Cut...</button></menu></li>
279 </menu> 279 </menu>
280 280
281<h6>Microdata</h6> 281<h6>Microdata</h6>
282 282
283<div itemscope itemtype="http://data-vocabulary.org/Person"> 283<div itemscope itemtype="http://data-vocabulary.org/Person">
284I am <span itemprop="name">X</span> but people call me <span itemprop="nickname">Y</span>. 284I am <span itemprop="name">X</span> but people call me <span itemprop="nickname">Y</span>.
285Find me at <a href="http://www.xy.com" itemprop="url">www.xy.com</a> 285Find me at <a href="http://www.xy.com" itemprop="url">www.xy.com</a>
286</div> 286</div>
287 287
288<h6>Microsoft Word</h6> 288<h6>Microsoft Word</h6>
289 289
290<strong>Proprietary tag</strong>: <p class=3DMsoNormal><o:p>&nbsp;</o:p></p><br /> 290<strong>Proprietary tag</strong>: <p class=3DMsoNormal><o:p>&nbsp;</o:p></p><br />
291<strong>XML declaration</strong>: <?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /><br /> 291<strong>XML declaration</strong>: <?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /><br />
292<strong>XML-invalid character code-point (may not replicate)</strong>: <p class=3DMsoNormal>“Where is he?” asked both Mary – the one so lovely – and Jane.</p> 292<strong>XML-invalid character code-point (may not replicate)</strong>: <p class=3DMsoNormal>“Where is he?” asked both Mary – the one so lovely – and Jane.</p>
293 293
294<h6>Nesting</h6> 294<h6>Nesting</h6>
295 295
296<strong>Block or inline a</strong>: <p><a href="link">text</a></p><a href="link"><div>hi</div></a><br /> 296<strong>Block or inline a</strong>: <p><a href="link">text</a></p><a href="link"><div>hi</div></a><br />
297 297
298<h6>Non-English text-1</h6> 298<h6>Non-English text-1</h6>
299 299
300Inscrieţi-vă acum la a Zecea Conferinţă Internaţională<br /> 300Inscrieţi-vă acum la a Zecea Conferinţă Internaţională<br />
301გთხოვთ ახლავე გაიაროთ რეგისტრაცია<br /> 301გთხოვთ ახლავე გაიაროთ რეგისტრაცია<br />
302večjezično računalništvo<br /> 302večjezično računalništvo<br />
303<a title="อ.อ่าง">อ.อ่าง</a><br /> 303<a title="อ.อ่าง">อ.อ่าง</a><br />
304<a title="הירשמו 304<a title="הירשמו
305כעת לכנס ">Зарегистрируйтесь сейчас 305כעת לכנס ">Зарегистрируйтесь сейчас
306на Десятую Международную Конференцию по</a><br /> 306на Десятую Международную Конференцию по</a><br />
307(this file should have utf-8 encoding; some characters may not be displayed because of missing fonts, etc.) 307(this file should have utf-8 encoding; some characters may not be displayed because of missing fonts, etc.)
308 308
309<h6>Non-English text-2: entities</h6> 309<h6>Non-English text-2: entities</h6>
310 310
311&#29992;&#32479;&#19968;&#30721;<br /> 311&#29992;&#32479;&#19968;&#30721;<br />
312&#4306;&#4311;&#4334;&#4317;&#4309;&#4311;<br /> 312&#4306;&#4311;&#4334;&#4317;&#4309;&#4311;<br />
313Inscreva-se agora para a D&#233;cima Confer&#234;ncia Internacional Sobre O Unicode, realizada entre os dias 10 e 12 de mar&#231;o de 1997 em Mainz 313Inscreva-se agora para a D&#233;cima Confer&#234;ncia Internacional Sobre O Unicode, realizada entre os dias 10 e 12 de mar&#231;o de 1997 em Mainz
314na Alemanha. 314na Alemanha.
315 315
316<h6>Ruby</h6> 316<h6>Ruby</h6>
317 317
318(need compatible browser)<br /> 318(need compatible browser)<br />
319<ruby xml:lang="ja"> 319<ruby xml:lang="ja">
320 <rbc> 320 <rbc>
321 <rb>斎</rb> 321 <rb>斎</rb>
322 <rb>藤</rb> 322 <rb>藤</rb>
323 <rb>信</rb> 323 <rb>信</rb>
324 <rb>男</rb> 324 <rb>男</rb>
325 </rbc> 325 </rbc>
326 <rtc class="reading"> 326 <rtc class="reading">
327 <rt>さい</rt> 327 <rt>さい</rt>
328 <rt>とう</rt> 328 <rt>とう</rt>
329 <rt>のぶ</rt> 329 <rt>のぶ</rt>
330 <rt>お</rt> 330 <rt>お</rt>
331 </rtc> 331 </rtc>
332 <rtc class="annotation"> 332 <rtc class="annotation">
333 <rt rbspan="4" xml:lang="en">W3C Associate Chairman</rt> 333 <rt rbspan="4" xml:lang="en">W3C Associate Chairman</rt>
334 </rtc> 334 </rtc>
335</ruby><br /> 335</ruby><br />
336<ruby> 336<ruby>
337 <rb>WWW</rb> 337 <rb>WWW</rb>
338 <rp>(</rp><rt>World Wide Web</rt><rp>)</rp> 338 <rp>(</rp><rt>World Wide Web</rt><rp>)</rp>
339</ruby><br /> 339</ruby><br />
340<ruby> 340<ruby>
341 A 341 A
342 <rp>(</rp><rt>aaa</rt><rp>)</rp> 342 <rp>(</rp><rt>aaa</rt><rp>)</rp>
343</ruby> 343</ruby>
344 344
345 345
346<h6>Tables</h6> 346<h6>Tables</h6>
347 347
348<strong>Omitted closing tags:</strong> <table> 348<strong>Omitted closing tags:</strong> <table>
349<colgroup><col style="x" /><col style="y" /> 349<colgroup><col style="x" /><col style="y" />
350<thead> 350<thead>
351<tr><th>h1c1<th>h1c2 351<tr><th>h1c1<th>h1c2
352<tbody> 352<tbody>
353<tr><td>r1c1<td>r1c2 353<tr><td>r1c1<td>r1c2
354<tr><td>r2c1<td>r2c2 354<tr><td>r2c1<td>r2c2
355</table><br /> 355</table><br />
356<strong>Nested, omitted closing tags:</strong> <table> 356<strong>Nested, omitted closing tags:</strong> <table>
357<colgroup><col style="x" /><col style="y" /> 357<colgroup><col style="x" /><col style="y" />
358<thead> 358<thead>
359<tr><th>h1c1<th>h1c2 359<tr><th>h1c1<th>h1c2
360<tbody> 360<tbody>
361<tr><td>r1c1<td>r1c2<table> 361<tr><td>r1c1<td>r1c2<table>
362<colgroup><col style="x" /><col style="y" /> 362<colgroup><col style="x" /><col style="y" />
363<thead> 363<thead>
364<tr><th>h1c1<th>h1c2 364<tr><th>h1c1<th>h1c2
365<tbody> 365<tbody>
366<tr><td>r1c1<td>r1c2 366<tr><td>r1c1<td>r1c2
367<tr><td>r2c1<td>r2c2 367<tr><td>r2c1<td>r2c2
368</table> 368</table>
369<tr><td>r2c1<td>r2c2 369<tr><td>r2c1<td>r2c2
370</table><br /> 370</table><br />
371 371
372<h6>Tag transformation</h6> 372<h6>Tag transformation</h6>
373<strong>Font element with malicious code:</strong> <p><font color="z-index:123;width:100%;height:100%;position:fixed;top:0;left:0;background-size:cover;background-attachment:fixed;background-image:url(https://i.imgur.com/VQ30s65.png)"></font></p><br /> 373<strong>Font element with malicious code:</strong> <p><font color="z-index:123;width:100%;height:100%;position:fixed;top:0;left:0;background-size:cover;background-attachment:fixed;background-image:url(https://i.imgur.com/VQ30s65.png)"></font></p><br />
374<strong>Font element intended as 'inline' element:</strong> <p><font color='red'>hi</font></p><br /> 374<strong>Font element intended as 'inline' element:</strong> <p><font color='red'>hi</font></p><br />
375<strong>Font element intended as 'block' element:</strong> <div><font color='red'><div>hi</div></font></div><br /> 375<strong>Font element intended as 'block' element:</strong> <div><font color='red'><div>hi</div></font></div><br />
376<strong>Font element intended as 'block' element:</strong> <center><font color='red' face="serif, 'Times'"><div>hi</div><div>QQQ</div></font></center><br /> 376<strong>Font element intended as 'block' element:</strong> <center><font color='red' face="serif, 'Times'"><div>hi</div><div>QQQ</div></font></center><br />
377 377
378<h6>Tidy</h6> 378<h6>Tidy</h6>
379<strong>White-space handling:</strong> abc<em> def </em> ghi abc <em>def</em> ghi 379<strong>White-space handling:</strong> abc<em> def </em> ghi abc <em>def</em> ghi
380 380
381<h6>URLs</h6> 381<h6>URLs</h6>
382 382
383<strong>Relative and absolute:</strong> <a href="mailto:x"></a>, <a href="http://a.com/b/c/d.f"></a>, <a href="./../d.f"></a>, <a href="./d.f"></a>, <a href="d.f"></a>, <a href="#s"></a>, <a href="./../../d.f#s"></a><br /> 383<strong>Relative and absolute:</strong> <a href="mailto:x"></a>, <a href="http://a.com/b/c/d.f"></a>, <a href="./../d.f"></a>, <a href="./d.f"></a>, <a href="d.f"></a>, <a href="#s"></a>, <a href="./../../d.f#s"></a><br />
384(try base URL value of 'http://a.com/b/')<br /> 384(try base URL value of 'http://a.com/b/')<br />
385<strong>CSS URLs:</strong> <div style="background-image: url('a.gif');"></div>, <div style="background-image: URL(&quot;a.gif&quot;);"></div>, <div style="background-image: url('http://a.com/a.gif');"></div>, <div style="background-image: url('./../a.gif');"></div>, <div style="background-image: &#117;r&#x6C;('js&#58;xss'&#x29;"></div><br /> 385<strong>CSS URLs:</strong> <div style="background-image: url('a.gif');"></div>, <div style="background-image: URL(&quot;a.gif&quot;);"></div>, <div style="background-image: url('http://a.com/a.gif');"></div>, <div style="background-image: url('./../a.gif');"></div>, <div style="background-image: &#117;r&#x6C;('js&#58;xss'&#x29;"></div><br />
386<strong>Double URLs:</strong> <a style="behaviour: url(foo) url(http://example.com/xss.htc)">b</a><br /> 386<strong>Double URLs:</strong> <a style="behaviour: url(foo) url(http://example.com/xss.htc)">b</a><br />
387<strong>Anti-spam:</strong> (try regex for 'http://a.com', etc.) <a href="mailto:x@y.com"></a>, <a href="http://a.com/b@d.f"></a>, <a href="a.com/d.f" rel="nofollow"></a>, <a href="a.com/d.f" rel="1, 2"></a>, <a href="a.com/d.f"></a>, <a href="b.com/d.f"></a>, <a href="c.com/d.f">, <a href="denied:http://c.com/d.f"></a><br /> 387<strong>Anti-spam:</strong> (try regex for 'http://a.com', etc.) <a href="mailto:x@y.com"></a>, <a href="http://a.com/b@d.f"></a>, <a href="a.com/d.f" rel="nofollow"></a>, <a href="a.com/d.f" rel="1, 2"></a>, <a href="a.com/d.f"></a>, <a href="b.com/d.f"></a>, <a href="c.com/d.f">, <a href="denied:http://c.com/d.f"></a><br />
388<strong>Soft-hyphen:</strong> <a href="http://q=ídis­c">ídis­c</a> 388<strong>Soft-hyphen:</strong> <a href="http://q=ídis­c">ídis­c</a>
389 389
390<h6>XSS</h6> 390<h6>XSS</h6>
391 391
392<img alt="<img onmouseover=confirm(1)//"<""> 392<img alt="<img onmouseover=confirm(1)//"<"">
393'';!--"<xss>=&{()}<br /> 393'';!--"<xss>=&{()}<br />
394<img src="javascript%3Aalert('xss');" /><br /> 394<img src="javascript%3Aalert('xss');" /><br />
395<img src="javascript:alert('xss');" /><br /> 395<img src="javascript:alert('xss');" /><br />
396<img src="java script:alert('xss');" /><br /> 396<img src="java script:alert('xss');" /><br />
397<img 397<img
398src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41; /><br /> 398src=&#106;&#97;&#118;&#97;&#115;&#99;&#114;&#105;&#112;&#116;&#58;&#97;&#108;&#101;&#114;&#116;&#40;&#39;&#88;&#83;&#83;&#39;&#41; /><br />
399<font color='#FF6699"onmouseover="alert(1)//'>test</font> 399<font color='#FF6699"onmouseover="alert(1)//'>test</font>
400<font color='<img//onerror="alert`www.ptsecurity.com`"src=Psych0tr1a'> 400<font color='<img//onerror="alert`www.ptsecurity.com`"src=Psych0tr1a'>
401<div style="javascript:alert('xss');"></div><br /> 401<div style="javascript:alert('xss');"></div><br />
402<div style="background-image:url(javascript:alert('xss'));"></div><br /> 402<div style="background-image:url(javascript:alert('xss'));"></div><br />
403<div style="background-image:url(&quot;javascript:alert('xss')&quot; );"></div><br /> 403<div style="background-image:url(&quot;javascript:alert('xss')&quot; );"></div><br />
404<!--[if gte IE 4]><script>alert('xss');</script><![endif]--><br /> 404<!--[if gte IE 4]><script>alert('xss');</script><![endif]--><br />
405<script a=">" src="http://ha.ckers.org/xss.js"></script><br /> 405<script a=">" src="http://ha.ckers.org/xss.js"></script><br />
406<div style="background-image: &#117;r&#x6C;('js&#58;xss'&#x29;"></div><br /> 406<div style="background-image: &#117;r&#x6C;('js&#58;xss'&#x29;"></div><br />
407<a style=";-moz-binding:url(http://lukasz.pilorz.net/xss/xss.xml#xss)" href="http://example.com">test</a><br /> 407<a style=";-moz-binding:url(http://lukasz.pilorz.net/xss/xss.xml#xss)" href="http://example.com">test</a><br />
408<strong>Bad IE7:</strong> <a href="http://x&x=%22+style%3d%22background-image%3a+expression%28alert 408<strong>Bad IE7:</strong> <a href="http://x&x=%22+style%3d%22background-image%3a+expression%28alert
409%28%27xss%3f%29%29">x</a><br /> 409%28%27xss%3f%29%29">x</a><br />
410<strong>Opera:</strong> <a href="\xE2\x80\x83javascript:alert(123)">link</a> 410<strong>Opera:</strong> <a href="\xE2\x80\x83javascript:alert(123)">link</a>
411<strong>Bad IE7:</strong> <a style=color:expr/*comment*/ession(alert(document.domain))>xxx</a><br /> 411<strong>Bad IE7:</strong> <a style=color:expr/*comment*/ession(alert(document.domain))>xxx</a><br />
412<strong>Bad IE7:</strong> <a href="xxx" style="background: exp&#x72;ession(alert('xss'));">xxx</a><br /> 412<strong>Bad IE7:</strong> <a href="xxx" style="background: exp&#x72;ession(alert('xss'));">xxx</a><br />
413<strong>Bad IE7:</strong> <a href="xxx" style="background: &#101;xpression(alert('xss'));">xxx</a><br /> 413<strong>Bad IE7:</strong> <a href="xxx" style="background: &#101;xpression(alert('xss'));">xxx</a><br />
414<strong>Bad IE7:</strong> <a href="xxx" style="background: %45xpression(alert('xss'));">xxx</a><br /> 414<strong>Bad IE7:</strong> <a href="xxx" style="background: %45xpression(alert('xss'));">xxx</a><br />
415<strong>Bad IE7:</strong> <a href="xxx" style="background:/**/expression(alert('xss'));">xxx</a><br /> 415<strong>Bad IE7:</strong> <a href="xxx" style="background:/**/expression(alert('xss'));">xxx</a><br />
416<strong>Bad IE7:</strong> <a href="xxx" style="background:/**/&#69;xpression(alert('xss'));">xxx</a><br /> 416<strong>Bad IE7:</strong> <a href="xxx" style="background:/**/&#69;xpression(alert('xss'));">xxx</a><br />
417<strong>Bad IE7:</strong> <a href="xxx" style="background:/**/Exp&#x72;ession(alert('xss'));">xxx</a><br /> 417<strong>Bad IE7:</strong> <a href="xxx" style="background:/**/Exp&#x72;ession(alert('xss'));">xxx</a><br />
418<strong>Bad IE7:</strong> <a href="xxx" style="background: expr%45ssion(alert('xss'));">xxx</a><br /> 418<strong>Bad IE7:</strong> <a href="xxx" style="background: expr%45ssion(alert('xss'));">xxx</a><br />
419<strong>Bad IE7:</strong> <a href="xxx" style="background: exp/* */ression(alert('xss'));">xxx</a><br /> 419<strong>Bad IE7:</strong> <a href="xxx" style="background: exp/* */ression(alert('xss'));">xxx</a><br />
420<strong>Bad IE7:</strong> <a href="xxx" style="background: exp /* */ression(alert('xss'));">xxx</a><br /> 420<strong>Bad IE7:</strong> <a href="xxx" style="background: exp /* */ression(alert('xss'));">xxx</a><br />
421<strong>Bad IE7:</strong> <a href="xxx" style="background: exp/ * * /ression(alert('xss'));">xxx</a><br /> 421<strong>Bad IE7:</strong> <a href="xxx" style="background: exp/ * * /ression(alert('xss'));">xxx</a><br />
422<strong>Bad IE7:</strong> <a href="xxx" style="background:/* x */expression(alert('xss'));">xxx</a><br /> 422<strong>Bad IE7:</strong> <a href="xxx" style="background:/* x */expression(alert('xss'));">xxx</a><br />
423<strong>Bad IE7:</strong> <a href="xxx" style="background:/* */ */expression(alert('xss'));">xxx</a><br /> 423<strong>Bad IE7:</strong> <a href="xxx" style="background:/* */ */expression(alert('xss'));">xxx</a><br />
424<strong>Bad IE7:</strong> <a href="x" style="width: /****/**;;;;;;*/expression/**/(alert('xss'));">x</a><br /> 424<strong>Bad IE7:</strong> <a href="x" style="width: /****/**;;;;;;*/expression/**/(alert('xss'));">x</a><br />
425<strong>Bad IE7:</strong> <a href="x" style="padding:10px; background:/**/expression(alert('xss'));">x</a><br /> 425<strong>Bad IE7:</strong> <a href="x" style="padding:10px; background:/**/expression(alert('xss'));">x</a><br />
426<strong>Bad IE7:</strong> <a href="x" style="background: huh /* */ */expression(alert('xss'));">x</a><br /> 426<strong>Bad IE7:</strong> <a href="x" style="background: huh /* */ */expression(alert('xss'));">x</a><br />
427<strong>Bad IE7:</strong> <a href="x" style="background:/**/expression(alert('xss'));background:/**/expression(alert('xss'));">x</a><br /> 427<strong>Bad IE7:</strong> <a href="x" style="background:/**/expression(alert('xss'));background:/**/expression(alert('xss'));">x</a><br />
428<strong>Bad IE7:</strong> exp/*<a style='no\xss:noxss("*//*");xss:&#101;x&#x2F;*XSS*//*/*/pression(alert("XSS"))'>x</a><br /> 428<strong>Bad IE7:</strong> exp/*<a style='no\xss:noxss("*//*");xss:&#101;x&#x2F;*XSS*//*/*/pression(alert("XSS"))'>x</a><br />
429<strong>Bad IE7:</strong> <a style="background:&#69;xpre\ssion(alert('xss'));">hi</a><br /> 429<strong>Bad IE7:</strong> <a style="background:&#69;xpre\ssion(alert('xss'));">hi</a><br />
430<strong>Bad IE7:</strong> <a style="background:expre&#x5c;ssion(alert('xss'));">hi</a><br /> 430<strong>Bad IE7:</strong> <a style="background:expre&#x5c;ssion(alert('xss'));">hi</a><br />
431<strong>Bad IE7:</strong> <a style="color: \0065 \0078 \0070 \0072 \0065 \0073 \0073 \0069 \006f \006e \0028 \0061 \006c \0065 \0072 \0074 \0028 \0031 \0029 \0029">test</a><br /> 431<strong>Bad IE7:</strong> <a style="color: \0065 \0078 \0070 \0072 \0065 \0073 \0073 \0069 \006f \006e \0028 \0061 \006c \0065 \0072 \0074 \0028 \0031 \0029 \0029">test</a><br />
432<strong>Bad IE7:</strong> <a style="xss:e&#92;&#48;&#48;&#55;&#56;pression(window.x?0:(alert(/XSS/),window.x=1));">hi</a><br /> 432<strong>Bad IE7:</strong> <a style="xss:e&#92;&#48;&#48;&#55;&#56;pression(window.x?0:(alert(/XSS/),window.x=1));">hi</a><br />
433<strong>Bad IE7:</strong> <a style="background:url('java 433<strong>Bad IE7:</strong> <a style="background:url('java
434script:eval(document.all.mycode.expr)')">hi</a><br /> 434script:eval(document.all.mycode.expr)')">hi</a><br />
435 435
436<h6>Other</h6> 436<h6>Other</h6>
437 437
4383 < 4 <br /> 4383 < 4 <br />
4393 > 4 <br /> 4393 > 4 <br />
440 > 3 <br /> 440 > 3 <br />
441<._.> hi! <br /> 441<._.> hi! <br />
442<<< ALERT >>> <br /> 442<<< ALERT >>> <br />
443<![if !vml]> some stuff <![endif]> <br /> 443<![if !vml]> some stuff <![endif]> <br />
444<?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /> <br /> 444<?xml:namespace prefix = o ns = "urn:schemas-microsoft-com:office:office" /> <br />
445<uml:ns ns = "urn:www"> <br /> 445<uml:ns ns = "urn:www"> <br />
446<uml:ns ns = 'urn:www'> <br /> 446<uml:ns ns = 'urn:www'> <br />
447if(13<age AND 21>age){say 'teen'} <br /> 447if(13<age AND 21>age){say 'teen'} <br />
448age >51 and a smoking history of >51 pack-years <b>was</b> <br /> 448age >51 and a smoking history of >51 pack-years <b>was</b> <br />
449age > 51 and a smoking history of >51 pack-years <b>was</b> <br /> 449age > 51 and a smoking history of >51 pack-years <b>was</b> <br />
450age <51 and a smoking history of <51 pack-years <b>was</b> <br /> 450age <51 and a smoking history of <51 pack-years <b>was</b> <br />
451age < 51 and a smoking history of < 51 pack-years <b>was</b> <br /> 451age < 51 and a smoking history of < 51 pack-years <b>was</b> <br />
452<b>age >51 and a smoking history of >51 pack-years</b> <br /> 452<b>age >51 and a smoking history of >51 pack-years</b> <br />
453<b>age > 51 and a smoking history of >51 pack-years</b> <br /> 453<b>age > 51 and a smoking history of >51 pack-years</b> <br />
454<b>age <51 and a smoking history of <51 pack-years</b> <br /> 454<b>age <51 and a smoking history of <51 pack-years</b> <br />
455<b>age < 51 and a smoking history of < 51 pack-years</b> <br /> 455<b>age < 51 and a smoking history of < 51 pack-years</b> <br />