Adding url_filter_func configuration variable.

All URLs are now passed through the `encodeURLAttribute` function, which applies the filter as necessary. If the filter function changes the URL of an automatic link, this is reflected in the text of the link too.
Inline- and reference-style email links now benefit from entity obfuscation as well: `encodeURLAttribute` detects the `mailto:` scheme and applies the obfuscation to those URLs.

Fixes #85.
This commit is contained in:
Michel Fortin 2014-08-10 15:25:52 -04:00
parent 123c43a65c
commit a8c56ecd5e

View file

@ -59,6 +59,9 @@ class Markdown implements MarkdownInterface {
public $predef_urls = array();
public $predef_titles = array();
# Optional filter function for URLs
public $url_filter_func = null;
### Parser Implementation ###
@ -593,7 +596,7 @@ class Markdown implements MarkdownInterface {
if (isset($this->urls[$link_id])) {
$url = $this->urls[$link_id];
$url = $this->encodeAttribute($url);
$url = $this->encodeURLAttribute($url);
$result = "<a href=\"$url\"";
if ( isset( $this->titles[$link_id] ) ) {
@ -623,7 +626,7 @@ class Markdown implements MarkdownInterface {
if ($unhashed != $url)
$url = preg_replace('/^<(.*)>$/', '\1', $unhashed);
$url = $this->encodeAttribute($url);
$url = $this->encodeURLAttribute($url);
$result = "<a href=\"$url\"";
if (isset($title)) {
@ -704,7 +707,7 @@ class Markdown implements MarkdownInterface {
$alt_text = $this->encodeAttribute($alt_text);
if (isset($this->urls[$link_id])) {
$url = $this->encodeAttribute($this->urls[$link_id]);
$url = $this->encodeURLAttribute($this->urls[$link_id]);
$result = "<img src=\"$url\" alt=\"$alt_text\"";
if (isset($this->titles[$link_id])) {
$title = $this->titles[$link_id];
@ -728,7 +731,7 @@ class Markdown implements MarkdownInterface {
$title =& $matches[7];
$alt_text = $this->encodeAttribute($alt_text);
$url = $this->encodeAttribute($url);
$url = $this->encodeURLAttribute($url);
$result = "<img src=\"$url\" alt=\"$alt_text\"";
if (isset($title)) {
$title = $this->encodeAttribute($title);
@ -1262,6 +1265,33 @@ class Markdown implements MarkdownInterface {
}
protected function encodeURLAttribute($url, &$text = null) {
	#
	# Turn a URL into a value that can be placed inside a double-quoted
	# HTML attribute. When a URL filter is configured in
	# `$this->url_filter_func`, the URL is run through it first.
	#
	# The by-reference `$text` argument receives a textual representation
	# of the URL: for `mailto:` URLs whatever
	# `encodeEntityObfuscatedAttribute` stores in it for a head length of 7,
	# for `tel:` URLs the encoded URL without its first four characters,
	# and the encoded URL itself in every other case.
	#
	# This function is *not* suitable for attributes enclosed in single
	# quotes.
	#
	$filter = $this->url_filter_func;
	if ($filter) {
		$url = call_user_func($filter, $url);
	}

	# `mailto:` URLs take the entity-obfuscation path; 7 is the length of
	# the "mailto:" prefix that is left out of the link text.
	if (preg_match('{^mailto:}i', $url)) {
		return $this->encodeEntityObfuscatedAttribute($url, $text, 7);
	}

	# The scheme is tested on the filtered, not yet encoded URL.
	$is_tel = (bool) preg_match('{^tel:}i', $url);

	$encoded = $this->encodeAttribute($url);

	# For `tel:` URLs drop the four-character scheme prefix from the text.
	$text = $is_tel ? substr($encoded, 4) : $encoded;

	return $encoded;
}
protected function encodeAmpsAndAngles($text) {
#
# Smart processing for ampersands and angle brackets that need to
@ -1284,7 +1314,7 @@ class Markdown implements MarkdownInterface {
protected function doAutoLinks($text) {
$text = preg_replace_callback('{<((https?|ftp|dict):[^\'">\s]+)>}i',
$text = preg_replace_callback('{<((https?|ftp|dict|tel):[^\'">\s]+)>}i',
array($this, '_doAutoLinks_url_callback'), $text);
# Email addresses: <address@domain.foo>
@ -1307,47 +1337,45 @@ class Markdown implements MarkdownInterface {
>
}xi',
array($this, '_doAutoLinks_email_callback'), $text);
$text = preg_replace_callback('{<(tel:([^\'">\s]+))>}i',array($this, '_doAutoLinks_tel_callback'), $text);
return $text;
}
protected function _doAutoLinks_tel_callback($matches) {
	#
	# Regex callback for `<tel:...>` automatic links. `$matches[1]` is used
	# as the link target and `$matches[2]` as the link text; both are passed
	# through `encodeAttribute` before the anchor is assembled. The finished
	# anchor is handed to `hashPart` and that result is returned.
	#
	$href  = $this->encodeAttribute($matches[1]);
	$label = $this->encodeAttribute($matches[2]);

	return $this->hashPart('<a href="' . $href . '">' . $label . '</a>');
}
protected function _doAutoLinks_url_callback($matches) {
$url = $this->encodeAttribute($matches[1]);
$link = "<a href=\"$url\">$url</a>";
$url = $this->encodeURLAttribute($matches[1], $text);
$link = "<a href=\"$url\">$text</a>";
return $this->hashPart($link);
}
protected function _doAutoLinks_email_callback($matches) {
$address = $matches[1];
$link = $this->encodeEmailAddress($address);
$addr = $matches[1];
$url = $this->encodeURLAttribute("mailto:$addr", $text);
$link = "<a href=\"$url\">$text</a>";
return $this->hashPart($link);
}
protected function encodeEmailAddress($addr) {
protected function encodeEntityObfuscatedAttribute($text, &$tail = null, $head_length = 0) {
#
# Input: an email address, e.g. "foo@example.com"
# Input: some text to obfuscate, e.g. "mailto:foo@example.com"
#
# Output: the email address as a mailto link, with each character
# of the address encoded as either a decimal or hex entity, in
# the hopes of foiling most address harvesting spam bots. E.g.:
# Output: the same text but with most characters encoded as either a
# decimal or hex entity, in the hopes of foiling most address
# harvesting spam bots. E.g.:
#
# <p><a href="&#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111;
# &#109;&#x61;&#105;&#x6c;&#116;&#x6f;&#58;&#x66;o&#111;
# &#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;&#101;&#46;&#x63;&#111;
# &#x6d;">&#x66;o&#111;&#x40;&#101;&#x78;&#97;&#x6d;&#112;&#x6c;
# &#101;&#46;&#x63;&#111;&#x6d;</a></p>
# &#x6d;
#
# Note: the additional output $tail is assigned the same value as the
# output, minus the number of characters specified by $head_length.
#
# Based on a filter by Matthew Wickline, posted to BBEdit-Talk.
# With some optimizations by Milian Wolff.
# With some optimizations by Milian Wolff. Forced encoding of HTML
# attribute special characters by Allan Odgaard.
#
$addr = "mailto:" . $addr;
$chars = preg_split('/(?<!^)(?!$)/', $addr);
$seed = (int)abs(crc32($addr) / strlen($addr)); # Deterministic seed.
if ($text == "") return $tail = "";
$chars = preg_split('/(?<!^)(?!$)/', $text);
$seed = (int)abs(crc32($text) / strlen($text)); # Deterministic seed.
foreach ($chars as $key => $char) {
$ord = ord($char);
@ -1356,18 +1384,17 @@ class Markdown implements MarkdownInterface {
$r = ($seed * (1 + $key)) % 100; # Pseudo-random function.
# roughly 10% raw, 45% hex, 45% dec
# '@' *must* be encoded. I insist.
# '"' has to be encoded inside the attribute
if ($r > 90 && $char != '@' && $char != '"') /* do nothing */;
# '"' and '>' have to be encoded inside the attribute
if ($r > 90 && strpos('@"&>', $char) === false) /* do nothing */;
else if ($r < 45) $chars[$key] = '&#x'.dechex($ord).';';
else $chars[$key] = '&#'.$ord.';';
}
}
$addr = implode('', $chars);
$text = implode('', array_slice($chars, 7)); # text without `mailto:`
$addr = "<a href=\"$addr\">$text</a>";
$text = implode('', $chars);
$tail = $head_length ? implode('', array_slice($chars, $head_length)) : $text;
return $addr;
return $text;
}
@ -2296,7 +2323,7 @@ abstract class _MarkdownExtra_TmpImpl extends \Michelf\Markdown {
if (isset($this->urls[$link_id])) {
$url = $this->urls[$link_id];
$url = $this->encodeAttribute($url);
$url = $this->encodeURLAttribute($url);
$result = "<a href=\"$url\"";
if ( isset( $this->titles[$link_id] ) ) {
@ -2329,7 +2356,7 @@ abstract class _MarkdownExtra_TmpImpl extends \Michelf\Markdown {
if ($unhashed != $url)
$url = preg_replace('/^<(.*)>$/', '\1', $unhashed);
$url = $this->encodeAttribute($url);
$url = $this->encodeURLAttribute($url);
$result = "<a href=\"$url\"";
if (isset($title)) {
@ -2412,7 +2439,7 @@ abstract class _MarkdownExtra_TmpImpl extends \Michelf\Markdown {
$alt_text = $this->encodeAttribute($alt_text);
if (isset($this->urls[$link_id])) {
$url = $this->encodeAttribute($this->urls[$link_id]);
$url = $this->encodeURLAttribute($this->urls[$link_id]);
$result = "<img src=\"$url\" alt=\"$alt_text\"";
if (isset($this->titles[$link_id])) {
$title = $this->titles[$link_id];
@ -2439,7 +2466,7 @@ abstract class _MarkdownExtra_TmpImpl extends \Michelf\Markdown {
$attr = $this->doExtraAttributes("img", $dummy =& $matches[8]);
$alt_text = $this->encodeAttribute($alt_text);
$url = $this->encodeAttribute($url);
$url = $this->encodeURLAttribute($url);
$result = "<img src=\"$url\" alt=\"$alt_text\"";
if (isset($title)) {
$title = $this->encodeAttribute($title);