From 60be7c207e2fde8b230936999c7b9cbb94894658 Mon Sep 17 00:00:00 2001 From: Michel Fortin Date: Tue, 14 Aug 2007 16:15:40 -0400 Subject: [PATCH] Sync with Markdown.pl 1.0.2b7 and other fixes --- License.text | 2 +- PHP Markdown Readme.text | 20 +- markdown.php | 725 +++++++++++++++++++++++++++------------ 3 files changed, 505 insertions(+), 242 deletions(-) diff --git a/License.text b/License.text index e18fc09..0eafc74 100644 --- a/License.text +++ b/License.text @@ -1,4 +1,4 @@ -Copyright (c) 2004-2005, John Gruber +Copyright (c) 2004-2006, John Gruber All rights reserved. diff --git a/PHP Markdown Readme.text b/PHP Markdown Readme.text index 101dff1..9bc20ca 100644 --- a/PHP Markdown Readme.text +++ b/PHP Markdown Readme.text @@ -1,7 +1,7 @@ PHP Markdown ============ -Version 1.0.1oo - Fri 19 May 2006 +Version 1.0.2b7 - Sat 16 Sep 2006 by Michel Fortin @@ -184,17 +184,6 @@ expected; (3) the output PHP Markdown actually produced. Version History --------------- -1.0.1oo (19 May 2006) - -* Converted PHP Markdown to a object-oriented design. - - -1.0.1c (9 Dec 2005) - -* Fixed a problem occurring with PHP 5.1.1 due to a small - change to strings variable replacement behaviour in - this version. - 1.0.1b (6 Jun 2005) @@ -237,9 +226,6 @@ Version History tampering with Markdown-formatted text. More details here: -* Added a configuration variable for WordPress that can disable the - Markdown filter on comments. - 1.0.1a (15 Apr 2005) @@ -262,7 +248,7 @@ Version History filter so that it runs after Markdown. -1.0.2b1 (5 Mar 2005) +1.0.2b1 - 5 Mar 2005 * Fix for backticks within HTML tag: @@ -441,7 +427,7 @@ Version History Copyright and License --------------------- -Copyright (c) 2004-2006 Michel Fortin +Copyright (c) 2004-2005 Michel Fortin All rights reserved. 
diff --git a/markdown.php b/markdown.php index 426e7fc..b5ae177 100644 --- a/markdown.php +++ b/markdown.php @@ -12,7 +12,7 @@ # -define( 'MARKDOWN_VERSION', "1.0.1oo" ); # Fri 19 May 2006 +define( 'MARKDOWN_VERSION', "1.0.2b7" ); # Sat 16 Sep 2006 # @@ -62,7 +62,7 @@ function Markdown($text) { Plugin Name: Markdown Plugin URI: http://www.michelf.com/projects/php-markdown/ Description: Markdown syntax allows you to write using an easy-to-read, easy-to-write plain text format. Based on the original Perl version by John Gruber. More... -Version: 1.0.1oo +Version: 1.0.2b7 Author: Michel Fortin Author URI: http://www.michelf.com/ */ @@ -215,6 +215,11 @@ class Markdown_Parser { $this->escape_table[$char] = $hash; $this->backslash_escape_table["\\$char"] = $hash; } + + # Sort document, block, and span gamut in ascendent priority order. + asort($this->document_gamut); + asort($this->block_gamut); + asort($this->span_gamut); } @@ -222,6 +227,7 @@ class Markdown_Parser { var $urls = array(); var $titles = array(); var $html_blocks = array(); + var $html_hashes = array(); # Contains both blocks and span hashes. function transform($text) { @@ -238,6 +244,7 @@ class Markdown_Parser { $this->urls = array(); $this->titles = array(); $this->html_blocks = array(); + $this->html_hashes = array(); # Standardize line endings: # DOS to Unix and Mac to Unix @@ -249,24 +256,30 @@ class Markdown_Parser { # Convert all tabs to spaces. $text = $this->detab($text); + # Turn block-level HTML blocks into hash entries + $text = $this->hashHTMLBlocks($text); + # Strip any lines consisting only of spaces and tabs. # This makes subsequent regexen easier to write, because we can # match consecutive blank lines with /\n+/ instead of something # contorted like /[ \t]*\n+/ . $text = preg_replace('/^[ \t]+$/m', '', $text); - # Turn block-level HTML blocks into hash entries - $text = $this->hashHTMLBlocks($text); - - # Strip link definitions, store in hashes. 
- $text = $this->stripLinkDefinitions($text); - - $text = $this->runBlockGamut($text); - - $text = $this->unescapeSpecialChars($text); + # Run document gamut methods. + foreach ($this->document_gamut as $method => $priority) { + $text = $this->$method($text); + } return $text . "\n"; } + + var $document_gamut = array( + # Strip link definitions, store in hashes. + "stripLinkDefinitions" => 20, + + "runBasicBlockGamut" => 30, + "unescapeSpecialChars" => 90, + ); function stripLinkDefinitions($text) { @@ -278,7 +291,7 @@ class Markdown_Parser { # Link defs are in the form: ^[id]: url "optional title" $text = preg_replace_callback('{ - ^[ ]{0,'.$less_than_tab.'}\[(.+)\]: # id = $1 + ^[ ]{0,'.$less_than_tab.'}\[(.+)\][ ]?: # id = $1 [ \t]* \n? # maybe *one* newline [ \t]* @@ -317,11 +330,48 @@ class Markdown_Parser { # "paragraphs" that are wrapped in non-block-level tags, such as anchors, # phrase emphasis, and spans. The list of tags we're looking for is # hard-coded: - $block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'. + $block_tags_a = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'. 'script|noscript|form|fieldset|iframe|math|ins|del'; - $block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|'. + $block_tags_b = 'p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|address|'. 'script|noscript|form|fieldset|iframe|math'; + # Regular expression for the content of a block tag. + $nested_tags_level = 4; + $attr = ' + (?> # optional tag attributes + \s # starts with whitespace + (?> + [^>"/]+ # text outside quotes + | + /+(?!>) # slash not followed by ">" + | + "[^"]*" # text inside double quotes (tolerate ">") + | + \'[^\']*\' # text inside single quotes (tolerate ">") + )* + )? + '; + $content = + str_repeat(' + (?> + [^<]+ # content without tag + | + <\2 # nested opening tag + '.$attr.' # attributes + (?: + /> + | + >', $nested_tags_level). # end of opening tag + '.*?'. 
# last level nested tag content + str_repeat(' + # closing nested tag + ) + | + <(?!/\2\s*> # other tags with a different name + ) + )*', + $nested_tags_level); + # First, look for nested blocks, e.g.: #
#
@@ -333,34 +383,34 @@ class Markdown_Parser { # the inner nested divs must be indented. # We need to do this before the next, more liberal match, because the next # match will start at the first `
` and stop at the first `
`. - $text = preg_replace_callback("{ + $text = preg_replace_callback('{ ( # save in $1 ^ # start of line (with /m) - <($block_tags_a) # start tag = $2 - \\b # word break - (.*\\n)*? # any number of lines, minimally matching - # the matching end tag - [ \\t]* # trailing spaces/tabs - (?=\\n+|\\Z) # followed by a newline or end of document + <('.$block_tags_a.')# start tag = $2 + '.$attr.'>\n # attributes followed by > and \n + '.$content.' # content, support nesting + # the matching end tag + [ \t]* # trailing spaces/tabs + (?=\n+|\Z) # followed by a newline or end of document ) - }xm", + }xm', array(&$this, '_hashHTMLBlocks_callback'), $text); # - # Now match more liberally, simply from `\n` to `\n` + # Match from `\n` to `\n`, handling nested tags in between. # - $text = preg_replace_callback("{ + $text = preg_replace_callback('{ ( # save in $1 ^ # start of line (with /m) - <($block_tags_b) # start tag = $2 - \\b # word break - (.*\\n)*? # any number of lines, minimally matching - .* # the matching end tag - [ \\t]* # trailing spaces/tabs - (?=\\n+|\\Z) # followed by a newline or end of document + <('.$block_tags_b.')# start tag = $2 + '.$attr.'> # attributes followed by > + '.$content.' # content, support nesting + # the matching end tag + [ \t]* # trailing spaces/tabs + (?=\n+|\Z) # followed by a newline or end of document ) - }xm", + }xm', array(&$this, '_hashHTMLBlocks_callback'), $text); @@ -387,117 +437,205 @@ class Markdown_Parser { # Special case for standalone HTML comments: $text = preg_replace_callback('{ - (?: - (?<=\n\n) # Starting after a blank line - | # or - \A\n? # the beginning of the doc + (?: + (?<=\n\n) # Starting after a blank line + | # or + \A\n? 
# the beginning of the doc + ) + ( # save in $1 + [ ]{0,'.$less_than_tab.'} + (?s: + ) - ( # save in $1 - [ ]{0,'.$less_than_tab.'} - (?s: - - ) - [ \t]* - (?=\n{2,}|\Z) # followed by a blank line or end of document + [ \t]* + (?=\n{2,}|\Z) # followed by a blank line or end of document + ) + }x', + array(&$this, '_hashHTMLBlocks_callback'), + $text); + + # PHP and ASP-style processor instructions () + $text = preg_replace_callback('{ + (?: + (?<=\n\n) # Starting after a blank line + | # or + \A\n? # the beginning of the doc + ) + ( # save in $1 + [ ]{0,'.$less_than_tab.'} + (?s: + <([?%]) # $2 + .*? + \2> ) - }x', - array(&$this, '_hashHTMLBlocks_callback'), - $text); + [ \t]* + (?=\n{2,}|\Z) # followed by a blank line or end of document + ) + }x', + array(&$this, '_hashHTMLBlocks_callback'), + $text); return $text; } function _hashHTMLBlocks_callback($matches) { $text = $matches[1]; - $key = md5($text); - $this->html_blocks[$key] = $text; - return "\n\n$key\n\n"; # String that will replace the block + $key = $this->hashBlock($text); + return "\n\n$key\n\n"; } - function runBlockGamut($text) { + function hashBlock($text) { + # + # Called whenever a tag must be hashed when a function insert a block-level + # tag in $text, it pass through this function and is automaticaly escaped, + # which remove the need to call _HashHTMLBlocks at every step. + # + # Swap back any tag hash found in $text so we do not have to `unhash` + # multiple times at the end. + $text = $this->unhash($text); + + # Then hash the block. + $key = md5($text); + $this->html_hashes[$key] = $text; + $this->html_blocks[$key] = $text; + return $key; # String that will replace the tag. + } + + + function hashSpan($text) { + # + # Called whenever a tag must be hashed when a function insert a span-level + # element in $text, it pass through this function and is automaticaly + # escaped, blocking invalid nested overlap. 
+ # + # Swap back any tag hash found in $text so we do not have to `unhash` + # multiple times at the end. + $text = $this->unhash($text); + + # Then hash the span. + $key = md5($text); + $this->html_hashes[$key] = $text; + return $key; # String that will replace the span tag. + } + + + var $block_gamut = array( # # These are all the transformations that form block-level # tags like paragraphs, headers, and list items. # - $text = $this->doHeaders($text); + "doHeaders" => 10, + "doHorizontalRules" => 20, + + "doLists" => 40, + "doCodeBlocks" => 50, + "doBlockQuotes" => 60, + ); - # Do Horizontal Rules: - $text = preg_replace( - array('{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}mx', - '{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}mx', - '{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}mx'), - "\nempty_element_suffix\n", - $text); - - $text = $this->doLists($text); - $text = $this->doCodeBlocks($text); - $text = $this->doBlockQuotes($text); - - # We already ran _HashHTMLBlocks() before, in Markdown(), but that - # was to escape raw HTML in the original Markdown source. This time, - # we're escaping the markup we've just created, so that we don't wrap - #

tags around block-level tags. + function runBlockGamut($text) { + # + # Run block gamut tranformations. + # + # We need to escape raw HTML in Markdown source before doing anything + # else. This need to be done for each block, and not only at the + # begining in the Markdown function since hashed blocks can be part of + # list items and could have been indented. Indented blocks would have + # been seen as a code block in a previous pass of hashHTMLBlocks. $text = $this->hashHTMLBlocks($text); + + return $this->runBasicBlockGamut($text); + } + + function runBasicBlockGamut($text) { + # + # Run block gamut tranformations, without hashing HTML blocks. This is + # useful when HTML blocks are known to be already hashed, like in the first + # whole-document pass. + # + foreach ($this->block_gamut as $method => $priority) { + $text = $this->$method($text); + } + + # Finally form paragraph and restore hashed blocks. $text = $this->formParagraphs($text); return $text; } + + + function doHorizontalRules($text) { + # Do Horizontal Rules: + return preg_replace( + array('{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}mx', + '{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}mx', + '{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}mx'), + "\n".$this->hashBlock("empty_element_suffix")."\n", + $text); + } - function runSpanGamut($text) { + var $span_gamut = array( # # These are all the transformations that occur *within* block-level # tags like paragraphs, headers, and list items. # - $text = $this->doCodeSpans($text); - - $text = $this->escapeSpecialChars($text); + "escapeSpecialCharsWithinTagAttributes" => -20, + "doCodeSpans" => -10, + "encodeBackslashEscapes" => -5, # Process anchor and image tags. Images must come first, # because ![foo][f] looks like an anchor. 
- $text = $this->doImages($text); - $text = $this->doAnchors($text); - + "doImages" => 10, + "doAnchors" => 20, + # Make links out of things like `` - # Must come after _DoAnchors(), because you can use < and > + # Must come after doAnchors, because you can use < and > # delimiters in inline links like [this](). - $text = $this->doAutoLinks($text); - $text = $this->encodeAmpsAndAngles($text); - $text = $this->doItalicsAndBold($text); + "doAutoLinks" => 30, + "encodeAmpsAndAngles" => 40, - # Do hard breaks: - $text = preg_replace('/ {2,}\n/', "empty_element_suffix\n", $text); + "doItalicsAndBold" => 50, + "doHardBreaks" => 60, + ); + + function runSpanGamut($text) { + # + # Run span gamut tranformations. + # + foreach ($this->span_gamut as $method => $priority) { + $text = $this->$method($text); + } return $text; } + + + function doHardBreaks($text) { + # Do hard breaks: + $br_tag = $this->hashSpan("empty_element_suffix\n"); + return preg_replace('/ {2,}\n/', $br_tag, $text); + } - function escapeSpecialChars($text) { + function escapeSpecialCharsWithinTagAttributes($text) { + # + # Within tags -- meaning between < and > -- encode [\ ` * _] so they + # don't conflict with their use in Markdown for code, italics and strong. + # We're replacing each such character with its corresponding MD5 checksum + # value; this is likely overkill, but it should prevent us from colliding + # with the escape values by accident. + # $tokens = $this->tokenizeHTML($text); - $text = ''; # rebuild $text from the tokens - # $in_pre = 0; # Keep track of when we're inside

 or  tags.
-	#	$tags_to_skip = "!<(/?)(?:pre|code|kbd|script|math)[\s>]!";
 
 		foreach ($tokens as $cur_token) {
 			if ($cur_token[0] == 'tag') {
-				# Within tags, encode * and _ so they don't conflict
-				# with their use in Markdown for italics and strong.
-				# We're replacing each such character with its
-				# corresponding MD5 checksum value; this is likely
-				# overkill, but it should prevent us from colliding
-				# with the escape values by accident.
-				$cur_token[1] = str_replace(array('*', '_'),
-					array($this->escape_table['*'], $this->escape_table['_']),
-					$cur_token[1]);
-				$text .= $cur_token[1];
-			} else {
-				$t = $cur_token[1];
-				$t = $this->encodeBackslashEscapes($t);
-				$text .= $t;
+				$cur_token[1] = str_replace('\\', $this->escape_table['\\'], $cur_token[1]);
+				$cur_token[1] = str_replace(array('`'), $this->escape_table['`'], $cur_token[1]);
+				$cur_token[1] = str_replace('*', $this->escape_table['*'], $cur_token[1]);
+				$cur_token[1] = str_replace('_', $this->escape_table['_'], $cur_token[1]);
 			}
+			$text .= $cur_token[1];
 		}
 		return $text;
 	}
@@ -510,70 +648,89 @@ class Markdown_Parser {
 		#
 		# First, handle reference-style links: [link text] [id]
 		#
-		$text = preg_replace_callback("{
+		$text = preg_replace_callback('{
 			(					# wrap whole match in $1
-			  \\[
-				($this->nested_brackets)	# link text = $2
-			  \\]
+			  \[
+				('.$this->nested_brackets.')	# link text = $2
+			  \]
 
 			  [ ]?				# one optional space
-			  (?:\\n[ ]*)?		# one optional newline followed by spaces
+			  (?:\n[ ]*)?		# one optional newline followed by spaces
 
-			  \\[
+			  \[
 				(.*?)		# id = $3
-			  \\]
+			  \]
 			)
-			}xs",
+			}xs',
 			array(&$this, '_doAnchors_reference_callback'), $text);
 
 		#
 		# Next, inline-style links: [link text](url "optional title")
 		#
-		$text = preg_replace_callback("{
+		$text = preg_replace_callback('{
 			(				# wrap whole match in $1
-			  \\[
-				($this->nested_brackets)	# link text = $2
-			  \\]
-			  \\(			# literal paren
-				[ \\t]*
+			  \[
+				('.$this->nested_brackets.')	# link text = $2
+			  \]
+			  \(			# literal paren
+				[ \t]*
 				?	# href = $3
-				[ \\t]*
+				[ \t]*
 				(			# $4
-				  (['\"])	# quote char = $5
+				  ([\'"])	# quote char = $5
 				  (.*?)		# Title = $6
-				  \\5		# matching quote
+				  \5		# matching quote
+				  [ \t]*	# ignore any spaces/tabs between closing quote and )
 				)?			# title is optional
-			  \\)
+			  \)
 			)
-			}xs",
+			}xs',
 			array(&$this, '_DoAnchors_inline_callback'), $text);
 
+		#
+		# Last, handle reference-style shortcuts: [link text]
+		# These must come last in case you've also got [link text][1]
+		# or [link text](/foo)
+		#
+		$text = preg_replace_callback('{
+			(					# wrap whole match in $1
+			  \[
+				([^\[\]]+)		# link text = $2; can\'t contain [ or ]
+			  \]
+			)
+			}xs',
+			array(&$this, '_doAnchors_reference_callback'), $text);
+
 		return $text;
 	}
 	function _doAnchors_reference_callback($matches) {
-		$whole_match = $matches[1];
-		$link_text   = $matches[2];
-		$link_id     = strtolower($matches[3]);
+		$whole_match =  $matches[1];
+		$link_text   =  $matches[2];
+		$link_id     =& $matches[3];
 
 		if ($link_id == "") {
-			$link_id = strtolower($link_text); # for shortcut links like [this][].
+			# for shortcut links like [this][] or [this].
+			$link_id = $link_text;
 		}
+		
+		# lower-case and turn embedded newlines into spaces
+		$link_id = strtolower($link_id);
+		$link_id = preg_replace('{[ ]?\n}', ' ', $link_id);
 
 		if (isset($this->urls[$link_id])) {
 			$url = $this->urls[$link_id];
-			# We've got to encode these to avoid conflicting with italics/bold.
-			$url = str_replace(array('*', '_'),
-				array($this->escape_table['*'], $this->escape_table['_']),
-				$url);
+			$url = $this->encodeAmpsAndAngles($url);
+			
 			$result = "titles[$link_id] ) ) {
 				$title = $this->titles[$link_id];
-				$title = str_replace(array('*',     '_'),
-									 array($this->escape_table['*'], 
-										   $this->escape_table['_']), $title);
+				$title = $this->encodeAmpsAndAngles($title);
 				$result .=  " title=\"$title\"";
 			}
+		
+			$link_text = $this->runSpanGamut($link_text);
 			$result .= ">$link_text";
+			$result = $this->hashSpan($result);
 		}
 		else {
 			$result = $whole_match;
@@ -581,27 +738,24 @@ class Markdown_Parser {
 		return $result;
 	}
 	function _doAnchors_inline_callback($matches) {
-		$whole_match	= $matches[1];
-		$link_text		= $matches[2];
-		$url			= $matches[3];
+		$whole_match	=  $matches[1];
+		$link_text		=  $this->runSpanGamut($matches[2]);
+		$url			=  $matches[3];
 		$title			=& $matches[6];
+		
+		$url = $this->encodeAmpsAndAngles($url);
 
-		# We've got to encode these to avoid conflicting with italics/bold.
-		$url = str_replace(array('*', '_'),
-						   array($this->escape_table['*'], $this->escape_table['_']), 
-						   $url);
 		$result = "escape_table['*'], $this->escape_table['_']),
-								 $title);
+			$title = $this->encodeAmpsAndAngles($title);
 			$result .=  " title=\"$title\"";
 		}
 		
+		$link_text = $this->runSpanGamut($link_text);
 		$result .= ">$link_text";
 
-		return $result;
+		return $this->hashSpan($result);
 	}
 
 
@@ -632,12 +786,13 @@ class Markdown_Parser {
 		#
 		# Next, handle inline images:  ![alt text](url "optional title")
 		# Don't forget: encode * and _
-
+		#
 		$text = preg_replace_callback('{
 			(				# wrap whole match in $1
 			  !\[
 				('.$this->nested_brackets.')		# alt text = $2
 			  \]
+			  \s?			# One optional whitespace character
 			  \(			# literal paren
 				[ \t]*
 				?	# src url = $3
@@ -667,19 +822,13 @@ class Markdown_Parser {
 		$alt_text = str_replace('"', '"', $alt_text);
 		if (isset($this->urls[$link_id])) {
 			$url = $this->urls[$link_id];
-			# We've got to encode these to avoid conflicting with italics/bold.
-			$url = str_replace(array('*', '_'),
-							   array($this->escape_table['*'], $this->escape_table['_']),
-							   $url);
 			$result = "\"$alt_text\"";titles[$link_id])) {
 				$title = $this->titles[$link_id];
-				$title = str_replace(array('*', '_'),
-									 array($this->escape_table['*'], 
-										   $this->escape_table['_']), $title);
 				$result .=  " title=\"$title\"";
 			}
 			$result .= $this->empty_element_suffix;
+			$result = $this->hashSpan($result);
 		}
 		else {
 			# If there's no such link ID, leave intact:
@@ -699,20 +848,13 @@ class Markdown_Parser {
 
 		$alt_text = str_replace('"', '"', $alt_text);
 		$title    = str_replace('"', '"', $title);
-		# We've got to encode these to avoid conflicting with italics/bold.
-		$url = str_replace(array('*', '_'),
-						   array($this->escape_table['*'], $this->escape_table['_']),
-						   $url);
 		$result = "\"$alt_text\"";escape_table['*'], $this->escape_table['_']),
-								 $title);
 			$result .=  " title=\"$title\""; # $title already quoted
 		}
 		$result .= $this->empty_element_suffix;
 
-		return $result;
+		return $this->hashSpan($result);
 	}
 
 
@@ -749,14 +891,14 @@ class Markdown_Parser {
 		return $text;
 	}
 	function _doHeaders_callback_setext_h1($matches) {
-		return "

".$this->runSpanGamut($matches[1])."

\n\n"; + return $this->hashBlock("

".$this->runSpanGamut($matches[1])."

")."\n\n"; } function _doHeaders_callback_setext_h2($matches) { - return "

".$this->runSpanGamut($matches[1])."

\n\n"; + return $this->hashBlock("

".$this->runSpanGamut($matches[1])."

")."\n\n"; } function _doHeaders_callback_atx($matches) { $level = strlen($matches[1]); - return "".$this->runSpanGamut($matches[2])."\n\n"; + return $this->hashBlock("".$this->runSpanGamut($matches[2])."")."\n\n"; } @@ -804,20 +946,20 @@ class Markdown_Parser { ^ '.$whole_list.' }mx', - array(&$this, '_doLists_callback_top'), $text); + array(&$this, '_doLists_callback'), $text); } else { $text = preg_replace_callback('{ - (?:(?<=\n\n)|\A\n?) + (?:(?<=\n)\n|\A\n?) # Must eat the newline '.$whole_list.' }mx', - array(&$this, '_doLists_callback_nested'), $text); + array(&$this, '_doLists_callback'), $text); } } return $text; } - function _doLists_callback_top($matches) { + function _doLists_callback($matches) { # Re-usable patterns to match list item bullets and number markers: $marker_ul = '[*+-]'; $marker_ol = '\d+[.]'; @@ -833,31 +975,8 @@ class Markdown_Parser { $list = preg_replace("/\n{2,}/", "\n\n\n", $list); $result = $this->processListItems($list, $marker_any); - # Trim any trailing whitespace, to put the closing `` - # up on the preceding line, to get it past the current stupid - # HTML block parser. This is a hack to work around the terrible - # hack that is the HTML block parser. - $result = rtrim($result); - $result = "<$list_type>" . $result . "\n"; - return $result; - } - function _doLists_callback_nested($matches) { - # Re-usable patterns to match list item bullets and number markers: - $marker_ul = '[*+-]'; - $marker_ol = '\d+[.]'; - $marker_any = "(?:$marker_ul|$marker_ol)"; - - $list = $matches[1]; - $list_type = preg_match("/$marker_ul/", $matches[3]) ? "ul" : "ol"; - - $marker_any = ( $list_type == "ul" ? $marker_ul : $marker_ol ); - - # Turn double returns into triple returns, so that we can make a - # paragraph for the last item in a list, if necessary: - $list = preg_replace("/\n{2,}/", "\n\n\n", $list); - $result = $this->processListItems($list, $marker_any); - $result = "<$list_type>\n" . $result . 
"\n"; - return $result; + $result = $this->hashBlock("<$list_type>\n" . $result . ""); + return "\n". $result ."\n\n"; } var $list_level = 0; @@ -949,9 +1068,9 @@ class Markdown_Parser { $codeblock = $this->encodeCode($this->outdent($codeblock)); // $codeblock = $this->detab($codeblock); # trim leading newlines and trailing whitespace - $codeblock = preg_replace(array('/\A\n+/', '/\s+\z/'), '', $codeblock); + $codeblock = preg_replace(array('/\A\n+/', '/\n+\z/'), '', $codeblock); - $result = "\n\n
" . $codeblock . "\n
\n\n"; + $result = "\n\n".$this->hashBlock("
" . $codeblock . "\n
")."\n\n"; return $result; } @@ -999,7 +1118,7 @@ class Markdown_Parser { $c = preg_replace('/^[ \t]*/', '', $c); # leading whitespace $c = preg_replace('/[ \t]*$/', '', $c); # trailing whitespace $c = $this->encodeCode($c); - return "$c"; + return $this->hashSpan("$c"); } @@ -1018,8 +1137,8 @@ class Markdown_Parser { array('<', '>'), $_); # Now, escape characters that are magic in Markdown: - $_ = str_replace(array_keys($this->escape_table), - array_values($this->escape_table), $_); +// $_ = str_replace(array_keys($this->escape_table), +// array_values($this->escape_table), $_); return $_; } @@ -1027,7 +1146,7 @@ class Markdown_Parser { function doItalicsAndBold($text) { # must go first: - $text = preg_replace('{ + $text = preg_replace_callback('{ ( # $1: Marker (?\2', $text); + array(&$this, '_doItalicAndBold_strong_callback'), $text); # Then : - $text = preg_replace( + $text = preg_replace_callback( '{ ( (?\2', $text); + array(&$this, '_doItalicAndBold_em_callback'), $text); return $text; } + function _doItalicAndBold_em_callback($matches) { + $text = $matches[2]; + $text = $this->runSpanGamut($text); + return $this->hashSpan("$text"); + } + function _doItalicAndBold_strong_callback($matches) { + $text = $matches[2]; + $text = $this->runSpanGamut($text); + return $this->hashSpan("$text"); + } function doBlockQuotes($text) { @@ -1082,7 +1211,7 @@ class Markdown_Parser { $bq = preg_replace_callback('{(\s*
.+?
)}sx', array(&$this, '_DoBlockQuotes_callback2'), $bq); - return "
\n$bq\n
\n\n"; + return $this->hashBlock("
\n$bq\n
")."\n\n"; } function _doBlockQuotes_callback2($matches) { $pre = $matches[1]; @@ -1107,18 +1236,62 @@ class Markdown_Parser { foreach ($grafs as $key => $value) { if (!isset( $this->html_blocks[$value] )) { $value = $this->runSpanGamut($value); - $value = preg_replace('/^([ \t]*)/', '

', $value); + $value = preg_replace('/^([ \t]*)/', "

", $value); $value .= "

"; - $grafs[$key] = $value; + $grafs[$key] = $this->unhash($value); } } # # Unhashify HTML blocks # - foreach ($grafs as $key => $value) { - if (isset( $this->html_blocks[$value] )) { - $grafs[$key] = $this->html_blocks[$value]; +// foreach ($grafs as $key => $value) { +// if (isset( $this->html_blocks[$value] )) { +// $grafs[$key] = $this->html_blocks[$value]; +// } +// } + + foreach ($grafs as $key => $graf) { + # Modify elements of @grafs in-place... + if (isset($this->html_blocks[$graf])) { + $block = $this->html_blocks[$graf]; + $graf = $block; + if (preg_match('{ + \A + ( # $1 =
tag +
]* + \b + markdown\s*=\s* ([\'"]) # $2 = attr quote char + 1 + \2 + [^>]* + > + ) + ( # $3 = contents + .* + ) + (
) # $4 = closing tag + \z + }xs', $block, $matches)) + { + list(, $div_open, , $div_content, $div_close) = $matches; + + # We can't call Markdown(), because that resets the hash; + # that initialization code should be pulled into its own sub, though. + $div_content = $this->hashHTMLBlocks($div_content); + + # Run document gamut methods on the content. + foreach ($this->document_gamut as $method => $priority) { + $div_content = $this->$method($div_content); + } + + $div_open = preg_replace( + '{\smarkdown\s*=\s*([\'"]).+?\1}', '', $div_open); + + $graf = $div_open . "\n" . $div_content . "\n" . $div_close; + } + $grafs[$key] = $graf; } } @@ -1154,7 +1327,7 @@ class Markdown_Parser { function doAutoLinks($text) { - $text = preg_replace("!<((https?|ftp):[^'\">\\s]+)>!", + $text = preg_replace('{<((https?|ftp|dict):[^\'">\s]+)>}', '\1', $text); # Email addresses: @@ -1176,7 +1349,7 @@ class Markdown_Parser { $address = $matches[1]; $address = $this->unescapeSpecialChars($address); $address = $this->encodeEmailAddress($address); - return $address; + return $this->hashSpan($address); } @@ -1237,29 +1410,61 @@ class Markdown_Parser { # run of text between tags. Each element of the array is a # two-element array; the first is either 'tag' or 'text'; # the second is the actual value. + # Note: Takes code spans into account and does not generate tag + # tokens inside code spans. # - # - # Regular expression derived from the _tokenize() subroutine in - # Brad Choate's MTRegex plugin. - # - # - $index = 0; $tokens = array(); - $match = '(?s:)|'. # comment - '(?s:<\?.*?\?>)|'. # processing instruction - # regular tags - '(?:<[/!$]?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)'; - - $parts = preg_split("{($match)}", $str, -1, PREG_SPLIT_DELIM_CAPTURE); - - foreach ($parts as $part) { - if (++$index % 2 && $part != '') - $tokens[] = array('text', $part); - else - $tokens[] = array('tag', $part); + while ($str != "") { + # + # + # + $parts = preg_split('{ + ( + (? 
# comment + | + <\?.*?\?> | <%.*?%> # processing instruction + | + <[/!$]?[-a-zA-Z0-9:]+ # regular tags + (?: + \s + (?>[^"\'>]+|"[^"]*"|\'[^\']*\')* + )? + > + ) + }xs', $str, 2, PREG_SPLIT_DELIM_CAPTURE); + + # Create token from text preceding tag. + if ($parts[0] != "") { + $tokens[] = array('text', $parts[0]); + } + + # Check if we reach the end. + if (count($parts) < 3) { + break; + } + + # Create token from tag or code span. + if ($parts[1]{0} == "`") { + $tokens[] = array('text', $parts[1]); + $str = $parts[2]; + + # Skip the whole code span, pass as text token. + if (preg_match('/^(.*(?html_hashes), + array_values($this->html_hashes), $text); + } + } @@ -1344,6 +1558,69 @@ Version History See the readme file for detailed release notes for this version. +1.0.2b7 (16 Sep 2006) + +* Changed span and block gamut methods so that they loop over a + customizable list of methods. This makes subclassing the parser a more + interesting option for creating syntax extensions. + +* Also added a "document" gamut loop which can be used to hook document-level + methods (like for striping link definitions). + +* Changed all methods which were inserting HTML code so that they now return + a hashed representation of the code. New methods `hashSpan` and `hashBlock` + are used to hash respectivly span- and block-level generated content. This + has a couple of significant effects: + + 1. It prevents invalid nesting of Markdown-generated elements which + could occur occuring with constructs like `*something [link*][1]`. + 2. It prevents problems occuring with deeply nested lists on which + paragraphs were ill-formed. + 3. It removes the need to call `hashHTMLBlocks` twice during the the + block gamut. + + Hashes are turned back to HTML prior output. + +* Made the block-level HTML parser smarter using a specially-crafted regular + expression capable of handling nested tags. + +* Solved backtick issues in tag attributes by rewriting the HTML tokenizer to + be aware of code spans. 
All these lines should work correctly now: + + bar + bar + `` + +* `
` has been added to the list of block-level elements and is now + treated as an HTML block instead of being wrapped within paragraph tags. + +* Now only trim trailing newlines from code blocks, instead of trimming + all trailing whitespace characters. + +* Fixed bug where this: + + [text](http://m.com "title" ) + + wasn't working as expected, because the parser wasn't allowing for spaces + before the closing paren. + +* Filthy hack to support markdown='1' in div tags. + +* _DoAutoLinks() now supports the 'dict://' URL scheme. + +* PHP- and ASP-style processor instructions are now protected as + raw HTML blocks. + + + <% ... %> + +* Experimental support for [this] as a synonym for [this][]. + +* Fix for escaped backticks still triggering code spans: + + There are two raw backticks here: \` and here: \`, not a code span + + 1.0.1oo (19 May 2006) * Converted PHP Markdown to a object-oriented design.