html转markdown
<?php
/**
*
* @copyright GPL
* @version 1.0
*
*/
class Markdown{
function parseMarkdown($doc) {
//tab to space
$doc = str_replace("\t", str_repeat(' ', 4), $doc);
//\r to \n
$doc = "\n" . str_replace("\r", '', $doc) . "\n";
//remove empty line
$doc = preg_replace('#\n+#i', "\n", $doc);
//pre code preReplace, just replac to a tag to escape <p> tag replace
$preCodeTpl = "\n<preCode %s>"; //< for p skip
$preCodePattern = '#```([a-z]+)?(.*?)```#is';
if(preg_match_all($preCodePattern, $doc, $preCodes)) {
foreach($preCodes[0] as $key => $value) {
//every pre code into <preCode [index]> tag
$doc = preg_replace($preCodePattern, sprintf($preCodeTpl, $key), $doc, 1);
}
}
//space code preReplace, first one should have 8 space, and the next should only more then 4
$spaceCodePattern = '#\n[ ]{8}[^\n]*?(?=\n)(?:\n[ ]{4}[^\n]*?(?=\n))*#is';
$spaceCodeTpl = "\n<spaceCode %d>";
if(preg_match_all($spaceCodePattern, $doc, $spaceCodes)) {
foreach($spaceCodes[0] as $key => $value) {
$doc = preg_replace($spaceCodePattern, sprintf($spaceCodeTpl, $key), $doc, 1);
}
}
//blockquote preReplace
$blockquotePattern = '#(?:\n> \*.*?(?=\n))+#is';
$blockquoteTpl = "\n<blockquote %d>";
if(preg_match_all($blockquotePattern, $doc, $blockquotes)) {
foreach($blockquotes[0] as $key => $value) {
$doc = preg_replace($blockquotePattern, sprintf($blockquoteTpl, $key), $doc, 1);
}
}
//hr
$doc = preg_replace('#----*[ ]*(\n)#is', '<hr />\1', $doc);
//h1/h2/h3
$doc = preg_replace('/(\n)###(.*?)#*(?=\n)/is', '\1<h3>\2</h3>', $doc);
$doc = preg_replace('/(\n)##(.*?)#*(?=\n)/is', '\1<h2>\2</h2>', $doc);
$doc = preg_replace('/(\n)#(.*?)#*(?=\n)/is', '\1<h1>\2</h1>', $doc);
//strong
$doc = preg_replace('#\*\*([^\n]*?)\*\*#is', '<strong>\1</strong>', $doc);
//em
$doc = preg_replace('#\*([^\n]*?)\*#is', '<em>\1</em>', $doc);
//code
$doc = preg_replace('#`(.*?)`#is', '<code>\1</code>', $doc);
//ul li
$liPattern = '#(?:\n\* [^\n]*?(?=\n))+#is';
if(preg_match_all($liPattern, $doc, $lis)) {
foreach($lis[0] as $key => $value) {
$ul = '<ul>%s</ul>';
$lis = preg_replace('#(\n)\* ([^\n]*?)(?=\n)#is', '\1<li>\2</li>', "\n" . $value . "\n");
$ul = sprintf($ul, $lis);
$doc = preg_replace($liPattern, $ul, $doc, 1);
}
}
//img
$doc = preg_replace('#!\[([^\]]*?)\]\(([^\s]*?)(?: "([^"]*?)")?\)#is', '<img src="\2" alt="\1" title="\3" />', $doc);
//a simple
if(preg_match_all('#\[([^\]]*?)\]\(([^\s]*?)(?:\s"([^"]*?)")?\)#is', $doc, $links)) {
$linkTpl = '<a target="_blank" href="%s" title="%s">%s</a>';
foreach($links[0] as $key => $value) {
$doc = str_replace($links[0][$key], sprintf($linkTpl, $links[2][$key], $links[3][$key], $links[1][$key]), $doc);
}
}
//a normal
if(preg_match_all('#\[([^\]]*?)\]\[(\d+)\]#is', $doc, $links)) {
$linkTpl = '<a target="_blank" href="%s">%s</a>';
foreach($links[0] as $key => $value) {
if(preg_match(sprintf('#\n\[%d\]: (.*?)(?=\n)#is', $links[2][$key]), $doc, $linkHref)) {
$doc = str_replace($links[0][$key], sprintf($linkTpl, $linkHref[1], $links[1][$key]), $doc);
}
}
}
//remove all link href
$doc = preg_replace('#\n\[\d+\]:.*?(?=\n)#is', '', $doc);
//a footnote
if(preg_match_all('#\[\^(.*?)\]#is', $doc, $footnotes)) {
$footnoteTpl = '<a href="#fn:%s" id="fnref:%s" title="go to footnote" class="footnote">[%s]</a>';
$footnoteReplaced = array();
foreach($footnotes[0] as $key => $value) {
$footnoteId = $footnotes[1][$key];
if(isset($footnoteReplaced[$footnoteId])) continue;
$footnoteReplaced[$footnoteId] = true;
$index = $key+1;
$footnoteHash = sprintf('<span id="fn:%s">[%s] </span>', $footnoteId, $index);
$footnoteBack = sprintf('<a class="reversefootnote" title="go back to content" href="#fnref:%s"><-</a><br>', $footnoteId);
//match footnote by id
if(preg_match(sprintf('#(\n)\[\^%s\]: (.*?)(?=\n)#is', $footnoteId), $doc, $footnote)) {
//footnote link
$doc = preg_replace(sprintf('#\[\^%s\]#is', $footnoteId), sprintf($footnoteTpl, $footnoteId, $footnoteId, $index), $doc, 1);
//footnote desc
$doc = str_replace($footnote[0], '<-fs->' . $footnote[1]. $footnoteHash . $footnote[2] . $footnoteBack . '<-fe->', $doc);
}
}
}
//put it into footnotes div
$doc = preg_replace('#<-fs->(.+)<-fe->#is', '<div class="footnotes"><hr><small>\1</small></div>', $doc);
//remove the unnecessary tags
$doc = str_replace(array('<-fs->','<-fe->'), '', $doc);
//br
$doc = preg_replace('#[ ]{4}(?=\n)#is', '<br>', $doc);
//p not before other tag, and not after pre
$doc = preg_replace('#\n+#i', "\n", $doc); //remove empty line [important]
$doc = preg_replace('#(\n)([^<].*?)(?=\n)#i', '\1<p>\2</p>', $doc);
//pre code replace
if(!empty($preCodes[0])) {
foreach($preCodes[0] as $key => $value) {
$preCode = '<pre class="'.$preCodes[1][$key].'"><ol>%s</ol></pre>';
$lines = preg_replace('#(\n)(.*?)(?=\n)#is', '\1<li><code>\2</code></li>', htmlspecialchars($preCodes[2][$key]));
$preCode = sprintf($preCode, $lines);
$doc = str_replace(sprintf($preCodeTpl, $key), $preCode, $doc);
}
}
//space code
if(!empty($spaceCodes[0])) {
foreach($spaceCodes[0] as $key => $value) {
$spaceCode = sprintf('<pre><code>%s%s</code></pre>', htmlspecialchars($value), "\n");
$doc = str_replace(sprintf($spaceCodeTpl, $key), $spaceCode, $doc);
}
}
//blockquote replace
if(!empty($blockquotes[0])) {
foreach($blockquotes[0] as $key => $value) {
$blockquote = '<blockquote><ul>%s</ul></blockquote>';
$lis = preg_replace('#(\n)> \* (.*?)(?=\n)#is', '\1<li>\2</li>', $value . "\n");
$blockquote = sprintf($blockquote, $lis);
$doc = str_replace(sprintf($blockquoteTpl, $key), $blockquote, $doc);
}
}
return trim($doc, "\n");
}
function parseHtml($html) {
//\r to \n
$html = "\n" . str_replace("\r", '', $html) . "\n";
//h1/h2/h3
$html = preg_replace('#<h1[^>]*?>(.*?)</h1>#is', '#\1', $html);
$html = preg_replace('#<h2[^>]*?>(.*?)</h2>#is', '##\1', $html);
$html = preg_replace('#<h3[^>]*?>(.*?)</h3>#is', '###\1', $html);
//hr
$html = preg_replace('#<hr\s*/?>#is', '---', $html);
//em
$html = preg_replace('#<em[^>]*?>(.*?)</em>#is', '*\1*', $html);
//pre code
if(preg_match_all('#<pre><code[^>]*?>(.*?)</code></pre>#is', $html, $tabCodes)) {
foreach($tabCodes[0] as $key => $value) {
if(preg_match_all('#.*?\n#is', $tabCodes[1][$key], $lines)) {
$space8 = str_repeat(' ', 8);
$tabCodes[1][$key] = '';
foreach($lines[0] as $k => $v) {
$tabCodes[1][$key] .= $space8 . $v;
}
}
$html = str_replace($tabCodes[0][$key], $tabCodes[1][$key], $html);
}
}
//pre ol code
$liPattern = '#[ ]*<li>(.*?)</li>[ ]*#is';
$preOlPattern = '#<pre class="([^"]*?)"><ol>(.*?)</ol></pre>#is';
if(preg_match_all($preOlPattern, $html, $preOls)) {
foreach($preOls[0] as $key => $value) {
//li
if(preg_match_all($liPattern, $preOls[2][$key], $lis)) {
foreach($lis[0] as $k => $v) {
$index = $k + 1;
$html = str_replace($v, strip_tags($lis[1][$k]), $html);
}
}
$html = preg_replace($preOlPattern, sprintf('```%s\2```', $preOls[1][$key]), $html, 1);
}
}
//code
$html = preg_replace('#<code[^>]*?>(.*?)</code>#is', '`\1`', $html);
//strong
$html = preg_replace('#<strong[^>]*?>(.*?)</strong>#is', '**\1**', $html);
//img
$html = preg_replace('#<img src="([^"]*?)" alt="([^"]*?)" title="([^"]*?)"\s*/?>#is', '', $html);
$html = preg_replace('#(!\[.*?\]\(.*?) ""(\))#is', '\1\2', $html);
//a
$linkPattern = '#<a (?:target="_blank" )?href="([^"]*?)">(.*?)</a>#is';
if(preg_match_all($linkPattern, $html, $links)) {
$html .= "\n";
foreach($links[0] as $key => $value) {
$index = $key + 1;
$html = str_replace($value, sprintf('[%s][%d]', $links[2][$key], $index), $html);
$html .= sprintf("[%d]: %s\n", $index, $links[1][$key]);
}
}
//a 注脚
$footnotePattern = '#<a href="\#fn:([^"]*?)" id="fnref:\1" title="[^"]*?" class="footnote">\[(\d+)\]</a>#is';
if(preg_match_all($footnotePattern, $html, $footnotes)) {
foreach($footnotes[0] as $key => $value) {
$footnote = $footnotes[1][$key]; //code
$footnoteNum = $footnotes[2][$key]; //1
$footnoteStr = sprintf('[^%s]', $footnote);
//part 1
$html = str_replace($value, $footnoteStr, $html);
//part 2
$html = str_ireplace(sprintf('<span id="fn:%s">[%s] </span>', $footnote, $footnoteNum), $footnoteStr . ': ', $html);
//part 3
$html = preg_replace(sprintf('#<a href="\#fnref:%s" title="[^"]*?" class="reversefootnote">.*?</a>#is', $footnote), '', $html);
}
}
$html = preg_replace('#<div class="footnotes">\s*---\s*<small>(.*?)</small>\s*</div>#is', '\1', $html);
$html = preg_replace('#<a class="reversefootnote".*?><-</a><br>#is', '', $html);
//blockquote
$blockPattern = '#<blockquote[^>]*?>\s*<ul>(.*?)</ul>\s*</blockquote>#is';
if(preg_match_all($blockPattern, $html, $blocks)) {
foreach($blocks[0] as $key => $value) { //Every blockquote
//li
if(preg_match_all($liPattern, $blocks[1][$key], $lis)) {
foreach($lis[0] as $k => $v) {
$html = str_replace($v, '> * ' . $lis[1][$k], $html);
}
}
}
}
//All blockquote html
$html = preg_replace($blockPattern, '\1', $html);
//ul/li
$ulPattern = '#<ul>(.*?)</ul>#is';
if(preg_match_all($ulPattern, $html, $uls)) {
foreach($uls[0] as $key => $value) { //Every blockquote
//li
if(preg_match_all($liPattern, $uls[1][$key], $lis)) {
foreach($lis[0] as $k => $v) {
$html = str_replace($v, '* ' . $lis[1][$k], $html);
}
}
}
}
//All ul html
$html = preg_replace($ulPattern, '\1', $html);
//ol/li
$olPattern = '#<ol>(.*?)</ol>#is';
if(preg_match_all($olPattern, $html, $ols)) {
foreach($ols[0] as $key => $value) {
//li
if(preg_match_all($liPattern, $ols[1][$key], $lis)) {
foreach($lis[0] as $k => $v) {
$index = $k + 1;
$html = str_replace($v, sprintf('%d. %s', $index, $lis[1][$k]), $html);
}
}
}
}
//All ol html
$html = preg_replace($olPattern, '\1', $html);
//br
$html = preg_replace('#<br\s*/?>#is', str_repeat(' ', 4), $html);
//p not before pre
$html = preg_replace('#<p[^>]*?>(.*?)</p>#is', '\1', $html);
return $html;
}
}
使用方法
$obj = new Markdown();
$html = $obj->parseHtml($_POST['content']); //parseMarkdown