⬆️ 🔨 Upgrade Markdownify library.

The currently bundled version 2.0.0 (alpha) triggers deprecation
warnings with PHP 7.1 and PHPUnit.
Upgrade the HTML to Markdown converter for PHP to the current
Markdownify 2.2.1.
Use Composer to manage this library.
This commit is contained in:
Klaus Weidenbach 2017-03-02 23:25:04 +01:00
parent 8e1716065e
commit 6c79e0c077
20 changed files with 2869 additions and 2427 deletions

View File

@ -29,7 +29,8 @@
"ext-xml" : "*",
"ext-openssl" : "*",
"sabre/dav" : "~3.2",
"michelf/php-markdown" : "^1.7"
"michelf/php-markdown" : "^1.7",
"pixel418/markdownify": "^2.2"
},
"require-dev" : {
"php" : ">=5.6",

58
composer.lock generated
View File

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#composer-lock-the-lock-file",
"This file is @generated automatically"
],
"content-hash": "4b24468c1f8babe7c8804fba8ee602f7",
"content-hash": "c0cafbf9fd702be588f6b392b9742cb6",
"packages": [
{
"name": "michelf/php-markdown",
@ -57,6 +57,62 @@
],
"time": "2016-10-29T18:58:20+00:00"
},
{
"name": "pixel418/markdownify",
"version": "v2.2.1",
"source": {
"type": "git",
"url": "https://github.com/Elephant418/Markdownify.git",
"reference": "0160677f04c784550dd10fd72fdf3994967db848"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Elephant418/Markdownify/zipball/0160677f04c784550dd10fd72fdf3994967db848",
"reference": "0160677f04c784550dd10fd72fdf3994967db848",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8"
},
"type": "lib",
"autoload": {
"psr-4": {
"Markdownify\\": "src",
"Test\\Markdownify\\": "test"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"LGPL"
],
"authors": [
{
"name": "Peter Kruithof",
"email": "pkruithof@gmail.com",
"homepage": "http://pkruithof.tumblr.com/"
},
{
"name": "Milian Wolff",
"email": "mail@milianw.de",
"homepage": "http://milianw.de"
},
{
"name": "Thomas Zilliox",
"email": "hello@tzi.fr",
"homepage": "http://tzi.fr"
}
],
"description": "The HTML to Markdown converter for PHP ",
"homepage": "https://github.com/elephant418/Markdownify",
"keywords": [
"markdown",
"markdownify"
],
"time": "2016-09-21T13:01:43+00:00"
},
{
"name": "psr/log",
"version": "1.0.2",

View File

@ -5,12 +5,12 @@
*/
use Michelf\MarkdownExtra;
use Markdownify\Converter;
require_once("include/oembed.php");
require_once("include/event.php");
require_once("include/html2bbcode.php");
require_once("include/bbcode.php");
require_once("library/markdownify/markdownify.php");
function get_bb_tag_pos($s, $name, $occurance = 1) {
@ -367,7 +367,6 @@ function bb2diaspora_itemwallwall(&$item,$uplink = false) {
function bb2diaspora_itembody($item, $force_update = false, $have_channel = false, $uplink = false) {
if(! get_iconfig($item,'diaspora','fields')) {
$force_update = true;
}
@ -454,7 +453,7 @@ function bb2diaspora_itembody($item, $force_update = false, $have_channel = fals
return html_entity_decode($body);
}
function bb2diaspora($Text,$preserve_nl = false, $fordiaspora = true) {
function bb2diaspora($Text, $preserve_nl = false, $fordiaspora = true) {
// Re-enabling the converter again.
// The bbcode parser now handles youtube-links (and the other stuff) correctly.
@ -496,11 +495,10 @@ function bb2diaspora($Text,$preserve_nl = false, $fordiaspora = true) {
$Text = str_replace(array('<','>','&'),array('&_lt_;','&_gt_;','&_amp_;'),$Text);
// Now convert HTML to Markdown
$md = new Markdownify(false, false, false);
$md = new Converter(Converter::LINK_AFTER_CONTENT, false, false);
$Text = $md->parseString($Text);
// It also adds backslashes to our attempt at getting around the html entity preservation for some weird reason.
$Text = str_replace(array('&\\_lt\\_;','&\\_gt\\_;','&\\_amp\\_;'),array('<','>','&'),$Text);
@ -522,7 +520,7 @@ function bb2diaspora($Text,$preserve_nl = false, $fordiaspora = true) {
$Text = trim($Text);
call_hooks('bb2diaspora',$Text);
call_hooks('bb2diaspora', $Text);
return $Text;
}

View File

@ -1,29 +0,0 @@
Markdownify
===========
* handle non-markdownifiable lists (i.e. `<ul><li id="foobar">asdf</li></ul>`)
* organize methods better (i.e. flushlinebreaks & setlinebreaks close to each other)
* take a look at function names etc.
* is the new (in rev. 93) lastclosedtag property needed?
* word wrapping (some work is done but it's still very buggy)
Markdownify Extra
=================
* handle table alignment with KEEP_HTML=false
* handle tables without headings when KEEP_HTML=false is set
* handle Markdown inside non-markdownable tags
Implementation Thoughts
=======================
* non-markdownifiable lists and markdown inside non-markdownable tags as well as the current
table implementation could be rewritten by using a rollback mechanism.
example:
<ul><li>asdf</li><li id="foobar">asdf</li></ul>
we come to `<ul>`, know that this might fail and create a snapshot of our current parser
we keep on parsing and when we reach `<li id="foobar">` we gotta rollback and keep this
list in HTML format.

View File

@ -1,51 +0,0 @@
<?php
/**
 * Minimal browser front end for trying out the HTML to Markdown converter.
 *
 * Without posted input an HTML form is rendered. With posted input the text
 * is converted by Markdownify or Markdownify_Extra (depending on the "extra"
 * checkbox) and the HTML-escaped result is printed.
 */
error_reporting(E_ALL);
if (!empty($_POST['input'])) {
    include 'markdownify_extra.php';
    # options that were not posted fall back to the MDFY_* constants
    $leap = isset($_POST['leap']) ? $_POST['leap'] : MDFY_LINKS_EACH_PARAGRAPH;
    $keephtml = isset($_POST['keepHTML']) ? $_POST['keepHTML'] : MDFY_KEEPHTML;
    if (!empty($_POST['extra'])) {
        $md = new Markdownify_Extra($leap, MDFY_BODYWIDTH, $keephtml);
    } else {
        $md = new Markdownify($leap, MDFY_BODYWIDTH, $keephtml);
    }
    if (ini_get('magic_quotes_gpc')) {
        $_POST['input'] = stripslashes($_POST['input']);
    }
    $output = $md->parseString($_POST['input']);
} else {
    $_POST['input'] = '';
}
# PHP_SELF can carry request-supplied path info, so it has to be escaped
# before it is written into an HTML attribute (otherwise: reflected XSS)
$self = htmlspecialchars($_SERVER['PHP_SELF'], ENT_QUOTES, 'UTF-8');
?><!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en-US" lang="en-US">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<title>HTML to Markdown Converter</title>
</head>
<body>
<?php if (empty($_POST['input'])): ?>
<form action="<?php echo $self; ?>" method="post">
<fieldset>
<legend>HTML Input</legend>
<textarea style="width:100%;" cols="85" rows="40" name="input"><?php echo htmlspecialchars($_POST['input'], ENT_NOQUOTES, 'UTF-8'); ?></textarea>
</fieldset>
<label for="extra">Markdownify Extra: <input name="extra" checked="checked" id="extra" type="checkbox" value="1" /></label>
<label for="leap">Links after each block elem: <input name="leap" id="leap" type="checkbox" value="1" /></label>
<label for="keepHTML">keep HTML: <input name="keepHTML" id="keepHTML" type="checkbox" value="1" checked="checked" /></label>
<input type="submit" name="submit" value="submit" />
</form>
<?php else: ?>
<h1 style="text-align:right;"><a href="<?php echo $self; ?>">BACK</a></h1>
<pre><?php echo htmlspecialchars($output, ENT_NOQUOTES, 'UTF-8'); ?></pre>
<?php endif; ?>
</body>
</html>

File diff suppressed because it is too large Load Diff

View File

@ -1,33 +0,0 @@
#!/usr/bin/php
<?php
require dirname(__FILE__) .'/markdownify_extra.php';
/**
 * Read a "--name [value]" style option from the command line.
 *
 * The first occurrence of "--$name" in $_SERVER['argv'] is used. The previous
 * implementation walked the array with each(), which no longer exists in
 * PHP 8; this version looks the switch up by index instead.
 *
 * @param string $name    option name without the leading dashes
 * @param mixed  $default returned when the option is not present at all
 * @return mixed $default if the option is absent, true if it is present
 *               without a value (last argument, or followed by another
 *               "--option"), otherwise the argument following it
 */
function param($name, $default = false) {
    $args = isset($_SERVER['argv']) ? array_values($_SERVER['argv']) : array();
    $index = array_search('--'.$name, $args, true);
    if ($index === false) {
        return $default;
    }
    if (!isset($args[$index + 1])) {
        # switch was the last argument
        return true;
    }
    $value = $args[$index + 1];
    if (substr($value, 0, 2) == '--') {
        # next argument is another switch, so this one carries no value
        return true;
    }
    return $value;
}
# the HTML document to convert is read from standard input
$input = stream_get_contents(STDIN);
# command line switches: --links, --width, --html and --no_extra
$linksAfterEachParagraph = param('links');
$bodyWidth = param('width');
$keepHTML = param('html', true);
# --no_extra selects the plain converter instead of the Markdown Extra one
$converterClass = param('no_extra') ? 'Markdownify' : 'Markdownify_Extra';
$parser = new $converterClass($linksAfterEachParagraph, $bodyWidth, $keepHTML);
$markdown = $parser->parseString($input);
echo $markdown ."\n";

View File

@ -1,489 +0,0 @@
<?php
/**
* Class to convert HTML to Markdown with PHP Markdown Extra syntax support.
*
* @version 1.0.0 alpha
* @author Milian Wolff (<mail@milianw.de>, <http://milianw.de>)
* @license LGPL, see LICENSE_LGPL.txt and the summary below
* @copyright (C) 2007 Milian Wolff
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
/**
* standard Markdownify class
*/
require_once dirname(__FILE__).'/markdownify.php';
class Markdownify_Extra extends Markdownify {
/**
* table data, including rows with content and the maximum width of each col
*
* @var array
*/
var $table = array();
/**
* current col
*
* @var int
*/
var $col = -1;
/**
* current row
*
* @var int
*/
var $row = 0;
/**
 * Constructor: set up a converter that also understands the Markdown Extra
 * constructs registered below (header ids, tables, definition lists,
 * footnotes and abbreviations).
 *
 * The three arguments are forwarded unchanged to parent::Markdownify().
 * Afterwards the additional tags/attributes are registered in
 * $this->isMarkdownable, the custom footnote tags are added to the parser's
 * block element map, and the regular expressions used by handleTag_table()
 * to look ahead at an upcoming table are prepared.
 *
 * NOTE(review): this is a PHP 4 style constructor (method named like the
 * class). Such constructors are deprecated in PHP 7 and no longer act as
 * constructors in PHP 8 - confirm the supported PHP versions.
 *
 * @param mixed $linksAfterEachParagraph forwarded to the parent constructor
 * @param mixed $bodyWidth               forwarded to the parent constructor
 * @param mixed $keepHTML                forwarded to the parent constructor
 */
function Markdownify_Extra($linksAfterEachParagraph = MDFY_LINKS_EACH_PARAGRAPH, $bodyWidth = MDFY_BODYWIDTH, $keepHTML = MDFY_KEEPHTML) {
parent::Markdownify($linksAfterEachParagraph, $bodyWidth, $keepHTML);
### new markdownable tags & attributes
# header ids: # foo {bar}
$this->isMarkdownable['h1']['id'] = 'optional';
$this->isMarkdownable['h2']['id'] = 'optional';
$this->isMarkdownable['h3']['id'] = 'optional';
$this->isMarkdownable['h4']['id'] = 'optional';
$this->isMarkdownable['h5']['id'] = 'optional';
$this->isMarkdownable['h6']['id'] = 'optional';
# tables
$this->isMarkdownable['table'] = array();
$this->isMarkdownable['th'] = array(
'align' => 'optional',
);
$this->isMarkdownable['td'] = array(
'align' => 'optional',
);
$this->isMarkdownable['tr'] = array();
# thead/tbody/tfoot are added to the ignore list
array_push($this->ignore, 'thead');
array_push($this->ignore, 'tbody');
array_push($this->ignore, 'tfoot');
# definition lists
$this->isMarkdownable['dl'] = array();
$this->isMarkdownable['dd'] = array();
$this->isMarkdownable['dt'] = array();
# footnotes (custom tags produced by parseString() / _makeFootnotes())
$this->isMarkdownable['fnref'] = array(
'target' => 'required',
);
$this->isMarkdownable['footnotes'] = array();
$this->isMarkdownable['fn'] = array(
'name' => 'required',
);
# register the custom tags with the parser: fnref is inline, fn and footnotes are block elements
$this->parser->blockElements['fnref'] = false;
$this->parser->blockElements['fn'] = true;
$this->parser->blockElements['footnotes'] = true;
# abbr
$this->isMarkdownable['abbr'] = array(
'title' => 'required',
);
# build RegEx lookahead to decide whether the table can be parsed or not
# all tag names whose blockElements entry is false, i.e. the inline tags
$inlineTags = array_keys($this->parser->blockElements, false);
# cell contents: any text, inline tags, or a "<" that is not followed by a lowercase letter
$colContents = '(?:[^<]|<(?:'.implode('|', $inlineTags).'|[^a-z]))+';
$this->tableLookaheadHeader = '{
^\s*(?:<thead\s*>)?\s* # open optional thead
<tr\s*>\s*(?: # start required row with headers
<th(?:\s+align=("|\')(?:left|center|right)\1)?\s*> # header with optional align
\s*'.$colContents.'\s* # contents
</th>\s* # close header
)+</tr> # close row with headers
\s*(?:</thead>)? # close optional thead
}sxi';
# pattern fragment appended after every generated <td ...> opener in handleTag_table()
$this->tdSubstitute = '\s*'.$colContents.'\s* # contents
</td>\s*';
# the %s placeholder is filled via sprintf() in handleTag_table()
$this->tableLookaheadBody = '{
\s*(?:<tbody\s*>)?\s* # open optional tbody
(?:<tr\s*>\s* # start row
%s # cols to be substituted
</tr>)+ # close row
\s*(?:</tbody>)? # close optional tbody
\s*</table> # close table
}sxi';
}
/**
 * handle header tags (<h1> - <h6>) including an optional id attribute
 *
 * The id found on the opening tag is remembered and written as " {#id}"
 * when the matching closing tag is reached; the actual header handling is
 * delegated to the parent class.
 *
 * @param int $level 1-6
 * @return void
 */
function handleHeader($level) {
    # id of the header that is currently open (kept between the two calls)
    static $pendingId = null;
    if (!$this->parser->isStartTag) {
        if ($pendingId !== null) {
            $this->out(' {#'.$pendingId.'}');
            $pendingId = null;
        }
    } elseif (isset($this->parser->tagAttributes['id'])) {
        $pendingId = $this->parser->tagAttributes['id'];
    }
    parent::handleHeader($level);
}
/**
 * handle <abbr> tags
 *
 * The opening tag is stacked and its content buffered. On the closing tag
 * the buffered text is written out and the tag is remembered in
 * $this->stack['abbr'] unless an entry with the same text already exists.
 *
 * @param void
 * @return void
 */
function handleTag_abbr() {
    if ($this->parser->isStartTag) {
        $this->stack();
        $this->buffer();
        return;
    }
    $tag = $this->unstack();
    $tag['text'] = $this->unbuffer();
    /** TODO: differing abbr definitions, i.e. different titles for same text **/
    $alreadyKnown = false;
    foreach ($this->stack['abbr'] as $known) {
        if ($known['text'] == $tag['text']) {
            $alreadyKnown = true;
            break;
        }
    }
    $this->out($tag['text']);
    if (!$alreadyKnown) {
        array_push($this->stack['abbr'], $tag);
    }
}
/**
 * flush stacked abbr tags
 *
 * Writes one " *[text]: title" definition line for every stacked abbr that
 * has not been written yet and marks those entries as 'unstacked'.
 *
 * @param void
 * @return void
 */
function flushStacked_abbr() {
    $definitions = array();
    foreach ($this->stack['abbr'] as $key => $tag) {
        if (isset($tag['unstacked'])) {
            # already written by an earlier flush
            continue;
        }
        $definitions[] = ' *['.$tag['text'].']: '.$tag['title'];
        $tag['unstacked'] = true;
        $this->stack['abbr'][$key] = $tag;
    }
    if (count($definitions) > 0) {
        $this->out("\n\n".implode("\n", $definitions));
    }
}
/**
 * handle <table> tags
 *
 * Opening tag: when HTML is kept, the upcoming markup is checked with the
 * lookahead patterns built in the constructor. Only a table whose header and
 * body rows match is collected for conversion; otherwise the tag is handed
 * to handleTagToText(). When HTML is not kept, collecting always starts.
 *
 * Closing tag: the collected rows are padded and written in Markdown Extra
 * table syntax (header row, separator row with alignment colons, body rows).
 *
 * @param void
 * @return void
 */
function handleTag_table() {
    if ($this->parser->isStartTag) {
        # check if upcoming table can be converted
        if ($this->keepHTML) {
            if (preg_match($this->tableLookaheadHeader, $this->parser->html, $matches)) {
                # header seems good, now check body
                # get align & number of cols
                preg_match_all('#<th(?:\s+align=("|\')(left|right|center)\1)?\s*>#si', $matches[0], $cols);
                $regEx = '';
                # number of the capturing group holding the quote of the current column
                $i = 1;
                $aligns = array();
                foreach ($cols[2] as $align) {
                    $align = strtolower($align);
                    array_push($aligns, $align);
                    if (empty($align)) {
                        $align = 'left'; # default value
                    }
                    $td = '\s+align=("|\')'.$align.'\\'.$i;
                    $i++;
                    if ($align == 'left') {
                        # look for empty align or left
                        $td = '(?:'.$td.')?';
                    }
                    $td = '<td'.$td.'\s*>';
                    $regEx .= $td.$this->tdSubstitute;
                }
                $regEx = sprintf($this->tableLookaheadBody, $regEx);
                # the body has to follow directly after the matched header
                $headerLength = strlen($matches[0]);
                # flags must be an int: passing null is deprecated as of PHP 8.1
                if (preg_match($regEx, $this->parser->html, $matches, 0, $headerLength)) {
                    # this is a markdownable table tag!
                    $this->table = array(
                        'rows' => array(),
                        'col_widths' => array(),
                        'aligns' => $aligns,
                    );
                    $this->row = 0;
                } else {
                    # non markdownable table
                    $this->handleTagToText();
                }
            } else {
                # non markdownable table
                $this->handleTagToText();
            }
        } else {
            $this->table = array(
                'rows' => array(),
                'col_widths' => array(),
                'aligns' => array(),
            );
            $this->row = 0;
        }
    } else {
        # finally build the table in Markdown Extra syntax
        $separator = array();
        # separator with correct align identifiers
        foreach ($this->table['aligns'] as $col => $align) {
            if (!$this->keepHTML && !isset($this->table['col_widths'][$col])) {
                break;
            }
            $left = ' ';
            $right = ' ';
            switch ($align) {
                case 'left':
                    $left = ':';
                    break;
                case 'center':
                    $right = ':';
                    $left = ':';
                    # falls through; 'right' only sets $right again
                case 'right':
                    $right = ':';
                    break;
            }
            array_push($separator, $left.str_repeat('-', $this->table['col_widths'][$col]).$right);
        }
        $separator = '|'.implode('|', $separator).'|';
        $rows = array();
        # add padding
        array_walk_recursive($this->table['rows'], array(&$this, 'alignTdContent'));
        $header = array_shift($this->table['rows']);
        array_push($rows, '| '.implode(' | ', $header).' |');
        array_push($rows, $separator);
        foreach ($this->table['rows'] as $row) {
            array_push($rows, '| '.implode(' | ', $row).' |');
        }
        $this->out(implode("\n".$this->indent, $rows));
        $this->table = array();
        $this->setLineBreaks(2);
    }
}
/**
 * pad a cell so that it fills its column according to the column alignment
 * meant to be used with array_walk_recursive on $this->table['rows']
 *
 * Left aligned (and unknown) columns are padded on the right, right aligned
 * ones on the left; centered cells get the padding split on both sides with
 * the larger half on the right.
 *
 * @param string &$content cell text, modified in place
 * @param int $col column index of the cell
 * @return void
 */
function alignTdContent(&$content, $col) {
    $align = $this->table['aligns'][$col];
    $missing = $this->table['col_widths'][$col] - $this->strlen($content);
    switch ($align) {
        case 'left':
        default:
            $content .= str_repeat(' ', $missing);
            break;
        case 'right':
            $content = str_repeat(' ', $missing).$content;
            break;
        case 'center':
            $before = floor($missing / 2);
            $after = $missing - $before;
            $content = str_repeat(' ', $before).$content.str_repeat(' ', $after);
            break;
    }
}
/**
 * handle <tr> tags
 *
 * A new row resets the column counter, a finished row advances the row
 * counter.
 *
 * @param void
 * @return void
 */
function handleTag_tr() {
    if (!$this->parser->isStartTag) {
        $this->row++;
        return;
    }
    $this->col = -1;
}
/**
 * handle <td> tags
 *
 * Opening tag: advance to the next column, make sure a width entry exists
 * for it and start buffering. Closing tag: store the trimmed cell text in
 * the current row and widen the column if this cell is the longest so far.
 *
 * @param void
 * @return void
 */
function handleTag_td() {
    if (!$this->parser->isStartTag) {
        $cell = trim($this->unbuffer());
        $widest = max($this->table['col_widths'][$this->col], $this->strlen($cell));
        $this->table['col_widths'][$this->col] = $widest;
        $this->table['rows'][$this->row][$this->col] = $cell;
        return;
    }
    $this->col++;
    if (!isset($this->table['col_widths'][$this->col])) {
        $this->table['col_widths'][$this->col] = 0;
    }
    $this->buffer();
}
/**
 * handle <th> tags
 *
 * When HTML is not kept, the alignment of the upcoming column is recorded
 * (empty string if the tag has no align attribute) as long as no second row
 * exists and the column has no alignment yet. The cell itself is then
 * treated like a <td>.
 *
 * @param void
 * @return void
 */
function handleTag_th() {
    $upcoming = $this->col + 1;
    if (!$this->keepHTML && !isset($this->table['rows'][1]) && !isset($this->table['aligns'][$upcoming])) {
        $hasAlign = isset($this->parser->tagAttributes['align']);
        $this->table['aligns'][$upcoming] = $hasAlign ? $this->parser->tagAttributes['align'] : '';
    }
    $this->handleTag_td();
}
/**
 * handle <dl> tags: request two line breaks once the list is closed
 *
 * @param void
 * @return void
 */
function handleTag_dl() {
    if ($this->parser->isStartTag) {
        return;
    }
    $this->setLineBreaks(2);
}
/**
 * handle <dt> tags: request a single line break after the term
 *
 * @param void
 * @return void
 */
function handleTag_dt() {
    if ($this->parser->isStartTag) {
        return;
    }
    $this->setLineBreaks(1);
}
/**
 * handle <dd> tags
 *
 * Opening tag: write the ": " definition marker (preceded by an extra line
 * if a paragraph follows, or with one trailing newline of the output removed
 * if it ends in two) and call indent(' ', false). Closing tag: request two
 * line breaks if a <dt> comes next, otherwise one, and call indent(' ').
 *
 * @param void
 * @return void
 */
function handleTag_dd() {
    $upcoming = ltrim($this->parser->html);
    if (!$this->parser->isStartTag) {
        # lookahead for next dt
        $termFollows = (substr($upcoming, 0, 4) == '<dt>');
        $this->setLineBreaks($termFollows ? 2 : 1);
        $this->indent(' ');
        return;
    }
    if (substr($upcoming, 0, 3) == '<p>') {
        # next comes a paragraph, so we'll need an extra line
        $this->out("\n".$this->indent);
    } elseif (substr($this->output, -2) == "\n\n") {
        $this->output = substr($this->output, 0, -1);
    }
    $this->out(': ');
    $this->indent(' ', false);
}
/**
 * handle <fnref /> tags (custom footnote references created in parseString())
 *
 * Writes the reference as "[^target]".
 *
 * @param void
 * @return void
 */
function handleTag_fnref() {
    $target = $this->parser->tagAttributes['target'];
    $this->out('[^'.$target.']');
}
/**
 * handle <fn> tags (custom footnotes created by parseString() and
 * _makeFootnotes())
 *
 * The opening tag writes the "[^name]:" label followed by one line break,
 * the closing tag requests two line breaks; both call indent(' ').
 *
 * @param void
 * @return void
 */
function handleTag_fn() {
    if ($this->parser->isStartTag) {
        $this->out('[^'.$this->parser->tagAttributes['name'].']:');
        $breaks = 1;
    } else {
        $breaks = 2;
    }
    $this->setLineBreaks($breaks);
    $this->indent(' ');
}
/**
 * handle the <footnotes> tag (custom wrapper created by _makeFootnotes()):
 * request two line breaks once the footnote list is closed
 *
 * @param void
 * @return void
 */
function handleTag_footnotes() {
    if ($this->parser->isStartTag) {
        return;
    }
    $this->setLineBreaks(2);
}
/**
 * parse a HTML string after rewriting footnote markup into custom tags
 *
 * Footnote references and the footnote list are replaced by <fnref>, <fn>
 * and <footnotes> tags, which the handleTag_* methods of this class
 * understand; the result is handed to the parent parser.
 *
 * @param string $html input
 * @return string Markdown formatted output
 */
function parseString($html) {
    /** TODO: custom markdown-extra options, e.g. titles & classes **/
    # <sup id="fnref:..."><a href="#fn:..." rel="footnote">...</a></sup>
    # => <fnref target="..." />
    $referencePattern = '@<sup id="fnref:([^"]+)">\s*<a href="#fn:\1" rel="footnote">\s*\d+\s*</a>\s*</sup>@Us';
    $html = preg_replace($referencePattern, '<fnref target="$1" />', $html);
    # <div class="footnotes"><hr /><ol><li id="fn:...">...</li> ...</ol></div>
    # => <footnotes><fn name="...">...</fn> ...</footnotes>
    $listPattern = '#<div class="footnotes">\s*<hr />\s*<ol>\s*(.+)\s*</ol>\s*</div>#Us';
    $html = preg_replace_callback($listPattern, array($this, '_makeFootnotes'), $html);
    return parent::parseString($html);
}
/**
 * replace the HTML representation of footnotes with something more easily
 * parsable
 *
 * <li id="fn:1"> ... <a href="#fnref:1" rev="footnote">&#8617;</a></p></li>
 * becomes <fn name="1"> ... </fn>, and the whole list is wrapped in
 * <footnotes>...</footnotes>.
 *
 * @note this is a callback to be used in parseString()
 *
 * @param array $matches match of the footnote list; index 1 holds the <li> items
 * @return string
 */
function _makeFootnotes($matches) {
    $cleanup = array(
        # remove footnote link
        '@\s*(&#160;\s*)?<a href="#fnref:[^"]+" rev="footnote"[^>]*>&#8617;</a>\s*@s',
        # remove empty paragraph
        '@<p>\s*</p>@s',
    );
    $items = preg_replace($cleanup, '', $matches[1]);
    # <li id="fn:1">...</li> -> <fn name="1">...</li>
    $items = str_replace('<li id="fn:', '<fn name="', $items);
    $wrapped = '<footnotes>'.$items.'</footnotes>';
    # turn the remaining </li> into </fn>; the pattern has no capturing
    # group, so the $1 in the replacement expands to an empty string
    return preg_replace('#</li>\s*(?=(?:<fn|</footnotes>))#s', '</fn>$1', $wrapped);
}
}

View File

@ -1,618 +0,0 @@
<?php
/**
* parseHTML is a HTML parser which works with PHP 4 and above.
* It tries to handle invalid HTML to some degree.
*
* @version 1.0 beta
* @author Milian Wolff (mail@milianw.de, http://milianw.de)
* @license LGPL, see LICENSE_LGPL.txt and the summary below
* @copyright (C) 2007 Milian Wolff
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
class parseHTML {
/**
* tags which are always empty (<br /> etc.)
*
* @var array<string>
*/
var $emptyTags = array(
'br',
'hr',
'input',
'img',
'area',
'link',
'meta',
'param',
);
/**
* tags with preformatted text
* whitespaces wont be touched in them
*
* @var array<string>
*/
var $preformattedTags = array(
'script',
'style',
'pre',
'code',
);
/**
* supress HTML tags inside preformatted tags (see above)
*
* @var bool
*/
var $noTagsInCode = false;
/**
* html to be parsed
*
* @var string
*/
var $html = '';
/**
* node type:
*
* - tag (see isStartTag)
* - text (includes cdata)
* - comment
* - doctype
* - pi (processing instruction)
*
* @var string
*/
var $nodeType = '';
/**
* current node content, i.e. either a
* simple string (text node), or something like
* <tag attrib="value"...>
*
* @var string
*/
var $node = '';
/**
* wether current node is an opening tag (<a>) or not (</a>)
* set to NULL if current node is not a tag
* NOTE: empty tags (<br />) set this to true as well!
*
* @var bool | null
*/
var $isStartTag = null;
/**
* wether current node is an empty tag (<br />) or not (<a></a>)
*
* @var bool | null
*/
var $isEmptyTag = null;
/**
* tag name
*
* @var string | null
*/
var $tagName = '';
/**
* attributes of current tag
*
* @var array (attribName=>value) | null
*/
var $tagAttributes = null;
/**
* wether the current tag is a block element
*
* @var bool | null
*/
var $isBlockElement = null;
/**
* keep whitespace
*
* @var int
*/
var $keepWhitespace = 0;
/**
* list of open tags
* count this to get current depth
*
* @var array
*/
var $openTags = array();
/**
* list of block elements
*
* @var array
* TODO: what shall we do with <del> and <ins> ?!
*/
var $blockElements = array (
# tag name => <bool> is block
# block elements
'address' => true,
'blockquote' => true,
'center' => true,
'del' => true,
'dir' => true,
'div' => true,
'dl' => true,
'fieldset' => true,
'form' => true,
'h1' => true,
'h2' => true,
'h3' => true,
'h4' => true,
'h5' => true,
'h6' => true,
'hr' => true,
'ins' => true,
'isindex' => true,
'menu' => true,
'noframes' => true,
'noscript' => true,
'ol' => true,
'p' => true,
'pre' => true,
'table' => true,
'ul' => true,
# set table elements and list items to block as well
'thead' => true,
'tbody' => true,
'tfoot' => true,
'td' => true,
'tr' => true,
'th' => true,
'li' => true,
'dd' => true,
'dt' => true,
# header items and html / body as well
'html' => true,
'body' => true,
'head' => true,
'meta' => true,
'link' => true,
'style' => true,
'title' => true,
# unfancy media tags, when indented should be rendered as block
'map' => true,
'object' => true,
'param' => true,
'embed' => true,
'area' => true,
# inline elements
'a' => false,
'abbr' => false,
'acronym' => false,
'applet' => false,
'b' => false,
'basefont' => false,
'bdo' => false,
'big' => false,
'br' => false,
'button' => false,
'cite' => false,
'code' => false,
'del' => false,
'dfn' => false,
'em' => false,
'font' => false,
'i' => false,
'img' => false,
'ins' => false,
'input' => false,
'iframe' => false,
'kbd' => false,
'label' => false,
'q' => false,
'samp' => false,
'script' => false,
'select' => false,
'small' => false,
'span' => false,
'strong' => false,
'sub' => false,
'sup' => false,
'textarea' => false,
'tt' => false,
'var' => false,
);
/**
 * get next node, set $this->html prior!
 *
 * Cuts the next node (processing instruction, comment, doctype, CDATA
 * section, tag or text) off the front of $this->html and describes it in
 * $this->nodeType / $this->node (plus the tag properties for tags).
 * Text nodes that consist of a single space are skipped after block level
 * nodes.
 *
 * Note: the whitespace-skipping state is kept in a static variable and is
 * therefore shared between calls.
 *
 * @param void
 * @return bool false once the whole input has been consumed, true otherwise
 */
function nextNode() {
    # compare with '' explicitly: empty() would also treat a remaining "0"
    # as "nothing left" and silently drop it
    if ((string)$this->html === '') {
        # we are done with parsing the html string
        return false;
    }
    static $skipWhitespace = true;
    if ($this->isStartTag && !$this->isEmptyTag) {
        array_push($this->openTags, $this->tagName);
        if (in_array($this->tagName, $this->preformattedTags)) {
            # dont truncate whitespaces for <code> or <pre> contents
            $this->keepWhitespace++;
        }
    }

    if ($this->html[0] == '<') {
        $token = substr($this->html, 0, 9);
        if (substr($token, 0, 2) == '<?') {
            # xml prolog or other pi's
            /** TODO **/
            #trigger_error('this might need some work', E_USER_NOTICE);
            $pos = strpos($this->html, '>');
            $this->setNode('pi', $pos + 1);
            return true;
        }
        if (substr($token, 0, 4) == '<!--') {
            # comment
            $pos = strpos($this->html, '-->');
            if ($pos === false) {
                # could not find a closing -->, use next gt instead
                # this is firefox' behaviour
                $pos = strpos($this->html, '>') + 1;
            } else {
                $pos += 3;
            }
            $this->setNode('comment', $pos);
            $skipWhitespace = true;
            return true;
        }
        # the doctype keyword is case-insensitive in HTML (<!doctype html>)
        if (strtoupper($token) == '<!DOCTYPE') {
            # doctype
            $this->setNode('doctype', strpos($this->html, '>')+1);
            $skipWhitespace = true;
            return true;
        }
        if ($token == '<![CDATA[') {
            # cdata, use text node
            # remove leading <![CDATA[
            $this->html = substr($this->html, 9);
            $end = strpos($this->html, ']]>');
            if ($end === false) {
                # unterminated section: everything that is left is its
                # content, there is no trailing ]]> to strip
                $this->setNode('text', strlen($this->html));
            } else {
                $this->setNode('text', $end + 3);
                # remove trailing ]]>
                $this->node = substr($this->node, 0, -3);
            }
            $this->handleWhitespaces();
            $skipWhitespace = true;
            return true;
        }
        if ($this->parseTag()) {
            # seems to be a tag
            # handle whitespaces: skip them after block elements only
            $skipWhitespace = $this->isBlockElement ? true : false;
            return true;
        }
    }
    if ($this->keepWhitespace) {
        $skipWhitespace = false;
    }
    # when we get here it seems to be a text node
    $pos = strpos($this->html, '<');
    if ($pos === false) {
        $pos = strlen($this->html);
    }
    $this->setNode('text', $pos);
    $this->handleWhitespaces();
    if ($skipWhitespace && $this->node == ' ') {
        return $this->nextNode();
    }
    $skipWhitespace = false;
    return true;
}
/**
 * parse tag, set tag name and attributes, see if it's a closing tag and so forth...
 *
 * Expects $this->html to start with "<". On success the tag is cut off
 * $this->html and the node/tag properties are filled in. On failure
 * invalidTag() is called, which turns the leading "<" into "&lt;", so that
 * the caller treats the markup as text.
 *
 * Offsets past the end of the input are checked before they are read, so
 * truncated markup (e.g. a lone "<" or "<br /") no longer triggers
 * "uninitialized string offset" diagnostics.
 *
 * @param void
 * @return bool true if a valid tag was parsed, false otherwise
 */
function parseTag() {
    static $a_ord, $z_ord, $special_ords;
    if (!isset($a_ord)) {
        $a_ord = ord('a');
        $z_ord = ord('z');
        $special_ords = array(
            ord(':'), // for xml:lang
            ord('-'), // for http-equiv
        );
    }

    $tagName = '';

    $pos = 1;
    # a lone "<" at the end of the input has no character at offset 1
    $isStartTag = !isset($this->html[$pos]) || $this->html[$pos] != '/';
    if (!$isStartTag) {
        $pos++;
    }
    # get tagName
    while (isset($this->html[$pos])) {
        $pos_ord = ord(strtolower($this->html[$pos]));
        if (($pos_ord >= $a_ord && $pos_ord <= $z_ord) || (!empty($tagName) && is_numeric($this->html[$pos]))) {
            $tagName .= $this->html[$pos];
            $pos++;
        } else {
            $pos--;
            break;
        }
    }

    $tagName = strtolower($tagName);
    if (empty($tagName) || !isset($this->blockElements[$tagName])) {
        # something went wrong => invalid tag
        $this->invalidTag();
        return false;
    }
    if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
        # we supress all HTML tags inside code tags
        $this->invalidTag();
        return false;
    }

    # get tag attributes
    /** TODO: in html 4 attributes do not need to be quoted **/
    $isEmptyTag = false;
    $attributes = array();
    $currAttrib = '';
    while (isset($this->html[$pos+1])) {
        $pos++;
        $char = $this->html[$pos];
        # current and next character; only one character long at the very
        # end of the input
        $pair = substr($this->html, $pos, 2);
        # close tag
        if ($char == '>' || $pair == '/>') {
            if ($char == '/') {
                $isEmptyTag = true;
                $pos++;
            }
            break;
        }

        $pos_ord = ord(strtolower($char));
        if ( ($pos_ord >= $a_ord && $pos_ord <= $z_ord) || in_array($pos_ord, $special_ords)) {
            # attribute name
            $currAttrib .= $char;
        } elseif (in_array($char, array(' ', "\t", "\n"))) {
            # drop whitespace
        } elseif (in_array($pair, array('="', "='"))) {
            # get attribute value
            $pos++;
            $await = $this->html[$pos]; # single or double quote
            $pos++;
            $value = '';
            while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
                $value .= $this->html[$pos];
                $pos++;
            }
            $attributes[$currAttrib] = $value;
            $currAttrib = '';
        } else {
            $this->invalidTag();
            return false;
        }
    }
    # the input may have ended before the closing ">" was reached
    if (!isset($this->html[$pos]) || $this->html[$pos] != '>') {
        $this->invalidTag();
        return false;
    }

    if (!empty($currAttrib)) {
        # html 4 allows something like <option selected> instead of <option selected="selected">
        $attributes[$currAttrib] = $currAttrib;
    }
    if (!$isStartTag) {
        if (!empty($attributes) || $tagName != end($this->openTags)) {
            # end tags must not contain any attributes
            # or maybe we did not expect a different tag to be closed
            $this->invalidTag();
            return false;
        }
        array_pop($this->openTags);
        if (in_array($tagName, $this->preformattedTags)) {
            $this->keepWhitespace--;
        }
    }
    $pos++;
    $this->node = substr($this->html, 0, $pos);
    $this->html = substr($this->html, $pos);
    $this->tagName = $tagName;
    $this->tagAttributes = $attributes;
    $this->isStartTag = $isStartTag;
    $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags);
    if ($this->isEmptyTag) {
        # might be not well formed
        $this->node = preg_replace('# */? *>$#', ' />', $this->node);
    }
    $this->nodeType = 'tag';
    $this->isBlockElement = $this->blockElements[$tagName];
    return true;
}
/**
 * handle invalid tags: escape the leading character of the remaining input
 * as "&lt;" so that it is no longer seen as the start of a tag
 *
 * @param void
 * @return void
 */
function invalidTag() {
    $escaped = substr_replace($this->html, '&lt;', 0, 1);
    $this->html = $escaped;
}
/**
 * make the first $pos characters of $this->html the current node and remove
 * them from $this->html
 *
 * If the previous node was a tag, the tag specific properties are reset to
 * null first. Tags themselves are not set through this method, see
 * parseTag().
 *
 * @param string $type see description for $this->nodeType
 * @param int $pos to which position shall we cut?
 * @return void
 */
function setNode($type, $pos) {
    if ($this->nodeType == 'tag') {
        # the new node is not a tag, forget everything about the last one
        foreach (array('tagName', 'tagAttributes', 'isStartTag', 'isEmptyTag', 'isBlockElement') as $property) {
            $this->$property = null;
        }
    }
    $this->nodeType = $type;
    $remaining = $this->html;
    $this->node = substr($remaining, 0, $pos);
    $this->html = substr($remaining, $pos);
}
/**
 * check if the remaining input ($this->html) begins with $str
 *
 * @param string $str
 * @return bool
 */
function match($str) {
    $length = strlen($str);
    $prefix = substr($this->html, 0, $length);
    return $prefix == $str;
}
/**
 * collapse every run of whitespace in the current node into a single space,
 * unless whitespace has to be kept (inside preformatted tags)
 *
 * @param void
 * @return void
 */
function handleWhitespaces() {
    if (!$this->keepWhitespace) {
        # truncate multiple whitespaces to a single one
        $this->node = preg_replace('#\s+#s', ' ', $this->node);
    }
    # otherwise: <pre> or <code> before, leave the node untouched
}
/**
 * normalize self::node
 *
 * Rebuilds the markup of the current tag from its parsed properties:
 * "</name>" for end tags, otherwise "<name attr="value" ...>" with double
 * quoted attribute values (quotes inside values become &quot;) and a
 * trailing " /" for empty tags.
 *
 * @param void
 * @return void
 */
function normalizeNode() {
    if (!$this->isStartTag) {
        $this->node = '</'.$this->tagName.'>';
        return;
    }
    $markup = '<'.$this->tagName;
    foreach ($this->tagAttributes as $attribute => $value) {
        $markup .= ' '.$attribute.'="'.str_replace('"', '&quot;', $value).'"';
    }
    $markup .= $this->isEmptyTag ? ' />' : '>';
    $this->node = $markup;
}
}
/**
 * indent a HTML string properly
 *
 * The input is tokenized with parseHTML. Block level tags are put on their
 * own line, prefixed with one $indent per open block element; inline
 * content is appended to the current line.
 *
 * @param string $html
 * @param string $indent optional, string used for one indentation level
 * @param bool $noTagsInCode optional, passed on to the parser
 * @return string
 */
function indentHTML($html, $indent = " ", $noTagsInCode = false) {
    $parser = new parseHTML;
    $parser->noTagsInCode = $noTagsInCode;
    $parser->html = $html;
    $result = '';
    $lastWasBlock = true; # last tag was block elem
    $levels = array();
    while ($parser->nextNode()) {
        $isTag = ($parser->nodeType == 'tag');
        if ($isTag) {
            $parser->normalizeNode();
        }
        if ($isTag && $parser->isBlockElement) {
            $isPreOrCode = in_array($parser->tagName, array('code', 'pre'));
            if (!$parser->keepWhitespace && !$lastWasBlock && !$isPreOrCode) {
                # finish the line of inline content before the block tag
                $result = rtrim($result)."\n";
            }
            if ($parser->isStartTag) {
                $result .= implode($levels);
                if (!$parser->isEmptyTag) {
                    array_push($levels, $indent);
                }
            } else {
                array_pop($levels);
                if (!$isPreOrCode) {
                    $result .= implode($levels);
                }
            }
            $result .= $parser->node;
            if (!$parser->keepWhitespace && !($isPreOrCode && $parser->isStartTag)) {
                $result .= "\n";
            }
            $lastWasBlock = true;
            continue;
        }
        if ($isTag && $parser->tagName == 'br') {
            # line breaks end the current line
            $result .= $parser->node."\n";
            $lastWasBlock = true;
            continue;
        }
        if ($lastWasBlock && !$parser->keepWhitespace) {
            # first inline content on a fresh line
            $result .= implode($levels);
            $parser->node = ltrim($parser->node);
        }
        $result .= $parser->node;
        if (in_array($parser->nodeType, array('comment', 'pi', 'doctype'))) {
            $result .= "\n";
        } else {
            $lastWasBlock = false;
        }
    }
    return $result;
}
/*
# testcase / example
error_reporting(E_ALL);
$html = '<p>Simple block on one line:</p>
<div>foo</div>
<p>And nested without indentation:</p>
<div>
<div>
<div>
foo
</div>
<div style=">"/>
</div>
<div>bar</div>
</div>
<p>And with attributes:</p>
<div>
<div id="foo">
</div>
</div>
<p>This was broken in 1.0.2b7:</p>
<div class="inlinepage">
<div class="toggleableend">
foo
</div>
</div>';
#$html = '<a href="asdfasdf" title=\'asdf\' foo="bar">asdf</a>';
echo indentHTML($html);
die();
*/

View File

@ -7,6 +7,9 @@ $baseDir = dirname($vendorDir);
return array(
'Hubzilla\\Import\\Import' => $baseDir . '/include/Import/Importer.php',
'Markdownify\\Converter' => $vendorDir . '/pixel418/markdownify/src/Converter.php',
'Markdownify\\ConverterExtra' => $vendorDir . '/pixel418/markdownify/src/ConverterExtra.php',
'Markdownify\\Parser' => $vendorDir . '/pixel418/markdownify/src/Parser.php',
'Michelf\\Markdown' => $vendorDir . '/michelf/php-markdown/Michelf/Markdown.php',
'Michelf\\MarkdownExtra' => $vendorDir . '/michelf/php-markdown/Michelf/MarkdownExtra.php',
'Michelf\\MarkdownInterface' => $vendorDir . '/michelf/php-markdown/Michelf/MarkdownInterface.php',
@ -18,6 +21,8 @@ return array(
'Psr\\Log\\LoggerInterface' => $vendorDir . '/psr/log/Psr/Log/LoggerInterface.php',
'Psr\\Log\\LoggerTrait' => $vendorDir . '/psr/log/Psr/Log/LoggerTrait.php',
'Psr\\Log\\NullLogger' => $vendorDir . '/psr/log/Psr/Log/NullLogger.php',
'Psr\\Log\\Test\\DummyTest' => $vendorDir . '/psr/log/Psr/Log/Test/LoggerInterfaceTest.php',
'Psr\\Log\\Test\\LoggerInterfaceTest' => $vendorDir . '/psr/log/Psr/Log/Test/LoggerInterfaceTest.php',
'Sabre\\CalDAV\\Backend\\AbstractBackend' => $vendorDir . '/sabre/dav/lib/CalDAV/Backend/AbstractBackend.php',
'Sabre\\CalDAV\\Backend\\BackendInterface' => $vendorDir . '/sabre/dav/lib/CalDAV/Backend/BackendInterface.php',
'Sabre\\CalDAV\\Backend\\NotificationSupport' => $vendorDir . '/sabre/dav/lib/CalDAV/Backend/NotificationSupport.php',
@ -268,7 +273,6 @@ return array(
'Sabre\\HTTP\\URLUtil' => $vendorDir . '/sabre/http/lib/URLUtil.php',
'Sabre\\HTTP\\Util' => $vendorDir . '/sabre/http/lib/Util.php',
'Sabre\\HTTP\\Version' => $vendorDir . '/sabre/http/lib/Version.php',
'Sabre\\Uri\\InvalidUriException' => $vendorDir . '/sabre/uri/lib/InvalidUriException.php',
'Sabre\\Uri\\Version' => $vendorDir . '/sabre/uri/lib/Version.php',
'Sabre\\VObject\\BirthdayCalendarGenerator' => $vendorDir . '/sabre/vobject/lib/BirthdayCalendarGenerator.php',
'Sabre\\VObject\\Cli' => $vendorDir . '/sabre/vobject/lib/Cli.php',
@ -357,6 +361,9 @@ return array(
'Sabre\\Xml\\Writer' => $vendorDir . '/sabre/xml/lib/Writer.php',
'Sabre\\Xml\\XmlDeserializable' => $vendorDir . '/sabre/xml/lib/XmlDeserializable.php',
'Sabre\\Xml\\XmlSerializable' => $vendorDir . '/sabre/xml/lib/XmlSerializable.php',
'Test\\Markdownify\\ConverterExtraTest' => $vendorDir . '/pixel418/markdownify/test/ConverterExtraTest.php',
'Test\\Markdownify\\ConverterTest' => $vendorDir . '/pixel418/markdownify/test/ConverterTest.php',
'Test\\Markdownify\\ConverterTestCase' => $vendorDir . '/pixel418/markdownify/test/ConverterTestCase.php',
'Zotlabs\\Access\\AccessList' => $baseDir . '/Zotlabs/Access/AccessList.php',
'Zotlabs\\Access\\PermissionLimits' => $baseDir . '/Zotlabs/Access/PermissionLimits.php',
'Zotlabs\\Access\\PermissionRoles' => $baseDir . '/Zotlabs/Access/PermissionRoles.php',

View File

@ -7,6 +7,7 @@ $baseDir = dirname($vendorDir);
return array(
'Zotlabs\\' => array($baseDir . '/Zotlabs'),
'Test\\Markdownify\\' => array($vendorDir . '/pixel418/markdownify/test'),
'Sabre\\Xml\\' => array($vendorDir . '/sabre/xml/lib'),
'Sabre\\VObject\\' => array($vendorDir . '/sabre/vobject/lib'),
'Sabre\\Uri\\' => array($vendorDir . '/sabre/uri/lib'),
@ -17,5 +18,6 @@ return array(
'Sabre\\CardDAV\\' => array($vendorDir . '/sabre/dav/lib/CardDAV'),
'Sabre\\CalDAV\\' => array($vendorDir . '/sabre/dav/lib/CalDAV'),
'Psr\\Log\\' => array($vendorDir . '/psr/log/Psr/Log'),
'Markdownify\\' => array($vendorDir . '/pixel418/markdownify/src'),
'Hubzilla\\' => array($baseDir . '/include'),
);

View File

@ -21,6 +21,10 @@ class ComposerStaticInit7b34d7e50a62201ec5d5e526a5b8b35d
array (
'Zotlabs\\' => 8,
),
'T' =>
array (
'Test\\Markdownify\\' => 17,
),
'S' =>
array (
'Sabre\\Xml\\' => 10,
@ -37,6 +41,10 @@ class ComposerStaticInit7b34d7e50a62201ec5d5e526a5b8b35d
array (
'Psr\\Log\\' => 8,
),
'M' =>
array (
'Markdownify\\' => 12,
),
'H' =>
array (
'Hubzilla\\' => 9,
@ -48,6 +56,10 @@ class ComposerStaticInit7b34d7e50a62201ec5d5e526a5b8b35d
array (
0 => __DIR__ . '/../..' . '/Zotlabs',
),
'Test\\Markdownify\\' =>
array (
0 => __DIR__ . '/..' . '/pixel418/markdownify/test',
),
'Sabre\\Xml\\' =>
array (
0 => __DIR__ . '/..' . '/sabre/xml/lib',
@ -88,6 +100,10 @@ class ComposerStaticInit7b34d7e50a62201ec5d5e526a5b8b35d
array (
0 => __DIR__ . '/..' . '/psr/log/Psr/Log',
),
'Markdownify\\' =>
array (
0 => __DIR__ . '/..' . '/pixel418/markdownify/src',
),
'Hubzilla\\' =>
array (
0 => __DIR__ . '/../..' . '/include',
@ -106,6 +122,9 @@ class ComposerStaticInit7b34d7e50a62201ec5d5e526a5b8b35d
public static $classMap = array (
'Hubzilla\\Import\\Import' => __DIR__ . '/../..' . '/include/Import/Importer.php',
'Markdownify\\Converter' => __DIR__ . '/..' . '/pixel418/markdownify/src/Converter.php',
'Markdownify\\ConverterExtra' => __DIR__ . '/..' . '/pixel418/markdownify/src/ConverterExtra.php',
'Markdownify\\Parser' => __DIR__ . '/..' . '/pixel418/markdownify/src/Parser.php',
'Michelf\\Markdown' => __DIR__ . '/..' . '/michelf/php-markdown/Michelf/Markdown.php',
'Michelf\\MarkdownExtra' => __DIR__ . '/..' . '/michelf/php-markdown/Michelf/MarkdownExtra.php',
'Michelf\\MarkdownInterface' => __DIR__ . '/..' . '/michelf/php-markdown/Michelf/MarkdownInterface.php',
@ -117,6 +136,8 @@ class ComposerStaticInit7b34d7e50a62201ec5d5e526a5b8b35d
'Psr\\Log\\LoggerInterface' => __DIR__ . '/..' . '/psr/log/Psr/Log/LoggerInterface.php',
'Psr\\Log\\LoggerTrait' => __DIR__ . '/..' . '/psr/log/Psr/Log/LoggerTrait.php',
'Psr\\Log\\NullLogger' => __DIR__ . '/..' . '/psr/log/Psr/Log/NullLogger.php',
'Psr\\Log\\Test\\DummyTest' => __DIR__ . '/..' . '/psr/log/Psr/Log/Test/LoggerInterfaceTest.php',
'Psr\\Log\\Test\\LoggerInterfaceTest' => __DIR__ . '/..' . '/psr/log/Psr/Log/Test/LoggerInterfaceTest.php',
'Sabre\\CalDAV\\Backend\\AbstractBackend' => __DIR__ . '/..' . '/sabre/dav/lib/CalDAV/Backend/AbstractBackend.php',
'Sabre\\CalDAV\\Backend\\BackendInterface' => __DIR__ . '/..' . '/sabre/dav/lib/CalDAV/Backend/BackendInterface.php',
'Sabre\\CalDAV\\Backend\\NotificationSupport' => __DIR__ . '/..' . '/sabre/dav/lib/CalDAV/Backend/NotificationSupport.php',
@ -367,7 +388,6 @@ class ComposerStaticInit7b34d7e50a62201ec5d5e526a5b8b35d
'Sabre\\HTTP\\URLUtil' => __DIR__ . '/..' . '/sabre/http/lib/URLUtil.php',
'Sabre\\HTTP\\Util' => __DIR__ . '/..' . '/sabre/http/lib/Util.php',
'Sabre\\HTTP\\Version' => __DIR__ . '/..' . '/sabre/http/lib/Version.php',
'Sabre\\Uri\\InvalidUriException' => __DIR__ . '/..' . '/sabre/uri/lib/InvalidUriException.php',
'Sabre\\Uri\\Version' => __DIR__ . '/..' . '/sabre/uri/lib/Version.php',
'Sabre\\VObject\\BirthdayCalendarGenerator' => __DIR__ . '/..' . '/sabre/vobject/lib/BirthdayCalendarGenerator.php',
'Sabre\\VObject\\Cli' => __DIR__ . '/..' . '/sabre/vobject/lib/Cli.php',
@ -456,6 +476,9 @@ class ComposerStaticInit7b34d7e50a62201ec5d5e526a5b8b35d
'Sabre\\Xml\\Writer' => __DIR__ . '/..' . '/sabre/xml/lib/Writer.php',
'Sabre\\Xml\\XmlDeserializable' => __DIR__ . '/..' . '/sabre/xml/lib/XmlDeserializable.php',
'Sabre\\Xml\\XmlSerializable' => __DIR__ . '/..' . '/sabre/xml/lib/XmlSerializable.php',
'Test\\Markdownify\\ConverterExtraTest' => __DIR__ . '/..' . '/pixel418/markdownify/test/ConverterExtraTest.php',
'Test\\Markdownify\\ConverterTest' => __DIR__ . '/..' . '/pixel418/markdownify/test/ConverterTest.php',
'Test\\Markdownify\\ConverterTestCase' => __DIR__ . '/..' . '/pixel418/markdownify/test/ConverterTestCase.php',
'Zotlabs\\Access\\AccessList' => __DIR__ . '/../..' . '/Zotlabs/Access/AccessList.php',
'Zotlabs\\Access\\PermissionLimits' => __DIR__ . '/../..' . '/Zotlabs/Access/PermissionLimits.php',
'Zotlabs\\Access\\PermissionRoles' => __DIR__ . '/../..' . '/Zotlabs/Access/PermissionRoles.php',

View File

@ -518,5 +518,63 @@
"keywords": [
"markdown"
]
},
{
"name": "pixel418/markdownify",
"version": "v2.2.1",
"version_normalized": "2.2.1.0",
"source": {
"type": "git",
"url": "https://github.com/Elephant418/Markdownify.git",
"reference": "0160677f04c784550dd10fd72fdf3994967db848"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/Elephant418/Markdownify/zipball/0160677f04c784550dd10fd72fdf3994967db848",
"reference": "0160677f04c784550dd10fd72fdf3994967db848",
"shasum": ""
},
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8"
},
"time": "2016-09-21T13:01:43+00:00",
"type": "lib",
"installation-source": "dist",
"autoload": {
"psr-4": {
"Markdownify\\": "src",
"Test\\Markdownify\\": "test"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"LGPL"
],
"authors": [
{
"name": "Peter Kruithof",
"email": "pkruithof@gmail.com",
"homepage": "http://pkruithof.tumblr.com/"
},
{
"name": "Milian Wolff",
"email": "mail@milianw.de",
"homepage": "http://milianw.de"
},
{
"name": "Thomas Zilliox",
"email": "hello@tzi.fr",
"homepage": "http://tzi.fr"
}
],
"description": "The HTML to Markdown converter for PHP ",
"homepage": "https://github.com/elephant418/Markdownify",
"keywords": [
"markdown",
"markdownify"
]
}
]

View File

@ -0,0 +1,76 @@
CHANGELOG
==============
21/09/2016 v2.2.1
--------------
* Fix: Moving trailing whitespace from inline elements outside of the element
* Feature: Use PSR-4
* Feature: PHP 7.0 support in continuous integration
* Doc: Update of the README
07/09/2016 v2.2.0
--------------
* Fix: Reset state between each parsing
19/02/2016 v2.1.11
--------------
* Fix: Empty table cell conversion
10/02/2016 v2.1.10
--------------
* Fix: Handle nested table.
01/04/2015 v2.1.9
--------------
* Fix: Handle HTML breaks & spaces in a less destructive way.
26/03/2015 v2.1.8
--------------
* Fix: Use alternative italic character
* Fix: Handle HTML breaks inside another tag
* Fix: Handle HTML spaces around tags
07/11/2014 v2.1.7
--------------
* Change composer name to "elephant418/markdownify"
14/07/2014 v2.1.6
--------------
* Fix: Simulate a paragraph for inline text preceding block element
* Fix: Nested lists
* Fix: setKeepHTML method
* Feature: PHP 5.5 & 5.6 support in continuous integration
16/03/2014 v2.1.5
--------------
Add display settings
* Test: Add tests for footnotes after every paragraph or not
* Feature: Allow to display link reference in paragraph, without footnotes
27/02/2014 v2.1.4
--------------
Improve how ConverterExtra handles id & class attributes:
* Feature: Allow id & class attributes on links
* Feature: Allow class attributes on headings

63
vendor/pixel418/markdownify/README.md vendored Normal file
View File

@ -0,0 +1,63 @@
# Markdownify
[![Build Status](https://travis-ci.org/Elephant418/Markdownify.png?branch=master)](https://travis-ci.org/Elephant418/Markdownify?branch=master)
[![Total Downloads](https://poser.pugx.org/pixel418/markdownify/downloads)](https://packagist.org/packages/pixel418/markdownify)
[![License LGPL](https://poser.pugx.org/pixel418/markdownify/license)](https://opensource.org/licenses/lgpl-2.1.php)
The HTML to Markdown converter for PHP
[Code example](#code-example) | [How to Install](#how-to-install) | [How to Contribute](#how-to-contribute) | [Author & Community](#author--community)
Code example
--------
### Markdown
```php
$converter = new Markdownify\Converter;
$converter->parseString('<h1>Heading</h1>');
// Returns: # Heading
```
### Markdown Extra [as defined by @michelf](http://michelf.ca/projects/php-markdown/extra/)
```php
$converter = new Markdownify\ConverterExtra;
$converter->parseString('<h1 id="md">Heading</h1>');
// Returns: # Heading {#md}
```
How to Install
--------
This library package requires `PHP 5.3` or later.<br>
Install [Composer](http://getcomposer.org/doc/01-basic-usage.md#installation) and run the following command to get the latest version:
```sh
composer require pixel418/markdownify
```
How to Contribute
--------
1. Fork the Markdownify repository
2. Create a new branch for each feature or improvement
3. Send a pull request from each feature branch to the **v2.x** branch
If you are not familiar with pull requests, you can read [the GitHub article](https://help.github.com/articles/using-pull-requests)
Author & Community
--------
Markdownify is under [LGPL License](http://opensource.org/licenses/LGPL-2.1)<br>
It was created by [Milian Wolff](http://milianw.de)<br>
It was converted to a Symfony Bundle by [Peter Kruithof](https://github.com/pkruithof)<br>
It is maintained by [Thomas ZILLIOX](http://tzi.fr)

View File

@ -0,0 +1,38 @@
{
"name": "pixel418/markdownify",
"type": "lib",
"description": "The HTML to Markdown converter for PHP ",
"keywords": ["markdown", "markdownify"],
"license": "LGPL",
"homepage": "https://github.com/elephant418/Markdownify",
"authors": [
{
"name": "Milian Wolff",
"email": "mail@milianw.de",
"homepage": "http://milianw.de"
},
{
"name": "Peter Kruithof",
"email": "pkruithof@gmail.com",
"homepage": "http://pkruithof.tumblr.com/"
},
{
"name": "Thomas Zilliox",
"email": "hello@tzi.fr",
"homepage": "http://tzi.fr"
}
],
"require": {
"php": ">=5.3.0"
},
"require-dev": {
"phpunit/phpunit": "^4.8"
},
"autoload": {
"psr-4": {
"Markdownify\\": "src",
"Test\\Markdownify\\": "test"
}
}
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,573 @@
<?php
/* This file is part of the Markdownify project, which is under LGPL license */
namespace Markdownify;
class ConverterExtra extends Converter
{
/**
* table data, including rows with content and the maximum width of each col
*
* @var array
*/
protected $table = array();
/**
* current col
*
* @var int
*/
protected $col = -1;
/**
* current row
*
* @var int
*/
protected $row = 0;
/**
 * Set up the converter for Markdown Extra output.
 *
 * All arguments are forwarded unchanged to Converter::__construct(); see
 * that constructor for their meaning. Afterwards the additional Markdown
 * Extra constructs are registered as markdownable (header and link
 * id/class attributes, tables, definition lists, footnotes, abbreviations)
 * and the lookahead regular expressions used by handleTag_table() are built.
 *
 * NOTE(review): $tableLookaheadHeader, $tdSubstitute and $tableLookaheadBody
 * are assigned below but not declared in this class. Presumably they are
 * declared in Converter; confirm, since creating dynamic properties is
 * deprecated as of PHP 8.2.
 *
 * @param mixed $linksAfterEachParagraph forwarded to the parent constructor
 * @param mixed $bodyWidth forwarded to the parent constructor
 * @param mixed $keepHTML forwarded to the parent constructor
 */
public function __construct($linksAfterEachParagraph = self::LINK_AFTER_CONTENT, $bodyWidth = MDFY_BODYWIDTH, $keepHTML = MDFY_KEEPHTML)
{
parent::__construct($linksAfterEachParagraph, $bodyWidth, $keepHTML);
// new markdownable tags & attributes
// header ids and classes: # foo {#id.class}
$this->isMarkdownable['h1']['id'] = 'optional';
$this->isMarkdownable['h1']['class'] = 'optional';
$this->isMarkdownable['h2']['id'] = 'optional';
$this->isMarkdownable['h2']['class'] = 'optional';
$this->isMarkdownable['h3']['id'] = 'optional';
$this->isMarkdownable['h3']['class'] = 'optional';
$this->isMarkdownable['h4']['id'] = 'optional';
$this->isMarkdownable['h4']['class'] = 'optional';
$this->isMarkdownable['h5']['id'] = 'optional';
$this->isMarkdownable['h5']['class'] = 'optional';
$this->isMarkdownable['h6']['id'] = 'optional';
$this->isMarkdownable['h6']['class'] = 'optional';
// tables
$this->isMarkdownable['table'] = array();
$this->isMarkdownable['th'] = array(
'align' => 'optional',
);
$this->isMarkdownable['td'] = array(
'align' => 'optional',
);
$this->isMarkdownable['tr'] = array();
// table section wrappers are added to the ignore list
array_push($this->ignore, 'thead');
array_push($this->ignore, 'tbody');
array_push($this->ignore, 'tfoot');
// definition lists
$this->isMarkdownable['dl'] = array();
$this->isMarkdownable['dd'] = array();
$this->isMarkdownable['dt'] = array();
// link id and class
$this->isMarkdownable['a']['id'] = 'optional';
$this->isMarkdownable['a']['class'] = 'optional';
// footnotes: <fnref>, <fn> and <footnotes> are the custom tags produced by parseString()
$this->isMarkdownable['fnref'] = array(
'target' => 'required',
);
$this->isMarkdownable['footnotes'] = array();
$this->isMarkdownable['fn'] = array(
'name' => 'required',
);
// tell the parser which of the custom footnote tags are block elements
$this->parser->blockElements['fnref'] = false;
$this->parser->blockElements['fn'] = true;
$this->parser->blockElements['footnotes'] = true;
// abbr
$this->isMarkdownable['abbr'] = array(
'title' => 'required',
);
// build RegEx lookaheads to decide whether a table can be parsed or not
// a cell may contain text, any tag the parser lists as non-block, and closing tags
$inlineTags = array_keys($this->parser->blockElements, false);
$colContents = '(?:[^<]|<(?:' . implode('|', $inlineTags) . '|[^a-z]))*';
$this->tableLookaheadHeader = '{
^\s*(?:<thead\s*>)?\s* # open optional thead
<tr\s*>\s*(?: # start required row with headers
<th(?:\s+align=("|\')(?:left|center|right)\1)?\s*> # header with optional align
\s*' . $colContents . '\s* # contents
</th>\s* # close header
)+</tr> # close row with headers
\s*(?:</thead>)? # close optional thead
}sxi';
// tail of a single <td> cell; handleTag_table() prepends the opening tag for each column
$this->tdSubstitute = '\s*' . $colContents . '\s* # contents
</td>\s*';
// sprintf() template: %s is replaced by the per-column cell patterns in handleTag_table()
$this->tableLookaheadBody = '{
\s*(?:<tbody\s*>)?\s* # open optional tbody
(?:<tr\s*>\s* # start row
%s # cols to be substituted
</tr>)+ # close row
\s*(?:</tbody>)? # close optional tbody
\s*</table> # close table
}sxi';
}
/**
* handle header tags (<h1> - <h6>)
*
* @param int $level 1-6
* @return void
*/
protected function handleHeader($level)
{
    if (!$this->parser->isStartTag) {
        // closing tag: append the selector remembered for the opening tag,
        // e.g. " {#id.class}"
        $opened = $this->unstack();
        if (!empty($opened['cssSelector'])) {
            $this->out(' {' . $opened['cssSelector'] . '}');
        }
    } else {
        // opening tag: remember id/class so they are available on close
        $selector = $this->getCurrentCssSelector();
        $this->parser->tagAttributes['cssSelector'] = $selector;
        $this->stack();
    }

    // the header markup itself is produced by the base converter
    parent::handleHeader($level);
}
/**
* handle <a> tags parsing
*
* @param void
* @return void
*/
protected function handleTag_a_parser()
{
    parent::handleTag_a_parser();

    // attach the id/class selector of the current tag to its attributes so
    // that handleTag_a_converter() can find it later
    $selector = $this->getCurrentCssSelector();
    $this->parser->tagAttributes['cssSelector'] = $selector;
}
/**
* handle <a> tags conversion
*
* @param array $tag
* @param string $buffer
* @return string The markdownified link
*/
protected function handleTag_a_converter($tag, $buffer)
{
    $link = parent::handleTag_a_converter($tag, $buffer);

    if (empty($tag['cssSelector'])) {
        // no id/class on this link, nothing to add
        return $link;
    }

    // [This link][id]{#id.class}
    return $link . '{' . $tag['cssSelector'] . '}';
}
/**
* handle <abbr> tags
*
* @param void
* @return void
*/
protected function handleTag_abbr()
{
    if ($this->parser->isStartTag) {
        // opening tag: remember it and capture everything up to </abbr>
        $this->stack();
        $this->buffer();
        return;
    }

    $tag = $this->unstack();
    $tag['text'] = $this->unbuffer();

    // is there already a stacked definition for the same abbreviation text?
    /** TODO: differing abbr definitions, i.e. different titles for same text **/
    $alreadyKnown = false;
    foreach ($this->stack['abbr'] as $stacked) {
        if ($stacked['text'] == $tag['text']) {
            $alreadyKnown = true;
            break;
        }
    }

    // the abbreviation itself is written as plain text
    $this->out($tag['text']);

    if (!$alreadyKnown) {
        // keep the tag stacked; flushStacked_abbr() writes its definition
        array_push($this->stack['abbr'], $tag);
    }
}
/**
* flush stacked abbr tags
*
* @param void
* @return void
*/
protected function flushStacked_abbr()
{
    $definitions = array();
    foreach ($this->stack['abbr'] as $index => $tag) {
        if (isset($tag['unstacked'])) {
            // definition was already written by an earlier flush
            continue;
        }
        $definitions[] = ' *[' . $tag['text'] . ']: ' . $tag['title'];
        // mark the stacked entry so it is not written a second time
        $this->stack['abbr'][$index]['unstacked'] = true;
    }

    if (!empty($definitions)) {
        $this->out("\n\n" . implode("\n", $definitions));
    }
}
/**
* handle <table> tags
*
* @param void
* @return void
*/
protected function handleTag_table()
{
    if ($this->parser->isStartTag) {
        // Opening <table>: decide whether the table can be converted.
        // Without keepHTML every table is converted and the column alignment
        // is collected later by handleTag_th(). With keepHTML the table is
        // only converted if the upcoming markup passes the lookahead regexes
        // built in the constructor; otherwise it is passed to handleTagToText().
        $aligns = array();
        $convertible = true;
        if ($this->keepHTML) {
            $convertible = false;
            if (preg_match($this->tableLookaheadHeader, $this->parser->html, $matches)) {
                // header seems good, now check body
                // get align & number of cols
                preg_match_all('#<th(?:\s+align=("|\')(left|right|center)\1)?\s*>#si', $matches[0], $cols);
                $regEx = '';
                // number of the capturing group (the quote character) that the
                // current column adds to the body regex
                $i = 1;
                foreach ($cols[2] as $align) {
                    $align = strtolower($align);
                    array_push($aligns, $align);
                    if (empty($align)) {
                        $align = 'left'; // default value
                    }
                    $td = '\s+align=("|\')' . $align . '\\' . $i;
                    $i++;
                    if ($align == 'left') {
                        // look for empty align or left
                        $td = '(?:' . $td . ')?';
                    }
                    $td = '<td' . $td . '\s*>';
                    $regEx .= $td . $this->tdSubstitute;
                }
                $regEx = sprintf($this->tableLookaheadBody, $regEx);
                // continue matching right after the header. $flags has to be
                // an int: passing null is deprecated as of PHP 8.1
                $offset = strlen($matches[0]);
                $convertible = (bool) preg_match($regEx, $this->parser->html, $bodyMatches, 0, $offset);
            }
        }
        if ($convertible) {
            // this is a markdownable table tag!
            $this->table = array(
                'rows' => array(),
                'col_widths' => array(),
                'aligns' => $aligns,
            );
            $this->row = 0;
        } else {
            // non markdownable table
            $this->handleTagToText();
        }
    } else {
        // Closing </table>: finally build the table in Markdown Extra syntax
        foreach (array('rows', 'col_widths', 'aligns') as $key) {
            if (!isset($this->table[$key])) {
                $this->table[$key] = array();
            }
        }
        $separator = array();
        // separator with correct align identifiers
        foreach ($this->table['aligns'] as $col => $align) {
            if (!$this->keepHTML && !isset($this->table['col_widths'][$col])) {
                break;
            }
            $left = ' ';
            $right = ' ';
            switch ($align) {
                case 'left':
                    $left = ':';
                    break;
                case 'center':
                    $right = ':';
                    $left = ':';
                    break;
                case 'right':
                    $right = ':';
                    break;
            }
            array_push($separator, $left . str_repeat('-', $this->table['col_widths'][$col]) . $right);
        }
        $separator = '|' . implode('|', $separator) . '|';
        $rows = array();
        // add padding
        array_walk_recursive($this->table['rows'], array($this, 'alignTdContent'));
        // the first collected row is used as the header row; a table without
        // any rows yields null here, which implode() must not receive
        $header = array_shift($this->table['rows']);
        if (!is_array($header)) {
            $header = array();
        }
        array_push($rows, '| ' . implode(' | ', $header) . ' |');
        array_push($rows, $separator);
        foreach ($this->table['rows'] as $row) {
            array_push($rows, '| ' . implode(' | ', $row) . ' |');
        }
        $this->out(implode("\n" . $this->indent, $rows));
        $this->table = array();
        $this->setLineBreaks(2);
    }
}
/**
* properly pad content so it is aligned as whished
* should be used with array_walk_recursive on $this->table['rows']
*
* @param string &$content
* @param int $col
* @return void
*/
protected function alignTdContent(&$content, $col)
{
    // columns without a recorded alignment are left aligned
    if (!isset($this->table['aligns'][$col])) {
        $this->table['aligns'][$col] = 'left';
    }

    // number of blanks missing to reach the width of the column
    $missing = $this->table['col_widths'][$col] - $this->strlen($content);
    $alignment = $this->table['aligns'][$col];

    if ($alignment == 'right') {
        $content = str_repeat(' ', $missing) . $content;
    } elseif ($alignment == 'center') {
        // an odd amount of padding puts the extra blank on the right
        $before = floor($missing / 2);
        $after = $missing - $before;
        $content = str_repeat(' ', $before) . $content . str_repeat(' ', $after);
    } else {
        // 'left' and every unknown value
        $content .= str_repeat(' ', $missing);
    }
}
/**
* handle <tr> tags
*
* @param void
* @return void
*/
protected function handleTag_tr()
{
    if (!$this->parser->isStartTag) {
        // row finished, following cells belong to the next one
        $this->row++;
        return;
    }
    // new row: the first cell will bump this to column 0
    $this->col = -1;
}
/**
* handle <td> tags
*
* @param void
* @return void
*/
protected function handleTag_td()
{
    if ($this->parser->isStartTag) {
        // opening tag: move on to the next column and start capturing the cell
        $this->col++;
        if (!isset($this->table['col_widths'][$this->col])) {
            $this->table['col_widths'][$this->col] = 0;
        }
        $this->buffer();
        return;
    }

    // closing tag: store the captured cell and widen the column if needed
    $cell = trim($this->unbuffer());
    $column = $this->col;
    $widest = isset($this->table['col_widths'][$column]) ? $this->table['col_widths'][$column] : 0;
    $this->table['col_widths'][$column] = max($widest, $this->strlen($cell));
    $this->table['rows'][$this->row][$column] = $cell;
}
/**
* handle <th> tags
*
* @param void
* @return void
*/
protected function handleTag_th()
{
    // handleTag_td() increments the column on the opening tag, so the
    // alignment is recorded for the column that comes next
    $upcoming = $this->col + 1;
    $recordAlign = !$this->keepHTML
        && !isset($this->table['rows'][1])
        && !isset($this->table['aligns'][$upcoming]);

    if ($recordAlign) {
        $attributes = $this->parser->tagAttributes;
        $this->table['aligns'][$upcoming] = isset($attributes['align']) ? $attributes['align'] : '';
    }

    // apart from the alignment a header cell is treated like any other cell
    $this->handleTag_td();
}
/**
* handle <dl> tags
*
* @param void
* @return void
*/
protected function handleTag_dl()
{
    if ($this->parser->isStartTag) {
        // nothing to do when the list opens
        return;
    }
    // list closed: request two line breaks
    $this->setLineBreaks(2);
}
/**
* handle <dt> tags
*
* @param void
* @return void
**/
protected function handleTag_dt()
{
    if ($this->parser->isStartTag) {
        // the term text is written by the regular text handling
        return;
    }
    // term closed: request a single line break
    $this->setLineBreaks(1);
}
/**
* handle <dd> tags
*
* @param void
* @return void
*/
protected function handleTag_dd()
{
    // markup that follows the current tag, without leading whitespace
    $upcoming = ltrim($this->parser->html);

    if (!$this->parser->isStartTag) {
        // lookahead for next dt: two line breaks if a new term follows, one otherwise
        $this->setLineBreaks(substr($upcoming, 0, 4) == '<dt>' ? 2 : 1);
        $this->indent(' ');
        return;
    }

    if (substr($upcoming, 0, 3) == '<p>') {
        // next comes a paragraph, so we'll need an extra line
        $this->out("\n" . $this->indent);
    } elseif (substr($this->output, -2) == "\n\n") {
        // drop one of the two trailing newlines
        $this->output = substr($this->output, 0, -1);
    }
    $this->out(': ');
    $this->indent(' ', false);
}
/**
* handle <fnref /> tags (custom footnote references, see markdownify_extra::parseString())
*
* @param void
* @return void
*/
protected function handleTag_fnref()
{
    // <fnref target="x" /> becomes the footnote reference [^x]
    $target = $this->parser->tagAttributes['target'];
    $this->out('[^' . $target . ']');
}
/**
* handle <fn> tags (custom footnotes, see markdownify_extra::parseString()
* and markdownify_extra::_makeFootnotes())
*
* @param void
* @return void
*/
protected function handleTag_fn()
{
    $opening = $this->parser->isStartTag;
    if ($opening) {
        // <fn name="x"> starts the footnote definition [^x]:
        $this->out('[^' . $this->parser->tagAttributes['name'] . ']:');
    }
    // one line break after the label, two after the finished footnote
    $this->setLineBreaks($opening ? 1 : 2);
    $this->indent(' ');
}
/**
* handle <footnotes> tag (custom footnotes, see markdownify_extra::parseString()
* and markdownify_extra::_makeFootnotes())
*
* @param void
* @return void
*/
protected function handleTag_footnotes()
{
    if ($this->parser->isStartTag) {
        // the wrapper itself produces no output
        return;
    }
    // footnote section closed: request two line breaks
    $this->setLineBreaks(2);
}
/**
* parse a HTML string, clean up footnotes prior
*
* @param string $HTML input
* @return string Markdown formatted output
*/
public function parseString($html)
{
    /** TODO: custom markdown-extra options, e.g. titles & classes **/

    // footnote references:
    //   <sup id="fnref:..."><a href="#fn:..." rel="footnote">...</a></sup>
    //   => <fnref target="..." />
    $referencePattern = '@<sup id="fnref:([^"]+)">\s*<a href="#fn:\1" rel="footnote">\s*\d+\s*</a>\s*</sup>@Us';
    $html = preg_replace($referencePattern, '<fnref target="$1" />', $html);

    // footnote list:
    //   <div class="footnotes">
    //   <hr />
    //   <ol>
    //   <li id="fn:...">...</li>
    //   ...
    //   </ol>
    //   </div>
    //   =>
    //   <footnotes>
    //   <fn name="...">...</fn>
    //   ...
    //   </footnotes>
    $listPattern = '#<div class="footnotes">\s*<hr />\s*<ol>\s*(.+)\s*</ol>\s*</div>#Us';
    $html = preg_replace_callback($listPattern, array($this, '_makeFootnotes'), $html);

    // the rewritten markup is converted by the regular converter
    return parent::parseString($html);
}
/**
* replace HTML representation of footnotes with something more easily parsable
*
* @note this is a callback to be used in parseString()
*
* @param array $matches
* @return string
*/
protected function _makeFootnotes($matches)
{
    // <li id="fn:1">
    // ...
    // <a href="#fnref:block" rev="footnote">&#8617;</a></p>
    // </li>
    // => <fn name="1">...</fn>
    $items = $matches[1];

    // remove footnote link
    $items = preg_replace('@\s*(&#160;\s*)?<a href="#fnref:[^"]+" rev="footnote"[^>]*>&#8617;</a>\s*@s', '', $items);
    // remove empty paragraph
    $items = preg_replace('@<p>\s*</p>@s', '', $items);
    // <li id="fn:1">...  ->  <fn name="1">...
    $items = str_replace('<li id="fn:', '<fn name="', $items);

    $wrapped = '<footnotes>' . $items . '</footnotes>';

    // close every footnote: a </li> directly in front of the next <fn or of
    // </footnotes> becomes </fn>. The pattern has no capturing group, so the
    // $1 in the replacement expands to an empty string.
    return preg_replace('#</li>\s*(?=(?:<fn|</footnotes>))#s', '</fn>$1', $wrapped);
}
/**
 * build a CSS-like selector from the id and class attributes of the
 * current parser tag, e.g. "#id.class1.class2"
 *
 * Empty id or class attributes contribute nothing, so a tag like
 * <h1 class=""> yields an empty selector instead of a lone ".".
 *
 * @param void
 * @return string the selector, empty if the tag has neither id nor class
 */
protected function getCurrentCssSelector()
{
    $cssSelector = '';
    if (isset($this->parser->tagAttributes['id'])) {
        $id = $this->decode($this->parser->tagAttributes['id']);
        // id="" must not produce a bare "#"
        if ($id !== '') {
            $cssSelector .= '#' . $id;
        }
    }
    if (isset($this->parser->tagAttributes['class'])) {
        $classes = explode(' ', $this->decode($this->parser->tagAttributes['class']));
        // drop the empty entries caused by repeated blanks
        $classes = array_filter($classes);
        // class="" or class="   " must not produce a bare "."
        if (!empty($classes)) {
            $cssSelector .= '.' . join('.', $classes);
        }
    }
    return $cssSelector;
}
}

View File

@ -0,0 +1,564 @@
<?php
/* This file is part of the Markdownify project, which is under LGPL license */
namespace Markdownify;
class Parser
{
public static $skipWhitespace = true;
public static $a_ord;
public static $z_ord;
public static $special_ords;
/**
* tags which are always empty (<br /> etc.)
*
* @var array<string>
*/
public $emptyTags = array(
'br',
'hr',
'input',
'img',
'area',
'link',
'meta',
'param',
);
/**
* tags with preformatted text
* whitespaces wont be touched in them
*
* @var array<string>
*/
public $preformattedTags = array(
'script',
'style',
'pre',
'code',
);
/**
* supress HTML tags inside preformatted tags (see above)
*
* @var bool
*/
public $noTagsInCode = false;
/**
* html to be parsed
*
* @var string
*/
public $html = '';
/**
* node type:
*
* - tag (see isStartTag)
* - text (includes cdata)
* - comment
* - doctype
* - pi (processing instruction)
*
* @var string
*/
public $nodeType = '';
/**
* current node content, i.e. either a
* simple string (text node), or something like
* <tag attrib="value"...>
*
* @var string
*/
public $node = '';
/**
* wether current node is an opening tag (<a>) or not (</a>)
* set to NULL if current node is not a tag
* NOTE: empty tags (<br />) set this to true as well!
*
* @var bool | null
*/
public $isStartTag = null;
/**
* wether current node is an empty tag (<br />) or not (<a></a>)
*
* @var bool | null
*/
public $isEmptyTag = null;
/**
* tag name
*
* @var string | null
*/
public $tagName = '';
/**
* attributes of current tag
*
* @var array (attribName=>value) | null
*/
public $tagAttributes = null;
/**
* whether or not the actual context is a inline context
*
* @var bool | null
*/
public $isInlineContext = null;
/**
* whether the current tag is a block element
*
* @var bool | null
*/
public $isBlockElement = null;
/**
* whether the previous tag (browser) is a block element
*
* @var bool | null
*/
public $isNextToInlineContext = null;
/**
* keep whitespace
*
* @var int
*/
public $keepWhitespace = 0;
/**
* list of open tags
* count this to get current depth
*
* @var array
*/
public $openTags = array();
/**
* list of block elements
*
* @var array
* TODO: what shall we do with <del> and <ins> ?!
*/
public $blockElements = array(
// tag name => <bool> is block
// block elements
'address' => true,
'blockquote' => true,
'center' => true,
'del' => true,
'dir' => true,
'div' => true,
'dl' => true,
'fieldset' => true,
'form' => true,
'h1' => true,
'h2' => true,
'h3' => true,
'h4' => true,
'h5' => true,
'h6' => true,
'hr' => true,
'ins' => true,
'isindex' => true,
'menu' => true,
'noframes' => true,
'noscript' => true,
'ol' => true,
'p' => true,
'pre' => true,
'table' => true,
'ul' => true,
// set table elements and list items to block as well
'thead' => true,
'tbody' => true,
'tfoot' => true,
'td' => true,
'tr' => true,
'th' => true,
'li' => true,
'dd' => true,
'dt' => true,
// header items and html / body as well
'html' => true,
'body' => true,
'head' => true,
'meta' => true,
'link' => true,
'style' => true,
'title' => true,
// unfancy media tags, when indented should be rendered as block
'map' => true,
'object' => true,
'param' => true,
'embed' => true,
'area' => true,
// inline elements
'a' => false,
'abbr' => false,
'acronym' => false,
'applet' => false,
'b' => false,
'basefont' => false,
'bdo' => false,
'big' => false,
'br' => false,
'button' => false,
'cite' => false,
'code' => false,
'del' => false,
'dfn' => false,
'em' => false,
'font' => false,
'i' => false,
'img' => false,
'ins' => false,
'input' => false,
'iframe' => false,
'kbd' => false,
'label' => false,
'q' => false,
'samp' => false,
'script' => false,
'select' => false,
'small' => false,
'span' => false,
'strong' => false,
'sub' => false,
'sup' => false,
'textarea' => false,
'tt' => false,
'var' => false,
);
/**
* get next node, set $this->html prior!
*
* @param void
* @return bool
*/
public function nextNode()
{
    // Advance the parser by one node (tag, text, comment, doctype, pi).
    // Returns false once $this->html is exhausted, true otherwise.
    //
    // Do not use empty() here: it considers the string "0" empty, which
    // would silently drop a trailing "0" from the document.
    if (!isset($this->html) || (string) $this->html === '') {
        // we are done with parsing the html string
        return false;
    }

    // the previously returned node was a non-empty start tag: it is open now
    if ($this->isStartTag && !$this->isEmptyTag) {
        array_push($this->openTags, $this->tagName);
        if (in_array($this->tagName, $this->preformattedTags)) {
            // dont truncate whitespaces for <code> or <pre> contents
            $this->keepWhitespace++;
        }
    }

    if ($this->html[0] == '<') {
        $token = substr($this->html, 0, 9);

        if (substr($token, 0, 2) == '<?') {
            // xml prolog or other pi's
            /** TODO **/
            $this->setNode('pi', $this->offsetAfter('>'));
            return true;
        }

        if (substr($token, 0, 4) == '<!--') {
            // comment
            $pos = strpos($this->html, '-->');
            if ($pos === false) {
                // could not find a closing -->, use next gt instead
                // this is firefox' behaviour
                $pos = $this->offsetAfter('>');
            } else {
                $pos += 3;
            }
            $this->setNode('comment', $pos);
            static::$skipWhitespace = true;
            return true;
        }

        // the doctype keyword is case-insensitive in HTML (<!doctype html>)
        if (strcasecmp($token, '<!DOCTYPE') === 0) {
            // doctype
            $this->setNode('doctype', $this->offsetAfter('>'));
            static::$skipWhitespace = true;
            return true;
        }

        if ($token == '<![CDATA[') {
            // cdata, use text node
            // remove leading <![CDATA[
            $this->html = (string) substr($this->html, 9);
            $end = strpos($this->html, ']]>');
            if ($end === false) {
                // unterminated section: keep everything that is left as
                // text instead of cutting at a bogus offset
                $this->setNode('text', strlen($this->html));
            } else {
                $this->setNode('text', $end + 3);
                // remove trailing ]]>
                $this->node = substr($this->node, 0, -3);
            }
            $this->handleWhitespaces();
            static::$skipWhitespace = true;
            return true;
        }

        if ($this->parseTag()) {
            // seems to be a tag
            // whitespace directly after a block element is insignificant
            static::$skipWhitespace = (bool) $this->isBlockElement;
            return true;
        }
        // parseTag() failed: the leading "<" was escaped by invalidTag(),
        // so the input is handled as text below
    }

    if ($this->keepWhitespace) {
        static::$skipWhitespace = false;
    }

    // when we get here it seems to be a text node
    $pos = strpos($this->html, '<');
    if ($pos === false) {
        $pos = strlen($this->html);
    }
    $this->setNode('text', $pos);
    $this->handleWhitespaces();
    if (static::$skipWhitespace && $this->node == ' ') {
        // whitespace-only text after a block element / comment: skip it
        return $this->nextNode();
    }
    $this->isInlineContext = true;
    static::$skipWhitespace = false;
    return true;
}

/**
 * offset directly behind the first occurrence of $needle in $this->html
 *
 * Falls back to the length of $this->html when $needle does not occur, so
 * that an unterminated construct consumes the remaining input instead of
 * being cut at an offset derived from strpos() returning false.
 *
 * @param string $needle
 * @return int
 */
protected function offsetAfter($needle)
{
    $found = strpos($this->html, $needle);
    if ($found === false) {
        return strlen($this->html);
    }
    return $found + strlen($needle);
}
/**
* parse tag, set tag name and attributes, see if it's a closing tag and so forth...
*
* @param void
* @return bool
*/
protected function parseTag()
{
    // Try to read one tag from the start of $this->html (which begins with
    // "<"). On success the tag is cut off $this->html, all tag related
    // properties are filled and true is returned. On failure invalidTag()
    // escapes the leading "<" and false is returned.

    // lazily initialise the character codes shared through static members
    if (!isset(static::$a_ord)) {
        static::$a_ord = ord('a');
        static::$z_ord = ord('z');
        static::$special_ords = array(
            ord(':'), // for xml:lang
            ord('-'), // for http-equiv
        );
    }

    $tagName = '';
    $pos = 1;
    // isset() avoids reading past the end when the input is a lone "<"
    $isStartTag = !isset($this->html[$pos]) || $this->html[$pos] != '/';
    if (!$isStartTag) {
        $pos++;
    }

    // get tagName: letters, and digits once the name has started (h1..h6)
    while (isset($this->html[$pos])) {
        $pos_ord = ord(strtolower($this->html[$pos]));
        $isLetter = $pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord;
        if ($isLetter || ($tagName !== '' && is_numeric($this->html[$pos]))) {
            $tagName .= $this->html[$pos];
            $pos++;
        } else {
            // step back onto the last character of the name
            $pos--;
            break;
        }
    }
    $tagName = strtolower($tagName);

    if (empty($tagName) || !isset($this->blockElements[$tagName])) {
        // something went wrong => invalid tag
        $this->invalidTag();
        return false;
    }
    if ($this->noTagsInCode && end($this->openTags) == 'code' && !($tagName == 'code' && !$isStartTag)) {
        // we supress all HTML tags inside code tags
        $this->invalidTag();
        return false;
    }

    // get tag attributes
    /** TODO: in html 4 attributes do not need to be quoted **/
    $isEmptyTag = false;
    $attributes = array();
    $currAttrib = '';
    // true once whitespace followed a collected attribute name; the name is
    // then complete and must not be merged with the next one
    $nameEnded = false;
    while (isset($this->html[$pos + 1])) {
        $pos++;
        $char = $this->html[$pos];
        // substr() never reads past the end of the string, unlike
        // $this->html[$pos + 1] on a truncated tag
        $pair = substr($this->html, $pos, 2);

        // close tag
        if ($char === '>' || $pair === '/>') {
            if ($char === '/') {
                $isEmptyTag = true;
                $pos++;
            }
            break;
        }

        $pos_ord = ord(strtolower($char));
        if (($pos_ord >= static::$a_ord && $pos_ord <= static::$z_ord) || in_array($pos_ord, static::$special_ords)) {
            // attribute name
            if ($nameEnded) {
                // previous name had no value (<option selected disabled>)
                $attributes[$currAttrib] = $currAttrib;
                $currAttrib = '';
                $nameEnded = false;
            }
            $currAttrib .= $char;
        } elseif (in_array($char, array(' ', "\t", "\n", "\r"), true)) {
            // drop whitespace, but remember that it terminated a name
            $nameEnded = ($currAttrib !== '');
        } elseif ($pair === '="' || $pair === "='") {
            // get attribute value
            $await = $pair[1]; // single or double quote
            $pos += 2;
            $value = '';
            while (isset($this->html[$pos]) && $this->html[$pos] != $await) {
                $value .= $this->html[$pos];
                $pos++;
            }
            $attributes[$currAttrib] = $value;
            $currAttrib = '';
            $nameEnded = false;
        } else {
            $this->invalidTag();
            return false;
        }
    }

    // the tag must end in ">"; $pos may point behind the input if it was cut off
    if (!isset($this->html[$pos]) || $this->html[$pos] != '>') {
        $this->invalidTag();
        return false;
    }

    if ($currAttrib !== '') {
        // html 4 allows something like <option selected> instead of <option selected="selected">
        $attributes[$currAttrib] = $currAttrib;
    }

    if (!$isStartTag) {
        if (!empty($attributes) || $tagName != end($this->openTags)) {
            // end tags must not contain any attributes
            // or maybe we did not expect a different tag to be closed
            $this->invalidTag();
            return false;
        }
        array_pop($this->openTags);
        if (in_array($tagName, $this->preformattedTags)) {
            $this->keepWhitespace--;
        }
    }

    // cut the tag off the remaining input
    $pos++;
    $this->node = substr($this->html, 0, $pos);
    $this->html = substr($this->html, $pos);
    $this->tagName = $tagName;
    $this->tagAttributes = $attributes;
    $this->isStartTag = $isStartTag;
    $this->isEmptyTag = $isEmptyTag || in_array($tagName, $this->emptyTags);
    if ($this->isEmptyTag) {
        // might be not well formed
        $this->node = preg_replace('# */? *>$#', ' />', $this->node);
    }
    $this->nodeType = 'tag';
    $this->isBlockElement = $this->blockElements[$tagName];
    $this->isNextToInlineContext = $isStartTag && $this->isInlineContext;
    $this->isInlineContext = !$this->isBlockElement;
    return true;
}
/**
* handle invalid tags
*
* @param void
* @return void
*/
protected function invalidTag()
{
    // escape the leading "<" so the remaining input is consumed as plain text
    $withoutFirstChar = substr($this->html, 1);
    $this->html = '&lt;' . $withoutFirstChar;
}
/**
* update all vars and make $this->html shorter
*
* @param string $type see description for $this->nodeType
* @param int $pos to which position shall we cut?
* @return void
*/
protected function setNode($type, $pos)
{
    // The node we are leaving was a tag: forget its tag specific state.
    // (Tags themselves are never set through this method, parseTag()
    // fills those properties on its own.)
    if ($this->nodeType == 'tag') {
        foreach (array('tagName', 'tagAttributes', 'isStartTag', 'isEmptyTag', 'isBlockElement') as $tagProperty) {
            $this->$tagProperty = null;
        }
    }

    // split the remaining input at $pos: head becomes the node, tail stays
    $remaining = $this->html;
    $this->node = substr($remaining, 0, $pos);
    $this->html = substr($remaining, $pos);
    $this->nodeType = $type;
}
/**
* check if $this->html begins with $str
*
* @param string $str
* @return bool
*/
protected function match($str)
{
    // Binary-safe prefix test. The former loose "==" comparison was subject
    // to PHP type juggling: numeric looking strings such as "1e1" and "10"
    // compared equal, and a false result of substr() compared equal to "0".
    $str = (string) $str;
    return strncmp((string) $this->html, $str, strlen($str)) === 0;
}
/**
* truncate whitespaces
*
* @param void
* @return void
*/
protected function handleWhitespaces()
{
    // while $keepWhitespace is set (<pre> or <code> before...) the node is
    // left exactly as it is; otherwise every run of whitespace collapses
    // into a single space
    if (!$this->keepWhitespace) {
        $this->node = preg_replace('#\s+#s', ' ', $this->node);
    }
}
/**
* normalize self::node
*
* @param void
* @return void
*/
protected function normalizeNode()
{
    // end tags are rebuilt from the tag name only
    if (!$this->isStartTag) {
        $this->node = '</' . $this->tagName . '>';
        return;
    }

    // start tags: name, then every attribute as name="value" with double
    // quotes inside the value escaped
    $markup = '<' . $this->tagName;
    foreach ($this->tagAttributes as $attrName => $attrValue) {
        $escapedValue = str_replace('"', '&quot;', $attrValue);
        $markup .= ' ' . $attrName . '="' . $escapedValue . '"';
    }

    // empty tags are closed in place
    $markup .= $this->isEmptyTag ? ' />' : '>';
    $this->node = $markup;
}
}