Textile.php 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578
  1. <?php
  2. /**
  3. * Zend Framework
  4. *
  5. * LICENSE
  6. *
  7. * This source file is subject to the new BSD license that is bundled
  8. * with this package in the file LICENSE.txt.
  9. * It is also available through the world-wide-web at this URL:
  10. * http://framework.zend.com/license/new-bsd
  11. * If you did not receive a copy of the license and are unable to
  12. * obtain it through the world-wide-web, please send an email
  13. * to license@zend.com so we can send you a copy immediately.
  14. *
  15. * @category Zend
  16. * @package Zend_Markup
  17. * @subpackage Parser
  18. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  19. * @license http://framework.zend.com/license/new-bsd New BSD License
  20. * @version $Id$
  21. */
  22. /**
  23. * @see Zend_Markup_TokenList
  24. */
  25. require_once 'Zend/Markup/TokenList.php';
  26. /**
  27. * @see Zend_Markup_Parser_ParserInterface
  28. */
  29. require_once 'Zend/Markup/Parser/ParserInterface.php';
  30. /**
  31. * @category Zend
  32. * @package Zend_Markup
  33. * @subpackage Parser
  34. * @copyright Copyright (c) 2005-2010 Zend Technologies USA Inc. (http://www.zend.com)
  35. * @license http://framework.zend.com/license/new-bsd New BSD License
  36. */
  37. class Zend_Markup_Parser_Textile implements Zend_Markup_Parser_ParserInterface
  38. {
  /**
   * Tokenizer states: scanning inline text, at the start of a new
   * paragraph, and directly after a single line break
   */
  const STATE_SCAN          = 0;
  const STATE_NEW_PARAGRAPH = 1;
  const STATE_NEWLINE       = 2;

  /**
   * Regex fragments for the attribute modifiers that may follow a tag:
   * "(class#id)", "{style}", "[lang]" and an alignment marker
   * ("<", "<>", ">" or "=")
   */
  const MATCH_ATTR_CLASSID = '\((?<attr_class>[a-zA-Z0-9_]+)?(?:\#(?<attr_id>[a-zA-Z0-9_]+))?\)';
  const MATCH_ATTR_STYLE   = "\{(?<attr_style>[^\}\n]+)\}";
  const MATCH_ATTR_LANG    = '\[(?<attr_lang>[a-zA-Z_]+)\]';
  const MATCH_ATTR_ALIGN   = '(?<attr_align>\<\>?|\>|=)';

  /**
   * Token tree
   *
   * @var Zend_Markup_TokenList
   */
  protected $_tree;

  /**
   * Current token
   *
   * @var Zend_Markup_Token
   */
  protected $_current;

  /**
   * Source to tokenize
   *
   * @var string
   */
  protected $_value = '';

  /**
   * Length of the value
   *
   * @var int
   */
  protected $_valueLen = 0;

  /**
   * Current pointer
   *
   * @var int
   */
  protected $_pointer = 0;

  /**
   * The buffer
   *
   * @var string
   */
  protected $_buffer = '';

  /**
   * Pending paragraph token
   *
   * The tokenizer fills this when it meets a paragraph separator and only
   * appends it to the token array once the start of the next paragraph has
   * been inspected. Declared explicitly so it is not created as a dynamic
   * property.
   *
   * @var array
   */
  protected $_temp = array();

  /**
   * Simple tag translation
   *
   * Maps the Textile marker to the tag name used for the token.
   *
   * @var array
   */
  protected $_simpleTags = array(
      '*'  => 'strong',
      '**' => 'bold',
      '_'  => 'emphasized',
      '__' => 'italic',
      '??' => 'citation',
      '-'  => 'deleted',
      '+'  => 'insert',
      '^'  => 'superscript',
      '~'  => 'subscript',
      '%'  => 'span',
      // these are a little more complicated
      '@'  => 'code',
      '!'  => 'img',
  );

  /**
   * Token array
   *
   * @var array
   */
  protected $_tokens = array();
  /**
   * Prepare the parsing of a Textile string, the real work is done in
   * {@link _tokenize()} and {@link _createTree()}
   *
   * The value is normalised to LF newlines and trimmed, split into tokens
   * and finally turned into a token tree below a 'Zend_Markup_Root' token.
   *
   * @param  string $value
   *
   * @throws Zend_Markup_Parser_Exception when $value is not a string or is empty
   * @return Zend_Markup_TokenList
   */
  public function parse($value)
  {
      if (!is_string($value)) {
          /**
           * @see Zend_Markup_Parser_Exception
           */
          require_once 'Zend/Markup/Parser/Exception.php';
          throw new Zend_Markup_Parser_Exception('Value to parse should be a string.');
      }

      // NOTE(review): empty() also rejects the string "0" - confirm whether intended
      if (empty($value)) {
          /**
           * @see Zend_Markup_Parser_Exception
           */
          require_once 'Zend/Markup/Parser/Exception.php';
          throw new Zend_Markup_Parser_Exception('Value to parse cannot be left empty.');
      }

      // first make sure we only have LF newlines, also trim the value
      $this->_value = str_replace(array("\r\n", "\r"), "\n", $value);
      $this->_value = trim($this->_value);

      // initialize variables and tokenize
      //
      // The pointer is advanced with strlen() and handed to preg_match() as
      // its offset, so it counts bytes. The length it is compared against
      // has to be a byte count as well; a character count would end the
      // tokenizing early on multibyte input and lose the tail of the text.
      $this->_valueLen = strlen($this->_value);
      $this->_pointer  = 0;
      $this->_buffer   = '';
      $this->_temp     = array();
      $this->_tokens   = array();

      $this->_tokenize();

      // create the tree
      $this->_tree    = new Zend_Markup_TokenList();
      $this->_current = new Zend_Markup_Token('', Zend_Markup_Token::TYPE_NONE, 'Zend_Markup_Root');
      $this->_tree->addChild($this->_current);

      $this->_createTree();

      return $this->_tree;
  }
  /**
   * Tokenize a textile string
   *
   * Walks over the value with a small state machine and appends the tokens
   * it finds to the token array:
   *
   * - STATE_NEW_PARAGRAPH: emits the paragraph token and looks for a block
   *   modifier ("h1." - "h6.", "p.", "#" or "*") at the paragraph start
   * - STATE_NEWLINE: after a single line break, looks for a heading or a
   *   list marker at the start of the line
   * - STATE_SCAN: collects text up to the next line break, paragraph
   *   separator, inline tag, link or acronym
   *
   * The pointer is a byte offset into the value: it is advanced with
   * strlen() and passed to preg_match() as offset.
   *
   * @return void
   */
  protected function _tokenize()
  {
      $state = self::STATE_NEW_PARAGRAPH;

      // every attribute modifier that is allowed directly behind a tag
      $attrsMatch = implode('|', array(
          self::MATCH_ATTR_CLASSID,
          self::MATCH_ATTR_STYLE,
          self::MATCH_ATTR_LANG,
          self::MATCH_ATTR_ALIGN
      ));

      while ($this->_pointer < $this->_valueLen) {
          switch ($state) {
              case self::STATE_SCAN:
                  $matches = array(); //[^\n*_?+~%@!-]
                  // NOTE(review): the "i" modifier below also lets [A-Z]{2,}
                  // match lower-case words - confirm whether that is intended
                  $acronym = '(?<acronym>[A-Z]{2,})\((?<title>[^\)]+)\)';
                  $regex   = '#\G(?<text>.*?)(?:'
                           . "(?:(?<nl_paragraph>\n{2,})|(?<nl_break>\n))|"
                           . '(?<tag>'
                           . "(?<name>\*{1,2}|_{1,2}|\?{2}|\-|\+|\~|\^|%|@|!|$|{$acronym}"
                           . '|":(?<url>[^\s]+)|")'
                           . "(?:{$attrsMatch})*)"
                           . ')#si';

                  if (!preg_match($regex, $this->_value, $matches, 0, $this->_pointer)
                      || ($matches[0] === '')
                  ) {
                      // The pattern failed or consumed nothing, so the pointer
                      // could never advance. Keep the rest as plain text and
                      // stop; it is flushed after the loop.
                      $this->_buffer .= (string) substr($this->_value, $this->_pointer);
                      $this->_pointer = $this->_valueLen;
                      break;
                  }

                  $this->_pointer += strlen($matches[0]);

                  // compare against '' instead of using empty(), a text of
                  // exactly "0" has to be kept
                  if ($matches['text'] !== '') {
                      $this->_buffer .= $matches['text'];
                  }

                  // first add the buffer
                  if ($this->_buffer !== '') {
                      $this->_tokens[] = array(
                          'tag'  => $this->_buffer,
                          'type' => Zend_Markup_Token::TYPE_NONE
                      );
                      $this->_buffer = '';
                  }

                  if (!empty($matches['nl_paragraph'])) {
                      // hold the paragraph token back, the next state decides
                      // how it is added
                      $this->_temp = array(
                          'tag'        => $matches['nl_paragraph'],
                          'name'       => 'p',
                          'type'       => Zend_Markup_Token::TYPE_TAG,
                          'attributes' => array()
                      );

                      $state = self::STATE_NEW_PARAGRAPH;
                  } elseif (!empty($matches['nl_break'])) {
                      $this->_tokens[] = array(
                          'tag'        => $matches['nl_break'],
                          'name'       => 'break',
                          'type'       => Zend_Markup_Token::TYPE_TAG,
                          'attributes' => array()
                      );

                      $state = self::STATE_NEWLINE;
                  } elseif (!empty($matches['tag'])) {
                      if (isset($this->_simpleTags[$matches['name']])) {
                          // now add the new token
                          $this->_tokens[] = array(
                              'tag'        => $matches['tag'],
                              'type'       => Zend_Markup_Token::TYPE_TAG,
                              'name'       => $this->_simpleTags[$matches['name']],
                              'attributes' => $this->_extractAttributes($matches)
                          );
                      } else {
                          $attributes = $this->_extractAttributes($matches);

                          if ($matches['tag'][0] == '"') {
                              // a link: either the opening quote or the
                              // closing '":url' part
                              if (isset($matches['url'])) {
                                  $attributes['url'] = $matches['url'];
                              }

                              $this->_tokens[] = array(
                                  'tag'        => $matches['tag'],
                                  'type'       => Zend_Markup_Token::TYPE_TAG,
                                  'name'       => 'url',
                                  'attributes' => $attributes
                              );
                          } else {
                              // an acronym becomes three tokens: the opening
                              // tag, the acronym text and the closing tag
                              $this->_tokens[] = array(
                                  'tag'        => '',
                                  'type'       => Zend_Markup_Token::TYPE_TAG,
                                  'name'       => 'acronym',
                                  'attributes' => array(
                                      'title' => $matches['title']
                                  )
                              );
                              $this->_tokens[] = array(
                                  'tag'  => $matches['acronym'],
                                  'type' => Zend_Markup_Token::TYPE_NONE
                              );
                              $this->_tokens[] = array(
                                  'tag'        => '(' . $matches['title'] . ')',
                                  'type'       => Zend_Markup_Token::TYPE_TAG,
                                  'name'       => 'acronym',
                                  'attributes' => array()
                              );
                          }
                      }

                      $state = self::STATE_SCAN;
                  }
                  break;

              case self::STATE_NEW_PARAGRAPH:
                  if (empty($this->_temp)) {
                      // very first paragraph, there is no separator in front
                      $this->_temp = array(
                          'tag'        => '',
                          'name'       => 'p',
                          'type'       => Zend_Markup_Token::TYPE_TAG,
                          'attributes' => array()
                      );
                  } else {
                      // the first newline of the separator gets its own
                      // paragraph token, the pending token keeps the rest
                      $this->_tokens[] = array(
                          'tag'        => "\n",
                          'name'       => 'p',
                          'type'       => Zend_Markup_Token::TYPE_TAG,
                          'attributes' => array()
                      );
                      $this->_temp['tag'] = substr($this->_temp['tag'], 1);
                  }

                  $matches = array(); //[^\n*_?+~%@!-] (\()? [^()]+ (?(1)\))
                  $regex   = "#\G(?<name>(h[1-6]|p)|(?:\#|\*))(?:{$attrsMatch})*(?(2)\.\s|\s)#i";

                  if (!preg_match($regex, $this->_value, $matches, 0, $this->_pointer)) {
                      // no block modifier, a plain paragraph
                      $this->_tokens[] = $this->_temp;
                      $state = self::STATE_SCAN;
                      break;
                  }

                  $this->_pointer += strlen($matches[0]);

                  if ($matches['name'] == 'p') {
                      // "p." only decorates the paragraph token itself
                      $this->_temp['tag']       .= $matches[0];
                      $this->_temp['attributes'] = $this->_extractAttributes($matches);

                      $this->_tokens[] = $this->_temp;
                      $this->_temp     = array();
                  } else {
                      $this->_tokens[] = $this->_temp;
                      $this->_temp     = array();

                      $name       = $matches['name'];
                      $attributes = $this->_extractAttributes($matches);

                      if ($name == '#') {
                          $name               = 'list';
                          $attributes['list'] = 'decimal';
                      } elseif ($name == '*') {
                          $name = 'list';
                      }

                      $this->_tokens[] = array(
                          'tag'        => $matches[0],
                          'name'       => $name,
                          'type'       => Zend_Markup_Token::TYPE_TAG,
                          'attributes' => $attributes
                      );
                  }

                  $state = self::STATE_SCAN;
                  break;

              case self::STATE_NEWLINE:
                  $matches = array(); //[^\n*_?+~%@!-]
                  $regex   = "#\G(?<name>(h[1-6])|(?:\#|\*))(?:{$attrsMatch})*(?(2)\.\s|\s)#si";

                  if (!preg_match($regex, $this->_value, $matches, 0, $this->_pointer)) {
                      $state = self::STATE_SCAN;
                      break;
                  }

                  $this->_pointer += strlen($matches[0]);

                  $name       = $matches['name'];
                  $attributes = $this->_extractAttributes($matches);

                  if ($name == '#') {
                      $name               = 'list';
                      $attributes['list'] = 'decimal';
                  } elseif ($name == '*') {
                      $name = 'list';
                  }

                  $this->_tokens[] = array(
                      'tag'        => $matches[0],
                      'name'       => $name,
                      'type'       => Zend_Markup_Token::TYPE_TAG,
                      'attributes' => $attributes
                  );
                  // the state is left unchanged here; the next pass falls
                  // back to STATE_SCAN as soon as the pattern stops matching
                  break;
          }
      }

      // flush whatever is still waiting in the buffer
      if ($this->_buffer !== '') {
          $this->_tokens[] = array(
              'tag'  => $this->_buffer,
              'type' => Zend_Markup_Token::TYPE_NONE
          );
          $this->_buffer = '';
      }
  }
  /**
   * Create a tree from the tokenized text
   *
   * Every token is either a stopper for the current token (the current
   * token is closed and the tree is walked upwards), a tag (a new child
   * that becomes the current token) or plain text (a child of the current
   * token).
   *
   * @return void
   */
  protected function _createTree()
  {
      // switched off while an img or url tag is open, tags found in there
      // are stored as plain text until a stopper is met
      $tagsEnabled = true;

      foreach ($this->_tokens as $position => $token) {
          if ($this->_isStopper($token, $this->_current)) {
              if ($this->_current->getName() == 'li') {
                  // list items are handled differently: the list stays
                  // open when the next token starts another item
                  $following  = $position + 1;
                  $listGoesOn = isset($this->_tokens[$following])
                      && ($this->_tokens[$following]['type'] == Zend_Markup_Token::TYPE_TAG)
                      && ($this->_tokens[$following]['name'] == 'list');

                  // close the item and step up to its list
                  $this->_current->setStopper($token['tag']);
                  $this->_current = $this->_current->getParent();

                  if (!$listGoesOn) {
                      // leave the list as well ...
                      $this->_current = $this->_current->getParent();

                      // ... and go up in the tree until we found the end
                      while ($this->_isStopper($token, $this->_current)) {
                          $this->_current->setStopper($token['tag']);
                          $this->_current = $this->_current->getParent();
                      }
                  }
              } else {
                  // go up in the tree until we found the end of stoppers,
                  // the current token is already known to be stopped
                  do {
                      $this->_current->setStopper($token['tag']);

                      if (!empty($token['attributes'])) {
                          foreach ($token['attributes'] as $attribute => $content) {
                              $this->_current->addAttribute($attribute, $content);
                          }
                      }

                      $this->_current = $this->_current->getParent();
                  } while ($this->_isStopper($token, $this->_current));
              }

              $tagsEnabled = true;
              continue;
          }

          $isTag = ($token['type'] == Zend_Markup_Token::TYPE_TAG) && $tagsEnabled;

          if (!$isTag || ($token['name'] == 'break')) {
              // text, an ignored tag or a line break: add it as text and
              // continue parsing
              $this->_current->addChild(new Zend_Markup_Token(
                  $token['tag'],
                  Zend_Markup_Token::TYPE_NONE,
                  '',
                  array(),
                  $this->_current
              ));
              continue;
          }

          if ($token['name'] == 'list') {
              // handle a list item, the 'list' attribute belongs to the list
              $listAttributes = array();
              if (isset($token['attributes']['list'])) {
                  $listAttributes['list'] = $token['attributes']['list'];
                  unset($token['attributes']['list']);
              }

              if ($this->_current->getName() != 'list') {
                  // the list isn't started yet, create it
                  $list = new Zend_Markup_Token(
                      '',
                      Zend_Markup_Token::TYPE_TAG,
                      'list',
                      $listAttributes,
                      $this->_current
                  );
                  $this->_current->addChild($list);
                  $this->_current = $list;
              }

              $token['name'] = 'li';
          } elseif (($token['name'] == 'img') || ($token['name'] == 'url')) {
              $tagsEnabled = false;
          }

          // add the token and descend into it
          $node = new Zend_Markup_Token(
              $token['tag'],
              Zend_Markup_Token::TYPE_TAG,
              $token['name'],
              $token['attributes'],
              $this->_current
          );
          $this->_current->addChild($node);
          $this->_current = $node;
      }
  }
  /**
   * Check if a tag is a stopper
   *
   * Headings, lists and list items are stopped by a 'break' or 'p' tag,
   * a 'break' is never stopped, every other token is stopped by a tag
   * that carries its own name.
   *
   * @param array $token
   * @param Zend_Markup_Token $current
   *
   * @return bool
   */
  protected function _isStopper(array $token, Zend_Markup_Token $current)
  {
      // only tags can stop something
      if ($token['type'] != Zend_Markup_Token::TYPE_TAG) {
          return false;
      }

      $currentName = $current->getName();
      $lineBlocks  = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'list', 'li');

      // loose comparison on purpose, just like a switch statement
      if (in_array($currentName, $lineBlocks)) {
          return ($token['name'] == 'break') || ($token['name'] == 'p');
      }

      if ($currentName == 'break') {
          return false;
      }

      return $token['name'] == $currentName;
  }
  /**
   * Extract the attributes
   *
   * Copies the non-empty attribute groups of a regex match (attr_class,
   * attr_id, attr_style, attr_lang) to their attribute name and translates
   * the alignment marker in attr_align to 'center', 'right', 'justify' or
   * 'left'.
   *
   * @param array $matches
   *
   * @return array
   */
  protected function _extractAttributes(array $matches)
  {
      // match group => attribute name, in the order they are returned
      $plainGroups = array(
          'attr_class' => 'class',
          'attr_id'    => 'id',
          'attr_style' => 'style',
          'attr_lang'  => 'lang',
      );

      $attributes = array();
      foreach ($plainGroups as $group => $attribute) {
          if (!empty($matches[$group])) {
              $attributes[$attribute] = $matches[$group];
          }
      }

      if (empty($matches['attr_align'])) {
          return $attributes;
      }

      $marker = $matches['attr_align'];
      if ($marker == '=') {
          $attributes['align'] = 'center';
      } elseif ($marker == '>') {
          $attributes['align'] = 'right';
      } elseif ($marker == '<>') {
          $attributes['align'] = 'justify';
      } else {
          // '<' and everything unknown
          $attributes['align'] = 'left';
      }

      return $attributes;
  }
  511. }