|
|
<?php
namespace App\Parser\HtmlParser;
use DOMDocument; use Illuminate\Support\Facades\Log;
class ParseHtml {
/**
 * Read an HTML file and turn it into the parser's structured array.
 *
 * The file content is loaded into a DOMDocument (libxml errors are collected
 * internally instead of being emitted) and handed to parseLoadedHtml().
 *
 * @param mixed $file Value passed as-is to file_get_contents() and concatenated
 *                    into the log messages — presumably a filesystem path;
 *                    confirm with callers.
 *
 * @return mixed Whatever parseLoadedHtml() returns for the loaded document.
 *
 * @throws \RuntimeException When the file cannot be read or is empty.
 * @throws \Throwable        Any other failure is logged and re-thrown unchanged.
 */
public function fromUploadedFile($file)
{
    // Remember the caller's libxml error mode so it can be restored afterwards.
    $previousErrorMode = libxml_use_internal_errors(true);

    try {
        $htmlDom = new DOMDocument();
        Log::info('Parse html from file:'.$file);

        $htmlString = file_get_contents($file);
        if ($htmlString === false || $htmlString === '') {
            // loadHTML() cannot work with a missing/empty source; fail with a clear message.
            throw new \RuntimeException('Unable to read HTML from file: '.$file);
        }

        $htmlDom->loadHTML($htmlString);
        // NOTE(review): this is assigned after loadHTML(), so it presumably does not
        // influence how the markup above was parsed — kept in place to preserve behaviour.
        $htmlDom->preserveWhiteSpace = false;

        return $this->parseLoadedHtml($htmlDom);
    } catch (\Throwable $exception) {
        // Report the failure and let the caller decide how to handle it.
        Log::error('Failed to parse html from file:'.$file.' - '.$exception->getMessage());

        throw $exception;
    } finally {
        // Drop the buffered libxml errors and restore the previous error mode.
        libxml_clear_errors();
        libxml_use_internal_errors($previousErrorMode);
    }
}
/**
 * Convert a loaded DOM document into the list of parsed top-level blocks.
 *
 * The first <body> element is converted with domToArray() and
 * buildTheParsedResponse(); every resulting item whose type is not "table" is
 * rendered through handleChildrens(), its markup balanced with closetags(),
 * and a tag-free "clean_content" is added. The collected items are finally
 * passed through fixChildrenStructure().
 *
 * @param \DOMDocument $htmlDom Document that has already been loaded.
 *
 * @return array Parsed blocks; an empty array when the document has no <body>.
 */
private function parseLoadedHtml($htmlDom)
{
    $page = $htmlDom->getElementsByTagName('body')->item(0);
    if ($page === null) {
        // No <body> element at all: there is nothing to convert.
        return [];
    }

    $response = [];
    $dataStructuredArray = $this->buildTheParsedResponse($this->domToArray($page));

    foreach ($dataStructuredArray as $item) {
        // Tables (and items without a type) are skipped at this level.
        if (! isset($item['_type']) || $item['_type'] === 'table') {
            continue;
        }

        $data = $this->handleChildrens($item);
        if (! isset($data['content'])) {
            continue;
        }

        $data['content'] = $this->closetags($data['content']);
        // Plain-text variant: tags stripped, runs of line breaks/tabs collapsed to one space.
        $data['clean_content'] = preg_replace("/(\r\n|\t|\r|\n)+/", " ", strip_tags($data['content']));
        $response[] = $data;
    }

    return $this->fixChildrenStructure($response);
}
/**
 * Recursively convert a DOM node into a nested associative array.
 *
 * Element nodes produce "_type" (the node name), "_numberOfChildren" and, when
 * at least one child converts to a non-empty array, "_children". <ol> elements
 * additionally get "_startFrom" (the "start" attribute, or 1 when absent).
 * Text and CDATA nodes produce "_type" => "_text" and "_content" unless they
 * consist only of spaces, tabs and line breaks. Attributes, when present, are
 * stored under "_attributes". Any other node type yields an empty array.
 *
 * @param \DOMNode $root Node to convert.
 *
 * @return array Array representation of the node (possibly empty).
 */
private function domToArray($root)
{
    $result = [];

    if ($root->nodeType === XML_ELEMENT_NODE) {
        // Handle classic element node.
        $result['_type'] = $root->nodeName;

        if ($root->nodeName === 'ol') {
            $result['_startFrom'] = $root->hasAttribute('start') ? $root->getAttribute('start') : 1;
        }

        $result['_numberOfChildren'] = $root->childNodes->length;

        if ($root->hasChildNodes()) {
            $children = $root->childNodes;
            for ($i = 0; $i < $children->length; $i++) {
                $child = $this->domToArray($children->item($i));
                // Don't keep text nodes that contain only spaces and newlines.
                if (! empty($child)) {
                    $result['_children'][] = $child;
                }
            }
        }
    } elseif ($root->nodeType === XML_TEXT_NODE || $root->nodeType === XML_CDATA_SECTION_NODE) {
        // Handle text node.
        $value = $root->nodeValue;

        // Compare against '' explicitly: empty() would also discard the text "0".
        if ($value !== null && $value !== '') {
            $cleanText = (string) preg_replace("/(\r\n|\t|\r|\n)+/", " ", $value);
            if (str_replace(' ', '', $cleanText) !== '') {
                $result['_type'] = '_text';
                $result['_content'] = ltrim($cleanText);
            }
        }
    }

    // List attributes.
    if ($root->hasAttributes()) {
        foreach ($root->attributes as $attribute) {
            $result['_attributes'][$attribute->name] = $attribute->value;
        }
    }

    return $result;
}
/**
 * Reshape the children of a domToArray() node into the parser's item format.
 *
 * For every child:
 *  - "_text" children become ['_type' => '_text', 'content' => parseParagraph(child)];
 *  - children that have their own "_children" are processed recursively; a
 *    non-empty result is stored under "children" for ul/ol (plus "start" when
 *    "_startFrom" is present) and under "content" for every other type;
 *  - anything else is dropped.
 * "_attributes" of a kept child are copied over unchanged.
 *
 * @param array $htmElementsAsArray Node array as produced by domToArray().
 *
 * @return array List of reshaped child items; empty when the node has no children.
 */
private function buildTheParsedResponse(array $htmElementsAsArray): array
{
    $parsedResponse = [];

    // domToArray() omits "_children" for childless elements, so default to an empty list.
    $children = $htmElementsAsArray['_children'] ?? [];

    foreach ($children as $elementArray) {
        $data = [];

        if ($elementArray['_type'] === '_text') {
            $data['_type'] = $elementArray['_type'];
            $data['content'] = $this->parseParagraph($elementArray);
        } elseif (isset($elementArray['_children'])) {
            $parsedResponseData = $this->buildTheParsedResponse($elementArray);

            if (! empty($parsedResponseData)) {
                $data['_type'] = $elementArray['_type'];

                if (in_array($elementArray['_type'], ['ul', 'ol'], true)) {
                    if (isset($elementArray['_startFrom'])) {
                        $data['start'] = $elementArray['_startFrom'];
                    }
                    $data['children'] = $parsedResponseData;
                } else {
                    $data['content'] = $parsedResponseData;
                }
            }
        }

        if (! empty($data)) {
            if (isset($elementArray['_attributes'])) {
                $data['_attributes'] = $elementArray['_attributes'];
            }
            $parsedResponse[] = $data;
        }
    }

    return $parsedResponse;
}
/**
 * Replace "empty" HTML elements in a string.
 *
 * An element matches when its opening and closing tag carry the same text
 * between the angle brackets and nothing but whitespace (or another match of
 * the same pattern) sits in between. Non-string or blank input is returned
 * untouched.
 *
 * @param mixed $str   Markup to clean.
 * @param mixed $repto Replacement for each match; anything that is not a
 *                     string means "replace with nothing".
 *
 * @return mixed The input itself when it is not a non-blank string, otherwise
 *               the result of preg_replace().
 */
private function remove_empty_tags_recursive($str, $repto = null)
{
    // Nothing to do for non-strings and for strings holding only whitespace.
    $nothingToClean = ! is_string($str) || trim($str) === '';
    if ($nothingToClean) {
        return $str;
    }

    // Pattern written by Junaid Atari.
    $emptyElementPattern = '/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU';
    $replacement = is_string($repto) ? $repto : '';

    return preg_replace($emptyElementPattern, $replacement, $str);
}
/**
 * Balance the HTML tags of a fragment and strip empty elements.
 *
 * Walks through every tag in $text while keeping a stack of open elements:
 * unmatched closing tags are dropped, unclosed elements are closed (in the
 * right order), known single tags are written in self-closing form
 * ("<br />"), and elements still open at the end are closed. The balanced
 * markup is finally passed through remove_empty_tags_recursive().
 *
 * The algorithm and its "WP" comments appear to be adapted from WordPress'
 * force_balance_tags().
 *
 * @param string $text HTML fragment, possibly with unbalanced tags.
 *
 * @return mixed Balanced fragment as returned by remove_empty_tags_recursive().
 */
private function closetags($text)
{
    $tagstack = [];
    $stacksize = 0;
    $tagqueue = '';
    $newtext = '';

    // Known single-entity/self-closing tags.
    $single_tags = [
        'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame',
        'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source',
    ];
    // Tags that can be immediately nested within themselves.
    $nestable_tags = ['blockquote', 'div', 'object', 'q', 'span'];

    // WP bug fix for comments - in case you REALLY meant to type '< !--'.
    // The text is parked behind a placeholder and restored after balancing.
    $text = str_replace('< !--', '<    !--', $text);
    // WP bug fix for LOVE <3 (and other situations with '<' before a number):
    // escape the bracket so it is not mistaken for the start of a tag.
    $text = preg_replace('#<([0-9]{1})#', '&lt;$1', $text);

    /*
     * Matches supported tags.
     *
     * Every explanatory comment sits on its own line so that a comment can
     * never swallow one of the concatenated pattern fragments.
     *
     * @see https://html.spec.whatwg.org/#elements-2
     * @see https://w3c.github.io/webcomponents/spec/custom/#valid-custom-element-name
     */
    $tag_pattern = (
        // Start with an opening bracket.
        '#<'.
        // Group 1 - If it's a closing tag it'll have a leading slash.
        '(/?)'.
        // Group 2 - Tag name.
        '('.
            // Custom element tags have more lenient rules than HTML tag names.
            '(?:[a-z](?:[a-z0-9._]*)-(?:[a-z0-9._-]+)+)'.
            '|'.
            // Traditional tag rules approximate HTML tag names.
            '(?:[\w:]+)'.
        ')'.
        '(?:'.
            // We either immediately close the tag with its '>' and have nothing here.
            '\s*'.
            // Group 3 - "attributes" for empty tag.
            '(/?)'.
            '|'.
            // Or we must start with space characters to separate the tag name
            // from the attributes (or whitespace).
            // Group 4 - Pre-attribute whitespace.
            '(\s+)'.
            // Group 5 - Attributes.
            '([^>]*)'.
        ')'.
        // End with a closing bracket.
        '>#'
    );

    while (preg_match($tag_pattern, $text, $regex)) {
        $full_match = $regex[0];
        $has_leading_slash = ! empty($regex[1]);
        $tag_name = $regex[2];
        $tag = strtolower($tag_name);
        $is_single_tag = in_array($tag, $single_tags, true);
        $pre_attribute_ws = isset($regex[4]) ? $regex[4] : '';
        $attributes = trim(isset($regex[5]) ? $regex[5] : $regex[3]);
        $has_self_closer = '/' === substr($attributes, -1);

        $newtext .= $tagqueue;

        $i = strpos($text, $full_match);
        $l = strlen($full_match);

        // Clear the shifter.
        $tagqueue = '';

        if ($has_leading_slash) {
            // End tag.
            if ($stacksize <= 0) {
                // Too many closing tags: drop this one.
                $tag = '';
            } elseif ($tagstack[$stacksize - 1] === $tag) {
                // Stack top equals the closing tag: emit it and pop.
                $tag = '</'.$tag.'>';
                array_pop($tagstack);
                $stacksize--;
            } else {
                // Closing tag not at top, search for it.
                for ($j = $stacksize - 1; $j >= 0; $j--) {
                    if ($tagstack[$j] === $tag) {
                        // Queue closing tags for everything opened since.
                        for ($k = $stacksize - 1; $k >= $j; $k--) {
                            $tagqueue .= '</'.array_pop($tagstack).'>';
                            $stacksize--;
                        }
                        break;
                    }
                }
                $tag = '';
            }
        } else {
            // Begin tag.
            if ($has_self_closer) {
                // It presents itself as a self-closing tag, but it isn't a known
                // single-entity tag: don't let it be treated as such and close it
                // immediately (the tag will encapsulate no text as a result).
                if (! $is_single_tag) {
                    $attributes = trim(substr($attributes, 0, -1))."></$tag";
                }
            } elseif ($is_single_tag) {
                // A known single-entity tag that doesn't close itself: do so.
                $pre_attribute_ws = ' ';
                $attributes .= '/';
            } else {
                // Not a single-entity tag. If the top of the stack is the same
                // (non-nestable) tag we want to push, close the previous one first.
                if ($stacksize > 0 && ! in_array($tag, $nestable_tags, true) && $tagstack[$stacksize - 1] === $tag) {
                    $tagqueue = '</'.array_pop($tagstack).'>';
                    $stacksize--;
                }
                $stacksize = array_push($tagstack, $tag);
            }

            // Attributes: we need some space - avoid <br/> and prefer <br />.
            if ($has_self_closer && $is_single_tag) {
                $pre_attribute_ws = ' ';
            }

            $tag = '<'.$tag.$pre_attribute_ws.$attributes.'>';

            // If already queuing a close tag, then put this tag on too.
            if (! empty($tagqueue)) {
                $tagqueue .= $tag;
                $tag = '';
            }
        }

        $newtext .= substr($text, 0, $i).$tag;
        $text = substr($text, $i + $l);
    }

    // Clear tag queue.
    $newtext .= $tagqueue;

    // Add remaining text.
    $newtext .= $text;

    // Add remaining tags to close.
    while ($x = array_pop($tagstack)) {
        $newtext .= '</'.$x.'>';
    }

    // WP fix for the bug with HTML comments: normalise real comment openers,
    // then bring the parked '< !--' text back.
    $newtext = str_replace('< !--', '<!--', $newtext);
    $newtext = str_replace('<    !--', '< !--', $newtext);

    return $this->remove_empty_tags_recursive($newtext);
}
/**
 * Wrap the text of a "_text" element in the parser's content structure.
 *
 * When $type is truthy its pieces are joined, prepended to the text and the
 * result is balanced with closetags(); otherwise the text is used unchanged.
 *
 * @param array      $elementArray Element array holding the text under "_content".
 * @param array|null $type         Optional list of string pieces to put in front of the text.
 * @param mixed      $number       Not used by this method.
 *
 * @return array Array with the single key "_content".
 */
private function parseParagraph($elementArray, $type = null, $number = null)
{
    if ($type) {
        $text = $this->closetags(implode('', $type).$elementArray['_content']);
    } else {
        $text = $elementArray['_content'];
    }

    return ['_content' => $text];
}
/**
 * Render one parsed item (and its descendants) into markup plus child entries.
 *
 * The result always starts with "content" set to the item's opening tag:
 *  - ol/ul: every entry of "children" is rendered; entries found inside a
 *    child's "content" get a "clean_content" and, while a start number is
 *    known and the clean text is not blank, a running "numbering_row";
 *  - div: every entry of "content" is rendered into "children";
 *  - anything else: the entries of "content" are concatenated into one markup
 *    string which is balanced with closetags() after the last entry.
 * Items of type "table" are not rendered at all.
 *
 * NOTE(review): a "_text" item ends up in the generic branch, where its
 * "content" is the associative array produced by parseParagraph() rather than
 * a list of items — confirm that this case is handled as intended.
 *
 * @param array $data   Item as produced by buildTheParsedResponse().
 * @param array $parsed Accumulator to start from; its "content" is overwritten.
 *
 * @return array|null Rendered item, or null for tables.
 */
private function handleChildrens($data, $parsed = [])
{
    if ($data['_type'] === 'table') {
        return null;
    }

    $parsed['content'] = '<'.$data['_type'].'>';

    if (in_array($data['_type'], ['ol', 'ul'])) {
        $parsed['children'] = [];

        if (isset($data['start'])) {
            $startFrom = $data['start'];
        }

        foreach ($data['children'] as $listChild) {
            // A nested list may carry its own start number.
            if (isset($listChild['start'])) {
                $startFrom = $listChild['start'];
            }

            if (! isset($listChild['content'])) {
                $rendered = $this->handleChildrens($listChild);
                $rendered['clean_content'] = preg_replace("/(\r\n|\t|\r|\n)+/", " ", strip_tags($rendered['content']));
                $parsed['children'][] = $rendered;
                continue;
            }

            foreach ($listChild['content'] as $li) {
                $rendered = $this->handleChildrens($li);
                if (! isset($rendered['content'])) {
                    continue;
                }

                $rendered['clean_content'] = preg_replace("/(\r\n|\t|\r|\n)+/", " ", strip_tags($rendered['content']));

                // Only entries with visible text consume a number.
                if (isset($startFrom) && strlen(trim($rendered['clean_content'])) > 0) {
                    $rendered['numbering_row'] = $startFrom;
                    $startFrom++;
                }

                $parsed['children'][] = $rendered;
            }
        }

        return $parsed;
    }

    if ($data['_type'] === 'div') {
        foreach ($data['content'] as $divChild) {
            $rendered = $this->handleChildrens($divChild);
            if (isset($rendered['content'])) {
                $rendered['clean_content'] = preg_replace("/(\r\n|\t|\r|\n)+/", " ", strip_tags($rendered['content']));
                $rendered['content'] = $this->closetags($rendered['content']);
            }
            $parsed['children'][] = $rendered;
        }

        return $parsed;
    }

    $contentChilds = count($data['content']);
    foreach ($data['content'] as $index => $piece) {
        // Start from whatever markup has been collected so far.
        $collected = $parsed['content'] ?? '';

        if ($piece['_type'] !== '_text') {
            $parsed['content'] = $collected.'<'.$piece['_type'].'>';

            $childs = $this->handleChildrens($piece, $parsed);
            if ($childs && isset($piece['content'])) {
                $parsed['content'] .= $childs['content'];
            }
        } else {
            $parsed['content'] = $collected.$piece['content']['_content'];
        }

        // After the last piece, balance whatever markup was collected.
        if ($contentChilds == $index + 1) {
            $parsed['content'] = $this->closetags($parsed['content']);
        }

        $parsed['children'] = [];
    }

    return $parsed;
}
/**
 * Post-process a list of rendered items: drop bare "<ol>" entries, unwrap
 * content-less wrappers and attach following items to numbered ones.
 *
 * For every entry that is not skipped:
 *  - an entry with empty-string content and exactly one child is replaced by
 *    that child;
 *  - while the entry has a "numbering_row", following entries (up to the first
 *    one without "content" or whose content contains "h1") are handed to
 *    handlePossibleChild();
 *  - an array entry with more than one element and no "content" key is merged
 *    into the result, anything else is appended as one element.
 *
 * NOTE(review): $handled is a plain list of indexes, yet it is queried with
 * array_key_exists(), i.e. by list position rather than by stored value —
 * the check therefore depends on how many indexes were recorded so far.
 * Behaviour is kept as-is; confirm whether in_array() was intended.
 *
 * @param array $data List of rendered items.
 *
 * @return array Restructured list.
 */
private function fixChildrenStructure($data)
{
    $result = [];
    $handled = [];

    for ($current = 0; $current < count($data); $current++) {
        $isBareOrderedList = isset($data[$current]['content']) && $data[$current]['content'] == '<ol>';
        if ($isBareOrderedList) {
            $handled[] = $current;
            continue;
        }

        if (array_key_exists($current, $handled)) {
            continue;
        }

        // Unwrap an empty wrapper around a single child.
        $isEmptyWrapper = isset($data[$current]['content'])
            && $data[$current]['content'] === ''
            && count($data[$current]['children']) == 1;
        if ($isEmptyWrapper) {
            $data[$current] = last($data[$current]['children']);
        }

        for ($next = $current + 1; $next < count($data); $next++) {
            if (array_key_exists($current, $handled)) {
                continue;
            }

            // Stop at entries without content and at headings.
            if (! isset($data[$next]['content']) || strpos($data[$next]['content'], 'h1') !== false) {
                break;
            }

            // Only numbered entries take over what follows them.
            if (! isset($data[$current]['numbering_row'])) {
                break;
            }

            $data[$current] = $this->handlePossibleChild($data[$current], $data[$next]);
            $handled[] = $next;
        }

        $isPlainList = is_array($data[$current])
            && count($data[$current]) > 1
            && ! isset($data[$current]['content']);
        if ($isPlainList) {
            $result = array_merge($result, $data[$current]);
        } else {
            $result[] = $data[$current];
        }

        $handled[] = $current;
    }

    return $result;
}
/**
 * Attach a following item ($child) to a numbered item ($parent) where it
 * belongs to it.
 *
 * Steps, in order:
 *  1. A parent without content is replaced by its only child, or — when it has
 *     several children — by fixChildrenStructure() of those children.
 *  2. When the child is a bare "<ol>", its children are appended to the
 *     parent's children; nested "<ol>" children are first combined with the
 *     parent's last child.
 *  3. When the parent's clean text ends in a lowercase letter and the child
 *     has clean text, the child is treated as a continuation: its content and
 *     clean text are appended (separated by a space) and its children merged.
 *  4. A one-element parent array without "content" is unwrapped.
 *
 * A child with empty content needs no special treatment: none of the steps
 * above attaches it, so the parent is returned as it stands after step 1/4.
 *
 * @param mixed $parent Rendered item that may take over the child.
 * @param array $child  Rendered item that follows the parent.
 *
 * @return mixed The (possibly restructured) parent.
 */
private function handlePossibleChild($parent, $child = [])
{
    if (isset($parent['children']) && empty($parent['content'])) {
        $childCount = count($parent['children']);
        if ($childCount === 1) {
            $parent = $parent['children'][0];
        } elseif ($childCount > 1) {
            $parent = $this->fixChildrenStructure($parent['children']);
        }
    }

    if (isset($child['content']) && $child['content'] == '<ol>') {
        foreach ($child['children'] ?? [] as $nestedItem) {
            $newChild = $nestedItem;

            // A nested list continues the parent's most recent child.
            if (isset($nestedItem['content']) && $nestedItem['content'] == '<ol>') {
                $lastParentChild = last($parent['children']);
                $newChild = $this->handlePossibleChild($lastParentChild, $nestedItem);
            }

            $parent['children'][] = $newChild;
        }
    }

    if ($this->isContinuation($parent, $child)) {
        $parent['content'] .= ' '.$child['content'];
        $parent['children'] = array_merge($parent['children'] ?? [], $child['children'] ?? []);
        $parent['clean_content'] .= ' '.$child['clean_content'];
    }

    if (is_array($parent) && count($parent) == 1 && ! isset($parent['content'])) {
        $parent = array_shift($parent);
    }

    return $parent;
}

/**
 * Tell whether $child continues the sentence started in $parent: the parent's
 * clean text must end in a lowercase letter (no punctuation, quote or digit)
 * and the child must have clean text of its own.
 */
private function isContinuation($parent, $child)
{
    if (! isset($parent['clean_content']) || ! strlen($parent['clean_content'])) {
        return false;
    }

    $lastCharacter = substr($parent['clean_content'], -1);

    return strpbrk($lastCharacter, '.,;\'"0123456789') === false
        && ctype_lower($lastCharacter)
        && isset($child['clean_content'])
        && strlen($child['clean_content']) > 0;
}
}
|