You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
527 lines
19 KiB
527 lines
19 KiB
<?php
|
|
|
|
namespace App\Parser\HtmlParser;
|
|
|
|
use DOMDocument;
|
|
use Illuminate\Support\Facades\Log;
|
|
|
|
/**
 * Converts an HTML document into a flat list of content rows.
 *
 * Pipeline: DOM -> nested array (domToArray) -> simplified tree
 * (buildTheParsedResponse) -> rows with 'content', 'clean_content' and
 * 'children' (handleChildrens) -> post-processing of numbered rows
 * (fixChildrenStructure / handlePossibleChild).
 *
 * Tables are skipped on purpose. Element attributes are collected but are not
 * re-emitted in the generated 'content' markup.
 */
class ParseHtml
{
    /**
     * Parse the HTML file at the given path.
     *
     * @param mixed $file Path (or string-castable file object) readable by file_get_contents().
     *
     * @return array List of parsed rows; empty when the file has no content or no <body>.
     *
     * @throws \RuntimeException When the file cannot be read.
     * @throws \Exception        Any parsing failure is logged and re-thrown.
     */
    public function fromUploadedFile($file)
    {
        // Collect libxml warnings internally instead of emitting PHP warnings for
        // sloppy markup; the previous mode is restored in the finally block.
        $previousErrorMode = libxml_use_internal_errors(true);

        try {
            Log::info('Parse html from file:'.$file);

            $htmlString = file_get_contents($file);
            if ($htmlString === false) {
                throw new \RuntimeException('Unable to read html file: '.$file);
            }

            // DOMDocument::loadHTML() rejects an empty source, and a blank
            // document has nothing to parse anyway.
            if (trim($htmlString) === '') {
                return [];
            }

            $htmlDom = new DOMDocument();
            $htmlDom->loadHTML($htmlString);
            $htmlDom->preserveWhiteSpace = false;

            return $this->parseLoadedHtml($htmlDom);
        } catch (\Exception $exception) {
            // Report and propagate instead of dumping internals and killing the request.
            Log::error('Failed to parse html from file:'.$file.' - '.$exception->getMessage());

            throw $exception;
        } finally {
            // Free the buffered parser errors and leave global libxml state as we found it.
            libxml_clear_errors();
            libxml_use_internal_errors($previousErrorMode);
        }
    }

    /**
     * Turn a loaded DOM document into the final list of rows.
     *
     * @param \DOMDocument $htmlDom
     *
     * @return array
     */
    private function parseLoadedHtml($htmlDom)
    {
        $page = $htmlDom->getElementsByTagName('body')->item(0);
        if ($page === null) {
            return [];
        }

        $response = [];
        foreach ($this->buildTheParsedResponse($this->domToArray($page)) as $item) {
            // Tables are intentionally left out of the result.
            if (! isset($item['_type']) || $item['_type'] === 'table') {
                continue;
            }

            $data = $this->handleChildrens($item);
            if (! isset($data['content'])) {
                continue;
            }

            $data['content'] = $this->closetags($data['content']);
            $data['clean_content'] = $this->toCleanContent($data['content']);
            $response[] = $data;
        }

        return $this->fixChildrenStructure($response);
    }

    /**
     * Recursively convert a DOM node into a nested array.
     *
     * Elements yield '_type', '_numberOfChildren', optional '_startFrom' (for
     * <ol>), '_children' and '_attributes'. Text/CDATA nodes yield '_type' =>
     * '_text' and '_content'. Any other node, and text made only of spaces,
     * tabs and line breaks, yields an empty array.
     *
     * @param \DOMNode $root
     *
     * @return array
     */
    private function domToArray($root)
    {
        $result = [];

        if ($root->nodeType == XML_ELEMENT_NODE) {
            $result['_type'] = $root->nodeName;

            if ($root->nodeName === 'ol') {
                $result['_startFrom'] = $root->hasAttribute('start') ? $root->getAttribute('start') : 1;
            }

            $result['_numberOfChildren'] = $root->childNodes->length;

            foreach ($root->childNodes as $childNode) {
                $child = $this->domToArray($childNode);

                // Whitespace-only text nodes and unsupported node types come back empty.
                if ($child !== []) {
                    $result['_children'][] = $child;
                }
            }
        } elseif ($root->nodeType == XML_TEXT_NODE || $root->nodeType == XML_CDATA_SECTION_NODE) {
            $cleanText = $this->collapseLineBreaks((string) $root->nodeValue);

            // Compare against '' explicitly: empty() would also discard the text "0".
            if (str_replace(' ', '', $cleanText) !== '') {
                $result['_type'] = '_text';
                $result['_content'] = ltrim($cleanText);
            }
        }

        if ($root->hasAttributes()) {
            foreach ($root->attributes as $attribute) {
                $result['_attributes'][$attribute->name] = $attribute->value;
            }
        }

        return $result;
    }

    /**
     * Simplify the domToArray() tree.
     *
     * Text nodes become ['_type' => '_text', 'content' => ['_content' => ...]].
     * <ul>/<ol> keep their parsed descendants under 'children' (plus 'start'
     * when known); every other element keeps them under 'content'. Elements
     * that end up with no parsed descendants are dropped.
     *
     * @param array $htmElementsAsArray Node array produced by domToArray().
     *
     * @return array
     */
    private function buildTheParsedResponse(array $htmElementsAsArray): array
    {
        $parsedResponse = [];

        // '_children' is absent when the node has no kept children.
        foreach ($htmElementsAsArray['_children'] ?? [] as $elementArray) {
            $data = [];

            if ($elementArray['_type'] === '_text') {
                $data['_type'] = $elementArray['_type'];
                $data['content'] = $this->parseParagraph($elementArray);
            } elseif (isset($elementArray['_children'])) {
                $parsedResponseData = $this->buildTheParsedResponse($elementArray);

                if (! empty($parsedResponseData)) {
                    $data['_type'] = $elementArray['_type'];

                    if (in_array($elementArray['_type'], ['ul', 'ol'], true)) {
                        if (isset($elementArray['_startFrom'])) {
                            $data['start'] = $elementArray['_startFrom'];
                        }
                        $data['children'] = $parsedResponseData;
                    } else {
                        $data['content'] = $parsedResponseData;
                    }
                }
            }

            if (empty($data)) {
                continue;
            }

            if (isset($elementArray['_attributes'])) {
                $data['_attributes'] = $elementArray['_attributes'];
            }

            $parsedResponse[] = $data;
        }

        return $parsedResponse;
    }

    /**
     * Remove tag pairs that wrap nothing but whitespace (or other empty pairs).
     *
     * @param mixed       $str   Markup to clean; non-strings and blank strings are returned as-is.
     * @param string|null $repto Replacement for each removed pair (defaults to '').
     *
     * @return mixed
     */
    private function remove_empty_tags_recursive($str, $repto = null)
    {
        if (! is_string($str) || trim($str) == '') {
            return $str;
        }

        $cleaned = preg_replace(
            // Pattern written by Junaid Atari.
            '/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU',
            is_string($repto) ? $repto : '',
            $str
        );

        // preg_replace() yields null on a PCRE error (e.g. a backtrack or
        // recursion limit); keep the input rather than losing the content.
        return $cleaned === null ? $str : $cleaned;
    }

    /**
     * Balance the tags of an HTML fragment, then strip empty tag pairs.
     *
     * The algorithm follows WordPress' force_balance_tags(): unclosed tags are
     * closed, stray closing tags are dropped, void elements are written as
     * "<br />" and a tag opened directly inside an identical non-nestable tag
     * closes the previous one first.
     *
     * @param string $text
     *
     * @return string
     */
    private function closetags($text)
    {
        $tagstack = [];
        $stacksize = 0;
        $tagqueue = '';
        $newtext = '';

        // Known single-entity/self-closing tags.
        $single_tags = [
            'area',
            'base',
            'basefont',
            'br',
            'col',
            'command',
            'embed',
            'frame',
            'hr',
            'img',
            'input',
            'isindex',
            'link',
            'meta',
            'param',
            'source',
        ];

        // Tags that can be immediately nested within themselves.
        $nestable_tags = ['blockquote', 'div', 'object', 'q', 'span'];

        // A literal "< !--" typed by the author is parked behind a wider
        // placeholder so the comment fix at the end does not turn it into a real
        // comment opener. Built with str_repeat() so the spacing cannot be lost.
        $commentPlaceholder = '<'.str_repeat(' ', 4).'!--';
        $text = str_replace('< !--', $commentPlaceholder, $text);

        // Escape '<' directly followed by a digit (e.g. "<3") so it is not
        // mistaken for the start of a tag.
        $text = preg_replace('#<([0-9]{1})#', '&lt;$1', $text);

        $tag_pattern = ('#<'. // Start with an opening bracket.
            '(/?)'. // Group 1 - If it's a closing tag it'll have a leading slash.
            '('. // Group 2 - Tag name.
            // Custom element tags have more lenient rules than HTML tag names.
            '(?:[a-z](?:[a-z0-9._]*)-(?:[a-z0-9._-]+)+)'.
            '|'.
            // Traditional tag rules approximate HTML tag names.
            '(?:[\w:]+)'.
            ')'.
            '(?:'.
            // We either immediately close the tag with its '>' and have nothing here.
            '\s*'.
            '(/?)'. // Group 3 - "attributes" for empty tag.
            '|'.
            // Or we must start with space characters to separate the tag name from the attributes (or whitespace).
            '(\s+)'. // Group 4 - Pre-attribute whitespace.
            '([^>]*)'. // Group 5 - Attributes.
            ')'.
            '>#' // End with a closing bracket.
        );

        while (preg_match($tag_pattern, $text, $regex)) {
            $full_match = $regex[0];
            $has_leading_slash = ! empty($regex[1]);
            $tag_name = $regex[2];
            $tag = strtolower($tag_name);
            $is_single_tag = in_array($tag, $single_tags, true);
            $pre_attribute_ws = isset($regex[4]) ? $regex[4] : '';
            $attributes = trim(isset($regex[5]) ? $regex[5] : $regex[3]);
            $has_self_closer = '/' === substr($attributes, -1);

            // Flush whatever the previous iteration queued.
            $newtext .= $tagqueue;

            $i = strpos($text, $full_match);
            $l = strlen($full_match);

            // Clear the shifter.
            $tagqueue = '';

            if ($has_leading_slash) { // End tag.
                if ($stacksize <= 0) {
                    // Too many closing tags: drop this one.
                    $tag = '';
                } elseif ($tagstack[$stacksize - 1] === $tag) {
                    // Matches the innermost open tag: close it.
                    $tag = '</'.$tag.'>';
                    array_pop($tagstack);
                    $stacksize--;
                } else {
                    // Closing tag not at top, search for it and close everything above it.
                    for ($j = $stacksize - 1; $j >= 0; $j--) {
                        if ($tagstack[$j] === $tag) {
                            for ($k = $stacksize - 1; $k >= $j; $k--) {
                                $tagqueue .= '</'.array_pop($tagstack).'>';
                                $stacksize--;
                            }
                            break;
                        }
                    }
                    $tag = '';
                }
            } else { // Begin tag.
                if ($has_self_closer) {
                    // It presents itself as self-closing but is not a known
                    // single-entity tag: close it immediately instead (the tag
                    // encapsulates no text as a result).
                    if (! $is_single_tag) {
                        $attributes = trim(substr($attributes, 0, -1))."></$tag";
                    }
                } elseif ($is_single_tag) {
                    // Known single-entity tag that does not close itself: do so.
                    $pre_attribute_ws = ' ';
                    $attributes .= '/';
                } else {
                    // If the top of the stack is the same as the tag we want to
                    // push (and it cannot nest in itself), close the previous tag.
                    if ($stacksize > 0
                        && ! in_array($tag, $nestable_tags, true)
                        && $tagstack[$stacksize - 1] === $tag
                    ) {
                        $tagqueue = '</'.array_pop($tagstack).'>';
                        $stacksize--;
                    }
                    $stacksize = array_push($tagstack, $tag);
                }

                if ($has_self_closer && $is_single_tag) {
                    // We need some space - avoid <br/> and prefer <br />.
                    $pre_attribute_ws = ' ';
                }

                $tag = '<'.$tag.$pre_attribute_ws.$attributes.'>';

                // If already queuing a close tag, then put this tag on too.
                if (! empty($tagqueue)) {
                    $tagqueue .= $tag;
                    $tag = '';
                }
            }

            $newtext .= substr($text, 0, $i).$tag;
            $text = substr($text, $i + $l);
        }

        // Clear tag queue.
        $newtext .= $tagqueue;

        // Add remaining text.
        $newtext .= $text;

        // Close every tag that is still open, innermost first.
        while (($x = array_pop($tagstack)) !== null) {
            $newtext .= '</'.$x.'>';
        }

        // Repair comment openers that were split as "< !--", then restore the
        // literal ones that were parked at the top of this method.
        $newtext = str_replace('< !--', '<!--', $newtext);
        $newtext = str_replace($commentPlaceholder, '< !--', $newtext);

        return $this->remove_empty_tags_recursive($newtext);
    }

    /**
     * Wrap the text of a text node as ['_content' => ...].
     *
     * @param array      $elementArray Text node array carrying '_content'.
     * @param array|null $type         Optional markup pieces to prepend; when given,
     *                                 the combined string is passed through closetags().
     * @param mixed      $number       Unused.
     *
     * @return array
     */
    private function parseParagraph($elementArray, $type = null, $number = null)
    {
        $content = $elementArray['_content'];

        if ($type) {
            $content = $this->closetags(implode('', $type).$content);
        }

        return ['_content' => $content];
    }

    /**
     * Build the output row for one item of the simplified tree.
     *
     * - <ol>/<ul>: one child row per element found inside each list item,
     *   numbered through 'numbering_row' once a start value is known.
     * - <div>: one child row per direct child.
     * - anything else: the descendants are flattened into a single markup string.
     *
     * @param array $data   Item produced by buildTheParsedResponse().
     * @param array $parsed Row being built (supplied on recursion).
     *
     * @return array|null Null for tables, which are not supported.
     */
    private function handleChildrens($data, $parsed = [])
    {
        $type = $data['_type'];

        if ($type === 'table') {
            return null;
        }

        // A bare text node (text sitting directly in <body>, <div> or <li>) has
        // no element children to walk: its text is the whole row.
        if ($type === '_text') {
            $parsed['content'] = $data['content']['_content'] ?? '';
            $parsed['children'] = [];

            return $parsed;
        }

        $parsed['content'] = '<'.$type.'>';

        if (in_array($type, ['ol', 'ul'], true)) {
            $parsed['children'] = [];

            if (isset($data['start'])) {
                $startFrom = $data['start'];
            }

            foreach ($data['children'] ?? [] as $child) {
                // A nested list carrying its own start value restarts the numbering.
                if (isset($child['start'])) {
                    $startFrom = $child['start'];
                }

                if (! isset($child['content'])) {
                    // No flattened content (e.g. a nested list): keep it as one row.
                    $row = $this->handleChildrens($child) ?? [];
                    $row['clean_content'] = $this->toCleanContent($row['content'] ?? '');
                    $parsed['children'][] = $row;

                    continue;
                }

                foreach ($child['content'] as $li) {
                    $row = $this->handleChildrens($li);
                    if (! isset($row['content'])) {
                        continue;
                    }

                    $row['clean_content'] = $this->toCleanContent($row['content']);

                    // Only rows with visible text consume a number.
                    if (isset($startFrom) && strlen(trim($row['clean_content'])) > 0) {
                        $row['numbering_row'] = $startFrom;
                        $startFrom++;
                    }

                    $parsed['children'][] = $row;
                }
            }

            return $parsed;
        }

        if ($type === 'div') {
            foreach ($data['content'] as $child) {
                $row = $this->handleChildrens($child);

                if (isset($row['content'])) {
                    $row['clean_content'] = $this->toCleanContent($row['content']);
                    $row['content'] = $this->closetags($row['content']);
                }

                $parsed['children'][] = $row;
            }

            return $parsed;
        }

        $contentChilds = count($data['content']);

        foreach ($data['content'] as $index => $child) {
            if ($child['_type'] !== '_text') {
                // The child's opening tag is added here and again by the recursive
                // call. closetags() later closes the duplicated opener and the
                // resulting empty pair is stripped (tags listed as nestable in
                // closetags() stay doubly wrapped).
                $parsed['content'] .= '<'.$child['_type'].'>';

                $childs = $this->handleChildrens($child, $parsed);
                if ($childs && isset($child['content'])) {
                    $parsed['content'] .= $childs['content'];
                }
            } else {
                $parsed['content'] .= $child['content']['_content'];
            }

            // Balance the markup once the last child has been appended.
            if ($contentChilds == $index + 1) {
                $parsed['content'] = $this->closetags($parsed['content']);
            }

            $parsed['children'] = [];
        }

        return $parsed;
    }

    /**
     * Post-process a list of rows.
     *
     * Rows whose content is exactly '<ol>' are dropped, an empty wrapper with a
     * single child is replaced by that child, and a numbered row is offered the
     * row that follows it through handlePossibleChild().
     *
     * NOTE(review): array_key_exists() tests list positions of
     * $alreadyHandledIndexes, not the indexes stored in it, so both checks below
     * are true as soon as more than $i entries have been recorded. That observable
     * behaviour is kept unchanged here; confirm whether a lookup by value
     * (and by $j in the inner loop) was intended.
     *
     * @param array $data Zero-based list of rows.
     *
     * @return array
     */
    private function fixChildrenStructure($data)
    {
        $result = [];
        $alreadyHandledIndexes = [];
        $total = count($data);

        for ($i = 0; $i < $total; $i++) {
            if (isset($data[$i]['content']) && $data[$i]['content'] === '<ol>') {
                $alreadyHandledIndexes[] = $i;
                continue;
            }

            if (array_key_exists($i, $alreadyHandledIndexes)) {
                continue;
            }

            // An empty wrapper around exactly one child collapses to that child.
            if (isset($data[$i]['content']) && $data[$i]['content'] === '') {
                $wrapped = $data[$i]['children'] ?? [];
                if (count($wrapped) === 1) {
                    $data[$i] = end($wrapped);
                }
            }

            for ($j = $i + 1; $j < $total; $j++) {
                if (array_key_exists($i, $alreadyHandledIndexes)) {
                    continue;
                }

                // Stop at rows without content and at headings.
                if (! isset($data[$j]['content']) || strpos($data[$j]['content'], 'h1') !== false) {
                    break;
                }

                if (! isset($data[$i]['numbering_row'])) {
                    break;
                }

                $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                $alreadyHandledIndexes[] = $j;
            }

            if (is_array($data[$i]) && count($data[$i]) > 1 && ! isset($data[$i]['content'])) {
                // A content-less array is treated as a list of rows and spliced in.
                $result = array_merge($result, $data[$i]);
            } else {
                $result[] = $data[$i];
            }

            $alreadyHandledIndexes[] = $i;
        }

        return $result;
    }

    /**
     * Try to attach $child (the row following a numbered row) to $parent.
     *
     * A '<ol>' child contributes its rows to the parent's children; a child with
     * text is appended to a parent whose text ends in a lowercase letter.
     *
     * @param mixed $parent Row to extend.
     * @param array $child  Candidate row.
     *
     * @return mixed The possibly modified parent.
     *
     * @throws \UnexpectedValueException When the child has an empty 'content'; this
     *                                   state was never supported (it used to halt
     *                                   the request through a debug dump).
     */
    private function handlePossibleChild($parent, $child = [])
    {
        $childContent = $child['content'] ?? null;

        if ($childContent === '') {
            throw new \UnexpectedValueException('Cannot attach a row with empty content to a numbered row.');
        }

        // A parent without own content stands for its children.
        if (isset($parent['children']) && empty($parent['content'])) {
            $childCount = count($parent['children']);

            if ($childCount === 1) {
                $parent = $parent['children'][0];
            } elseif ($childCount > 1) {
                $parent = $this->fixChildrenStructure($parent['children']);
            }
        }

        if ($childContent === '<ol>') {
            foreach ($child['children'] ?? [] as $grandChild) {
                $newChild = $grandChild;

                if (($grandChild['content'] ?? null) === '<ol>') {
                    // A nested list is resolved against the parent's most recent child.
                    $siblings = $parent['children'] ?? [];
                    $lastParentChild = end($siblings);
                    $newChild = $this->handlePossibleChild($lastParentChild, $grandChild);
                }

                // $parent is false when it came from an empty sibling list above.
                if (! is_array($parent)) {
                    $parent = [];
                }

                $parent['children'][] = $newChild;
            }
        }

        // Parent text ending in a lowercase letter reads as an unfinished
        // sentence, so the child's text is treated as its continuation. (A
        // lowercase letter can never be punctuation, a quote or a digit.)
        if (isset($parent['clean_content'], $child['clean_content'])
            && strlen($parent['clean_content']) > 0
            && strlen($child['clean_content']) > 0
            && ctype_lower(substr($parent['clean_content'], -1))
        ) {
            $parent['content'] .= ' '.($child['content'] ?? '');
            $parent['children'] = array_merge($parent['children'] ?? [], $child['children'] ?? []);
            $parent['clean_content'] .= ' '.$child['clean_content'];
        }

        // A single-entry array without content is unwrapped to that entry.
        if (is_array($parent) && count($parent) == 1 && ! isset($parent['content'])) {
            $parent = array_shift($parent);
        }

        return $parent;
    }

    /**
     * Replace every run of line breaks and tabs with a single space.
     *
     * @param string $text
     *
     * @return string
     */
    private function collapseLineBreaks($text)
    {
        return preg_replace("/(\r\n|\t|\r|\n)+/", " ", $text);
    }

    /**
     * Plain-text version of a markup string: tags stripped, line breaks and tabs collapsed.
     *
     * @param string|null $html
     *
     * @return string
     */
    private function toCleanContent($html)
    {
        return $this->collapseLineBreaks(strip_tags((string) $html));
    }
}
|