loadHTML($htmlString);
            $htmlDom->preserveWhiteSpace = false;

            return $this->parseLoadedHtml($htmlDom);
        } catch (\Exception $exception) {
            dd($exception);
        }
    }

    /**
     * Builds the parsed response from the <body> of an already loaded document.
     *
     * Every top-level entry produced by buildTheParsedResponse(), except those
     * typed 'table', is passed through handleChildrens(). Entries that come back
     * with a 'content' key get that content run through closetags() and gain a
     * 'clean_content' key (tags stripped, runs of tabs/newlines collapsed to one
     * space). The collected entries are then handed to fixChildrenStructure().
     *
     * @param mixed $htmlDom Document to read; must expose getElementsByTagName()
     *                       (presumably a \DOMDocument - confirm against the caller).
     *
     * @return mixed Whatever fixChildrenStructure() returns for the collected entries.
     */
    private function parseLoadedHtml($htmlDom)
    {
        $response = [];
        $page = $htmlDom->getElementsByTagName("body")[ 0 ];

        // Without a <body> there is nothing to walk; treat it like an empty
        // document instead of calling methods on null inside domToArray().
        $dataStructuredArray = $page === null
            ? []
            : $this->buildTheParsedResponse($this->domToArray($page));

        foreach ($dataStructuredArray as $item) {
            if (! isset($item[ '_type' ]) || $item[ '_type' ] === 'table') {
                continue;
            }

            $data = $this->handleChildrens($item);
            if (! isset($data[ 'content' ])) {
                continue;
            }

            $data[ 'content' ] = $this->closetags($data[ 'content' ]);
            $data[ 'clean_content' ] = preg_replace("/(\r\n|\t|\r|\n)+/", " ", strip_tags($data[ 'content' ]));
            $response[] = $data;
        }

        return $this->fixChildrenStructure($response);
    }

    /**
     * Recursively converts a DOM node into a nested array.
     *
     * Element nodes yield '_type' (the node name), '_numberOfChildren' (raw child
     * node count, including children that are later discarded) and, when at least
     * one child is kept, '_children'. An <ol> additionally carries '_startFrom'
     * (its "start" attribute, or 1 when absent). Text and CDATA nodes yield
     * '_type' => '_text' and '_content' (tabs/newlines collapsed to a space, left
     * trimmed). Attributes, when present, are listed under '_attributes'.
     *
     * @param mixed $root Node to convert; its nodeType, nodeName, nodeValue,
     *                    childNodes and attributes members are read.
     *
     * @return array Empty array for nodes that carry nothing worth keeping
     *               (whitespace-only text, other node types without attributes).
     */
    private function domToArray($root)
    {
        $result = [];

        if ($root->nodeType === XML_ELEMENT_NODE) {
            // Classic element node.
            $result[ '_type' ] = $root->nodeName;

            if ($root->nodeName === 'ol') {
                $result[ '_startFrom' ] = $root->hasAttribute('start') ? $root->getAttribute('start') : 1;
            }

            $result[ '_numberOfChildren' ] = $root->childNodes->length;

            foreach ($root->childNodes as $childNode) {
                $child = $this->domToArray($childNode);

                // Don't keep text nodes made only of spaces and newlines.
                if (! empty($child)) {
                    $result[ '_children' ][] = $child;
                }
            }
        } elseif ($root->nodeType === XML_TEXT_NODE || $root->nodeType === XML_CDATA_SECTION_NODE) {
            // Text node. Explicit '' comparisons are used instead of empty():
            // empty("0") is true, which would drop a node whose whole text is "0".
            $value = (string) $root->nodeValue;

            if ($value !== '') {
                $cleanText = (string) preg_replace("/(\r\n|\t|\r|\n)+/", " ", $value);

                if (str_replace(' ', '', $cleanText) !== '') {
                    $result[ '_type' ] = '_text';
                    $result[ '_content' ] = ltrim($cleanText);
                }
            }
        }

        // List attributes.
        if ($root->hasAttributes()) {
            foreach ($root->attributes as $attribute) {
                $result[ '_attributes' ][ $attribute->name ] = $attribute->value;
            }
        }

        return $result;
    }

    /**
     * Converts the children of a domToArray() element into response entries.
     *
     * Text children become ['_type' => '_text', 'content' => parseParagraph(child)].
     * Children that themselves have '_children' are processed recursively; when
     * that yields something, <ul>/<ol> store it under 'children' (plus 'start'
     * when '_startFrom' is set) and every other element stores it under 'content'.
     * Children that produce no data (for example elements without kept children)
     * are left out. '_attributes' is copied onto every entry that is kept.
     *
     * @param array $htmElementsAsArray Element array as produced by domToArray().
     *
     * @return array List of entries; empty when the element has no '_children'.
     */
    private function buildTheParsedResponse(array $htmElementsAsArray): array
    {
        $parsedResponse = [];

        // '_children' is only set when at least one child was kept, so default
        // to an empty list rather than iterating an undefined key.
        foreach ($htmElementsAsArray[ '_children' ] ?? [] as $elementArray) {
            $type = $elementArray[ '_type' ] ?? null;
            $data = [];

            if ($type === '_text') {
                $data[ '_type' ] = $type;
                $data[ 'content' ] = $this->parseParagraph($elementArray);
            } elseif (isset($elementArray[ '_children' ])) {
                $parsedResponseData = $this->buildTheParsedResponse($elementArray);

                if (! empty($parsedResponseData)) {
                    $data[ '_type' ] = $type;

                    if (in_array($type, ['ul', 'ol'], true)) {
                        if (isset($elementArray[ '_startFrom' ])) {
                            $data[ 'start' ] = $elementArray[ '_startFrom' ];
                        }
                        $data[ 'children' ] = $parsedResponseData;
                    } else {
                        $data[ 'content' ] = $parsedResponseData;
                    }
                }
            }

            if (empty($data)) {
                continue;
            }

            if (isset($elementArray[ '_attributes' ])) {
                $data[ '_attributes' ] = $elementArray[ '_attributes' ];
            }

            $parsedResponse[] = $data;
        }

        return $parsedResponse;
    }

    /**
     * Replaces HTML elements that contain only whitespace, or only other such
     * elements, using a recursive pattern (written by Junaid Atari).
     *
     * @param mixed $str   Markup to clean. Non-string or blank values are
     *                     returned unchanged.
     * @param mixed $repto Replacement for each match; anything that is not a
     *                     string means "replace with nothing".
     *
     * @return mixed The cleaned string, or $str untouched when it is not a
     *               non-blank string or when the regex engine reports an error.
     */
    private function remove_empty_tags_recursive($str, $repto = null)
    {
        //** Return if string not given or empty.
        if (! is_string($str) || trim($str) === '') {
            return $str;
        }

        //** Recursive empty HTML tags.
        $cleaned = preg_replace(
            //** Pattern written by Junaid Atari.
            '/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU',
            //** Replace with nothing if no replacement string was given.
            is_string($repto) ? $repto : '',
            //** Source string
            $str
        );

        // preg_replace() yields null on a PCRE error (e.g. backtrack or recursion
        // limit); keep the original markup rather than losing it.
        return $cleaned === null ? $str : $cleaned;
    }

    private function closetags($text)
    {
        $tagstack = [];
        $stacksize = 0;
        $tagqueue = '';
        $newtext = '';
        // Known single-entity/self-closing tags.
$single_tags = [ 'area', 'base', 'basefont', 'br', 'col', 'command', 'embed', 'frame', 'hr', 'img', 'input', 'isindex', 'link', 'meta', 'param', 'source' ]; // Tags that can be immediately nested within themselves. $nestable_tags = ['blockquote', 'div', 'object', 'q', 'span']; // WP bug fix for comments - in case you REALLY meant to type '< !--'. $text = str_replace('< !--', '< !--', $text); // WP bug fix for LOVE <3 (and other situations with '<' before a number). $text = preg_replace('#<([0-9]{1})#', '<$1', $text); /** * Matches supported tags. * * To get the pattern as a string without the comments paste into a PHP * REPL like `php -a`. * * @see https://html.spec.whatwg.org/#elements-2 * @see https://w3c.github.io/webcomponents/spec/custom/#valid-custom-element-name * * @example * ~# php -a * php > $s = [paste copied contents of expression below including parentheses]; * php > echo $s; */ $tag_pattern = ('#<'. // Start with an opening bracket. '(/?)'. // Group 1 - If it's a closing tag it'll have a leading slash. '('. // Group 2 - Tag name. // Custom element tags have more lenient rules than HTML tag names. '(?:[a-z](?:[a-z0-9._]*)-(?:[a-z0-9._-]+)+)'.'|'.// Traditional tag rules approximate HTML tag names. '(?:[\w:]+)'.')'.'(?:'.// We either immediately close the tag with its '>' and have nothing here. '\s*'.'(/?)'. // Group 3 - "attributes" for empty tag. '|'.// Or we must start with space characters to separate the tag name from the attributes (or whitespace). '(\s+)'. // Group 4 - Pre-attribute whitespace. '([^>]*)'. // Group 5 - Attributes. ')'.'>#' // End with a closing bracket. ); while (preg_match($tag_pattern, $text, $regex)) { $full_match = $regex[ 0 ]; $has_leading_slash = ! empty($regex[ 1 ]); $tag_name = $regex[ 2 ]; $tag = strtolower($tag_name); $is_single_tag = in_array($tag, $single_tags, true); $pre_attribute_ws = isset($regex[ 4 ]) ? $regex[ 4 ] : ''; $attributes = trim(isset($regex[ 5 ]) ? 
$regex[ 5 ] : $regex[ 3 ]); $has_self_closer = '/' === substr($attributes, -1); $newtext .= $tagqueue; $i = strpos($text, $full_match); $l = strlen($full_match); // Clear the shifter. $tagqueue = ''; if ($has_leading_slash) { // End tag. // If too many closing tags. if ($stacksize <= 0) { $tag = ''; // Or close to be safe $tag = '/' . $tag. // If stacktop value = tag close value, then pop. } elseif ($tagstack[ $stacksize - 1 ] === $tag) { // Found closing tag. $tag = ''; // Close tag. array_pop($tagstack); $stacksize--; } else { // Closing tag not at top, search for it. for ($j = $stacksize - 1; $j >= 0; $j--) { if ($tagstack[ $j ] === $tag) { // Add tag to tagqueue. for ($k = $stacksize - 1; $k >= $j; $k--) { $tagqueue .= ''; $stacksize--; } break; } } $tag = ''; } } else { // Begin tag. if ($has_self_closer) { // If it presents itself as a self-closing tag... // ...but it isn't a known single-entity self-closing tag, then don't let it be treated as such // and immediately close it with a closing tag (the tag will encapsulate no text as a result). if (! $is_single_tag) { $attributes = trim(substr($attributes, 0, -1))."> 0 && ! in_array($tag, $nestable_tags, true) && $tagstack[ $stacksize - 1 ] === $tag) { $tagqueue = ''; $stacksize--; } $stacksize = array_push($tagstack, $tag); } // Attributes. if ($has_self_closer && $is_single_tag) { // We need some space - avoid
and prefer
. $pre_attribute_ws = ' '; } $tag = '<'.$tag.$pre_attribute_ws.$attributes.'>'; // If already queuing a close tag, then put this tag on too. if (! empty($tagqueue)) { $tagqueue .= $tag; $tag = ''; } } $newtext .= substr($text, 0, $i).$tag; $text = substr($text, $i + $l); } // Clear tag queue. $newtext .= $tagqueue; // Add remaining text. $newtext .= $text; while ($x = array_pop($tagstack)) { $newtext .= ''; // Add remaining tags to close. } // WP fix for the bug with HTML comments. $newtext = str_replace('< !--', '