Repo for the search and displace ingest module that takes odf, docx and pdf and transforms it into .md to be used with search and displace operations
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

527 lines
19 KiB

<?php
namespace App\Parser\HtmlParser;
use DOMDocument;
use Illuminate\Support\Facades\Log;
class ParseHtml
{
public function fromUploadedFile($file)
{
try {
$htmlDom = new DomDocument();
Log::info('Parse html from file:'.$file);
$htmlString = file_get_contents($file);
libxml_use_internal_errors(true);
$htmlDom->loadHTML($htmlString);
$htmlDom->preserveWhiteSpace = false;
return $this->parseLoadedHtml($htmlDom);
} catch (\Exception $exception) {
dd($exception);
}
}
private function parseLoadedHtml($htmlDom)
{
$response = [];
$page = $htmlDom->getElementsByTagName("body")[ 0 ];
$dataStructuredArray = $this->buildTheParsedResponse($this->domToArray($page));
foreach ($dataStructuredArray as $index => $item) {
if (isset($item[ '_type' ]) && $item[ '_type' ] !== 'table') {
$data = $this->handleChildrens($item);
if (isset($data[ 'content' ])) {
$data[ 'content' ] = $this->closetags($data[ 'content' ]);
$data[ 'clean_content' ] = preg_replace("/(\r\n|\t|\r|\n)+/", " ", strip_tags($data[ 'content' ]));
$response[] = $data;
}
}
}
return $this->fixChildrenStructure($response);
}
private function domToArray($root)
{
$result = [];
//handle classic node
if ($root->nodeType == XML_ELEMENT_NODE) {
$result[ '_type' ] = $root->nodeName;
if ($root->nodeName === 'ol') {
if ($root->hasAttribute('start')) {
$result[ '_startFrom' ] = $root->getAttribute('start');
} else {
$result[ '_startFrom' ] = 1;
}
}
$result[ '_numberOfChildren' ] = $root->childNodes->length;
if ($root->hasChildNodes()) {
$children = $root->childNodes;
for ($i = 0; $i < $children->length; $i++) {
$child = $this->domToArray($children->item($i));
//don't keep textnode with only spaces and newline
if (! empty($child)) {
$result[ '_children' ][] = $child;
}
}
}
//handle text node
} elseif ($root->nodeType == XML_TEXT_NODE || $root->nodeType == XML_CDATA_SECTION_NODE) {
$value = $root->nodeValue;
if (! empty($value)) {
$cleanText = preg_replace("/(\r\n|\t|\r|\n)+/", " ", $value);
if (! empty(str_replace(' ', '', $cleanText))) {
$result[ '_type' ] = '_text';
$result[ '_content' ] = ltrim($cleanText);
}
}
}
//list attributes
if ($root->hasAttributes()) {
foreach ($root->attributes as $attribute) {
$result[ '_attributes' ][ $attribute->name ] = $attribute->value;
}
}
return $result;
}
private function buildTheParsedResponse(array $htmElementsAsArray): array
{
$parsedResponse = [];
foreach ($htmElementsAsArray[ '_children' ] as $index => $elementArray) {
$data = [];
if ($elementArray[ '_type' ] === '_text') {
$data[ '_type' ] = $elementArray[ '_type' ];
$data[ 'content' ] = $this->parseParagraph($elementArray);
} elseif (isset($elementArray[ '_children' ])) {
$parsedResponseData = $this->buildTheParsedResponse($elementArray);
if (! empty($parsedResponseData)) {
$data[ '_type' ] = $elementArray[ '_type' ];
if (in_array($elementArray[ '_type' ], ['ul', 'ol'])) {
if (isset($elementArray[ '_startFrom' ])) {
$data[ 'start' ] = $elementArray[ '_startFrom' ];
}
$data [ 'children' ] = $parsedResponseData;
} else {
$data [ 'content' ] = $parsedResponseData;
}
}
}
if (! empty($data)) {
if (isset($elementArray[ '_attributes' ])) {
$data[ '_attributes' ] = $elementArray[ '_attributes' ];
}
$parsedResponse[] = $data;
}
}
return $parsedResponse;
}
private function remove_empty_tags_recursive($str, $repto = null)
{
//** Return if string not given or empty.
if (! is_string($str) || trim($str) == '') {
return $str;
}
//** Recursive empty HTML tags.
return preg_replace(
//** Pattern written by Junaid Atari.
'/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU',
//** Replace with nothing if string empty.
! is_string($repto) ? '' : $repto,
//** Source string
$str);
}
private function closetags($text)
{
$tagstack = [];
$stacksize = 0;
$tagqueue = '';
$newtext = '';
// Known single-entity/self-closing tags.
$single_tags = [
'area',
'base',
'basefont',
'br',
'col',
'command',
'embed',
'frame',
'hr',
'img',
'input',
'isindex',
'link',
'meta',
'param',
'source'
];
// Tags that can be immediately nested within themselves.
$nestable_tags = ['blockquote', 'div', 'object', 'q', 'span'];
// WP bug fix for comments - in case you REALLY meant to type '< !--'.
$text = str_replace('< !--', '< !--', $text);
// WP bug fix for LOVE <3 (and other situations with '<' before a number).
$text = preg_replace('#<([0-9]{1})#', '&lt;$1', $text);
/**
* Matches supported tags.
*
* To get the pattern as a string without the comments paste into a PHP
* REPL like `php -a`.
*
* @see https://html.spec.whatwg.org/#elements-2
* @see https://w3c.github.io/webcomponents/spec/custom/#valid-custom-element-name
*
* @example
* ~# php -a
* php > $s = [paste copied contents of expression below including parentheses];
* php > echo $s;
*/
$tag_pattern = ('#<'. // Start with an opening bracket.
'(/?)'. // Group 1 - If it's a closing tag it'll have a leading slash.
'('. // Group 2 - Tag name.
// Custom element tags have more lenient rules than HTML tag names.
'(?:[a-z](?:[a-z0-9._]*)-(?:[a-z0-9._-]+)+)'.'|'.// Traditional tag rules approximate HTML tag names.
'(?:[\w:]+)'.')'.'(?:'.// We either immediately close the tag with its '>' and have nothing here.
'\s*'.'(/?)'. // Group 3 - "attributes" for empty tag.
'|'.// Or we must start with space characters to separate the tag name from the attributes (or whitespace).
'(\s+)'. // Group 4 - Pre-attribute whitespace.
'([^>]*)'. // Group 5 - Attributes.
')'.'>#' // End with a closing bracket.
);
while (preg_match($tag_pattern, $text, $regex)) {
$full_match = $regex[ 0 ];
$has_leading_slash = ! empty($regex[ 1 ]);
$tag_name = $regex[ 2 ];
$tag = strtolower($tag_name);
$is_single_tag = in_array($tag, $single_tags, true);
$pre_attribute_ws = isset($regex[ 4 ]) ? $regex[ 4 ] : '';
$attributes = trim(isset($regex[ 5 ]) ? $regex[ 5 ] : $regex[ 3 ]);
$has_self_closer = '/' === substr($attributes, -1);
$newtext .= $tagqueue;
$i = strpos($text, $full_match);
$l = strlen($full_match);
// Clear the shifter.
$tagqueue = '';
if ($has_leading_slash) { // End tag.
// If too many closing tags.
if ($stacksize <= 0) {
$tag = '';
// Or close to be safe $tag = '/' . $tag.
// If stacktop value = tag close value, then pop.
} elseif ($tagstack[ $stacksize - 1 ] === $tag) { // Found closing tag.
$tag = '</'.$tag.'>'; // Close tag.
array_pop($tagstack);
$stacksize--;
} else { // Closing tag not at top, search for it.
for ($j = $stacksize - 1; $j >= 0; $j--) {
if ($tagstack[ $j ] === $tag) {
// Add tag to tagqueue.
for ($k = $stacksize - 1; $k >= $j; $k--) {
$tagqueue .= '</'.array_pop($tagstack).'>';
$stacksize--;
}
break;
}
}
$tag = '';
}
} else { // Begin tag.
if ($has_self_closer) { // If it presents itself as a self-closing tag...
// ...but it isn't a known single-entity self-closing tag, then don't let it be treated as such
// and immediately close it with a closing tag (the tag will encapsulate no text as a result).
if (! $is_single_tag) {
$attributes = trim(substr($attributes, 0, -1))."></$tag";
}
} elseif ($is_single_tag) { // Else if it's a known single-entity tag but it doesn't close itself, do so.
$pre_attribute_ws = ' ';
$attributes .= '/';
} else { // It's not a single-entity tag.
// If the top of the stack is the same as the tag we want to push, close previous tag.
if ($stacksize > 0 && ! in_array($tag, $nestable_tags,
true) && $tagstack[ $stacksize - 1 ] === $tag) {
$tagqueue = '</'.array_pop($tagstack).'>';
$stacksize--;
}
$stacksize = array_push($tagstack, $tag);
}
// Attributes.
if ($has_self_closer && $is_single_tag) {
// We need some space - avoid <br/> and prefer <br />.
$pre_attribute_ws = ' ';
}
$tag = '<'.$tag.$pre_attribute_ws.$attributes.'>';
// If already queuing a close tag, then put this tag on too.
if (! empty($tagqueue)) {
$tagqueue .= $tag;
$tag = '';
}
}
$newtext .= substr($text, 0, $i).$tag;
$text = substr($text, $i + $l);
}
// Clear tag queue.
$newtext .= $tagqueue;
// Add remaining text.
$newtext .= $text;
while ($x = array_pop($tagstack)) {
$newtext .= '</'.$x.'>'; // Add remaining tags to close.
}
// WP fix for the bug with HTML comments.
$newtext = str_replace('< !--', '<!--', $newtext);
$newtext = str_replace('< !--', '< !--', $newtext);
return $this->remove_empty_tags_recursive($newtext);
}
private function parseParagraph($elementArray, $type = null, $number = null)
{
$data = [];
$data[ '_content' ] = ($type) ? $this->closetags(implode('',
$type).$elementArray[ '_content' ]) : $elementArray[ '_content' ];
return $data;
}
private function handleChildrens($data, $parsed = [])
{
if ($data[ '_type' ] !== 'table') {
$parsed[ 'content' ] = '<'.$data[ '_type' ].'>';
if (in_array($data[ '_type' ], ['ol', 'ul'])) {
$parsed[ 'children' ] = [];
if (isset($data[ 'start' ])) {
$startFrom = $data[ 'start' ];
}
foreach ($data[ 'children' ] as $child) {
if (isset($child[ 'start' ])) {
$startFrom = $child[ 'start' ];
}
if (isset($child[ 'content' ])) {
foreach ($child[ 'content' ] as $li) {
$data = $this->handleChildrens($li);
if (isset($data[ 'content' ])) {
$data[ 'clean_content' ] = preg_replace("/(\r\n|\t|\r|\n)+/", " ",
strip_tags($data[ 'content' ]));
if (isset($startFrom) && strlen(trim($data[ 'clean_content' ])) > 0) {
$data[ 'numbering_row' ] = $startFrom;
$startFrom++;
}
$parsed[ 'children' ][] = $data;
}
}
} else {
$data = $this->handleChildrens($child);
$data[ 'clean_content' ] = preg_replace("/(\r\n|\t|\r|\n)+/", " ",
strip_tags($data[ 'content' ]));
$parsed[ 'children' ][] = $data;
}
}
} elseif (isset($data[ '_type' ]) && ($data[ '_type' ] === 'div')) {
foreach ($data[ 'content' ] as $child) {
$data = $this->handleChildrens($child);
if (isset($data[ 'content' ])) {
$data[ 'clean_content' ] = preg_replace("/(\r\n|\t|\r|\n)+/", " ",
strip_tags($data[ 'content' ]));
$data[ 'content' ] = $this->closetags($data[ 'content' ]);
}
$parsed[ 'children' ][] = $data;
}
} else {
$contentChilds = count($data[ 'content' ]);
foreach ($data[ 'content' ] as $index => $child) {
if ($child[ '_type' ] !== '_text') {
if (! isset($parsed[ 'content' ])) {
$parsed[ 'content' ] = '<'.$child[ '_type' ].'>';
} else {
$parsed[ 'content' ] .= '<'.$child[ '_type' ].'>';
}
$childs = $this->handleChildrens($child, $parsed);
if ($childs && isset($child[ 'content' ])) {
$parsed[ 'content' ] .= $childs[ 'content' ];
}
} else {
if (! isset($parsed[ 'content' ])) {
$parsed[ 'content' ] = $child[ 'content' ][ '_content' ];
} else {
$parsed[ 'content' ] .= $child[ 'content' ][ '_content' ];
}
}
if ($contentChilds == $index + 1) {
$parsed[ 'content' ] = $this->closetags($parsed[ 'content' ]);
}
$parsed[ 'children' ] = [];
}
}
return $parsed;
}
}
private function fixChildrenStructure($data)
{
$result = [];
$alreadyHandledIndexes = [];
for ($i = 0; $i < count($data); $i++) {
if (isset($data[ $i ][ 'content' ]) && $data[ $i ][ 'content' ] == '<ol>') {
$alreadyHandledIndexes[] = $i;
continue;
}
if (array_key_exists($i, $alreadyHandledIndexes)) {
continue;
}
if(isset($data[ $i ]['content']) && $data[ $i ]['content']==='' && count($data[ $i ]['children'])==1){
$data[ $i ] = last($data[ $i ]['children']);
}
$j = $i + 1;
for ($j; $j < count($data); $j++) {
if (array_key_exists($i, $alreadyHandledIndexes)) {
continue;
}
if (! isset($data[ $j ][ 'content' ]) || strpos($data[ $j ][ 'content' ], 'h1') !== false) {
break;
}
if(isset($data[$i]['numbering_row'])){
$data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
$alreadyHandledIndexes[] = $j;
}else {
break;
}
}
//if (isset($data[ $i ][ 'content' ]) && empty($data[ $i ][ 'content' ])) {
// $data[ $i ] = last($data[ $i ][ 'children' ]);
//}
if (is_array($data[ $i ]) && count($data[ $i ]) > 1 && ! isset($data[ $i ][ 'content' ])) {
$result = array_merge($result, $data[ $i ]);
} else {
$result[] = $data[ $i ];
}
$alreadyHandledIndexes[] = $i;
}
return $result;
}
private function handlePossibleChild($parent, $child = [])
{
if($child['content']===''){
dd($parent);
}
if (isset($parent[ 'children' ])) {
if (empty($parent[ 'content' ]) && count($parent[ 'children' ]) === 1) {
$parent = $parent[ 'children' ][ 0 ];
} elseif (empty($parent[ 'content' ]) && count($parent[ 'children' ]) > 1) {
$parent = $this->fixChildrenStructure($parent[ 'children' ]);
}
}
if (isset($child[ 'content' ]) && $child[ 'content' ] == '<ol>') {
for ($i = 0; $i < count($child[ 'children' ]); $i++) {
$newChild = $child[ 'children' ][ $i ];
if ($child[ 'children' ][ $i ][ 'content' ] == '<ol>') {
$lastParentChild = last($parent[ 'children' ]);
$newChild = $this->handlePossibleChild($lastParentChild, $child[ 'children' ][ $i ]);
}
$parent[ 'children' ][] = $newChild;
}
//return $parent;
}
if (isset($parent[ 'clean_content' ]) && strlen($parent[ 'clean_content' ]) && strpbrk(substr($parent[ 'clean_content' ],
-1), '.,;\'"0123456789') === false && ctype_lower(substr($parent[ 'clean_content' ],
-1)) && isset($child[ 'clean_content' ]) && strlen($child[ 'clean_content' ])) {
$parent[ 'content' ] .= ' '.$child[ 'content' ];
$parent[ 'children' ] = array_merge($parent[ 'children' ], $child[ 'children' ]);
$parent[ 'clean_content' ] .= ' '.$child[ 'clean_content' ];
}
if (is_array($parent) && count($parent) == 1 && ! isset($parent[ 'content' ])) {
$parent = array_shift($parent);
}
return $parent;
}
}