|
|
<?php
namespace App\Parser;
use Illuminate\Support\Facades\Log; use Illuminate\Support\Facades\Storage; use SimpleXMLElement;
class ParseXml {
/**
 * Font value that marks a row as a title; null until setTitleThreshold() picks one.
 *
 * @var int|null
 */
private $titleFontThreshold;

/**
 * Font value recorded by setHeaderFooterThreshold(); null until one is found.
 * Stored exactly as read from the element (no cast is applied there).
 *
 * @var mixed
 */
private $headerFontFooterThreshold;

/**
 * ParseXml constructor.
 *
 * Both thresholds start out unset; they are filled in lazily by the
 * corresponding set*Threshold() methods.
 */
public function __construct()
{
    $this->titleFontThreshold = null;
    $this->headerFontFooterThreshold = null;
}
/**
 * Handle xml files
 *
 * Reads the XML document, selects every <text> element in it and converts
 * them into the nested row structure produced by buildChildStructure().
 *
 * @param $xmlFile When a string: a path on the "contracts" storage disk.
 *                 Any other value is handed to file_get_contents() as-is.
 *
 * @return mixed The structure returned by buildChildStructure().
 *
 * @throws \RuntimeException When the file cannot be read or cannot be parsed as XML.
 */
public function handle($xmlFile)
{
    if (is_string($xmlFile)) {
        try {
            $storageDisk = Storage::disk('contracts');

            // The file may not have been written yet: poll once per second until it exists.
            // NOTE(review): this wait has no upper bound; a file that never appears blocks forever.
            while (! $storageDisk->exists($xmlFile)) {
                sleep(1);
            }

            $file = $storageDisk->get($xmlFile);
        } catch (\Exception $exception) {
            Log::error('Failed to load the xml file '.$exception->getMessage());

            // Previously execution continued with $file undefined and died later on
            // an unrelated "member function on bool" error; fail here with the real cause.
            throw new \RuntimeException('Failed to load the xml file '.$xmlFile, 0, $exception);
        }
    } else {
        $file = file_get_contents($xmlFile);
    }

    if (! is_string($file)) {
        Log::error('Failed to load the xml file: no content could be read');

        throw new \RuntimeException('Failed to load the xml file: no content could be read');
    }

    $document = simplexml_load_string($file);

    // simplexml_load_string() returns false on malformed input; calling xpath() on it is fatal.
    if ($document === false) {
        Log::error('Failed to parse the xml file');

        throw new \RuntimeException('Failed to parse the xml file');
    }

    return $this->buildChildStructure($this->handleElements($document->xpath('//text')));
}
/**
 * Turn the list of <text> nodes into flat row arrays, merging consecutive
 * nodes that the heuristics below consider to be one logical row.
 *
 * Entries that are not SimpleXMLElement instances are skipped. While merging,
 * the first node of a row ($current) is mutated: its "top" and "height"
 * attributes are overwritten with those of the last merged node and a
 * "row_numbering" attribute may be added.
 *
 * @param $element Array of nodes, or a single value that is cast to an array.
 *
 * @return mixed List of rows, each with the keys: type ('title' or null),
 *               content, numbering, top, height, left, font and children
 *               (always an empty array at this stage).
 */
private function handleElements($element) { if (is_array($element)) { $elements = $element; } else { $elements = (array) $element; }
// Pick the title font first; it decides the 'type' of every row built below.
$this->setTitleThreshold($elements); $numberOfNodes = count($elements); $rows = []; for ($i = 0; $i < $numberOfNodes; $i++) { $current = $elements[ $i ]; $listContent = []; if ($current instanceof SimpleXMLElement) { $content = $this->getNodeContent($current);
// Single-number match found in the row prefix; reset for every row.
$parentNumbering = [];
// Merge the next node into the current row while either:
//   (a) the next node exists AND
//         - [ (its "top" is not above the current "top" and is at most current top + height + 3)
//             OR its text starts with a lowercase letter
//             OR its first character is not ','
//             OR the accumulated content ends with a lowercase letter ]
//           AND the content (with the substrings 'and'/'or' removed) does not end in one of ! . ? ; _ : )
//           AND the content does not match the dash-number-dash pattern
//           AND the next node does not start with '<b>' and the content ends with a lowercase letter
//         - OR $elements[$i] and $elements[$i + 1] share the same "top"
//   (b) the next node's tag-stripped text is exactly '['.
// Note that $i advances inside the loop while $current keeps pointing at the first node of the row,
// so $elements[$i] is the most recently merged node.
while ($i + 1 <= $numberOfNodes && isset($elements[ $i + 1 ]) && (((((((int) $elements[ $i + 1 ][ 'top' ] === (int) $current[ 'top' ]) || (int) $elements[ $i + 1 ][ 'top' ] <= ((int) $current[ 'top' ] + (int) $current[ 'height' ] + 3)) && (int) $current[ 'top' ] <= (int) $elements[ $i + 1 ][ 'top' ]) || (ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0,1))) || (! in_array(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0, 1), [','])) || (ctype_lower(substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1)))) && ! in_array(substr(trim(str_replace(['and','or'], '', $content)), strlen(trim(str_replace(['and', 'or'], '', $content))) - 1),['!', '.', '?', ';', '_', ':', ')']) && ! preg_match('/^.*?\-[^\d]*(\d+)[^\d]*\-.*$/',$content) && (substr(trim($this->getNodeContent($elements[ $i + 1 ])), 0,strlen('<b>')) !== '<b>' && ctype_lower((substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1))))) || ((int) $elements[ $i ][ 'top' ] === (int) $elements[ $i + 1 ][ 'top' ])) || (isset($elements[ $i + 1 ]) && trim(strip_tags($this->getNodeContent($elements[ $i+1])))=='[') ) {
// Look for a numeric prefix in the first 5 characters of the cleaned content: entities decoded,
// everything from the first '{' dropped, ')' turned into '.', then only digits, '.' and ')' kept.
// The first pattern needs two consecutive numeric parts ($childNumbering); only when it fails is
// the single-number pattern tried ($parentNumbering).
preg_match('/^([-+]?\d*\.?\d+)(?:[-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/', preg_replace('/[^0-9\.)]/', '', substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '', preg_replace('/\)/', '.', preg_replace("/\{.+/", "", html_entity_decode($content))))), 0, 5)), $childNumbering); if (! $childNumbering) { preg_match('/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/', preg_replace('/[^0-9\.)]/', '', substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '', preg_replace('/\)/', '.', preg_replace("/\{.+/", "", html_entity_decode($content))))), 0, 5)), $parentNumbering); }
// Append the next node's content and take over its vertical position and height.
$nextElement = $elements[ $i + 1 ]; $nextElementContent = $this->getNodeContent($nextElement); $content .= ' '.$nextElementContent; $current[ 'top' ] = $nextElement[ 'top' ]; $current[ 'height' ] = $nextElement[ 'height' ];
// A single-number prefix ends the row right after this merge; the number is removed from the
// content (str_replace removes every occurrence, not only the prefix).
if (count($parentNumbering)) { $current[ 'row_numbering' ] = $parentNumbering[ 0 ]; $content = str_replace($current[ 'row_numbering' ], '', $content); $i++;
break;
// A multi-part prefix is recorded and stripped too, but the row is only ended when the remaining
// text is non-empty, does not end in . : ! ? [ , or a lowercase letter, and the following node
// does not start with a lowercase letter or does not start with '[' / '{'.
} elseif ($childNumbering) { $current[ 'row_numbering' ] = $childNumbering[ 0 ]; $content = str_replace($current[ 'row_numbering' ], '', $content); if (strlen(trim(strip_tags($content))) && ! in_array(substr(trim(strip_tags($content)), strlen(trim(strip_tags($content))) - 1), ['.', ':', '!', '?','[',',']) && !ctype_lower(substr(trim(strip_tags($content)), strlen(trim(strip_tags($content)))-1)) && (!ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[$i+1]))), 0, 1)) || !in_array(substr(trim(strip_tags($this->getNodeContent($elements[$i+1]))), 0, 1), ['[', '{']))) { $i++;
break; }
// Also end the row when a numbering is known and the node at index $i consists of digits only.
} if( ! empty($current[ 'row_numbering' ]) && ctype_digit(trim(preg_replace("/[^0-9a-zA-Z]/", "", strip_tags($this->getNodeContent($elements[$i])))))){
$i++; break; }
// Otherwise keep merging: advance to the node just consumed and re-evaluate the loop condition.
$i++;
continue; }
// Numbering taken from the text itself; used only when no row_numbering was recorded above.
$data = $this->extractNumbering($content);
// NOTE(review): row_numbering is cast to int here, so a value such as "2.1" would become 2 - confirm this is intended.
$content = [ 'type' => (int) $current[ 'font' ] === $this->titleFontThreshold ? 'title' : null, 'content' => $data[ 'content' ], 'numbering' => (! empty($current[ 'row_numbering' ])) ? (int)$current[ 'row_numbering' ] : $data[ 'numbering' ], 'top' => (int) $current[ 'top' ], 'height' => (int) $current[ 'height' ], 'left' => (int) $current[ 'left' ], 'font' => (int) $current[ 'font' ], 'children' => $listContent ];
$rows[] = $content; }
}
return $rows; }
/**
 * Returns the xml node content
 *
 * Serialises the node, takes whatever sits between the first opening <text ...>
 * tag and the first closing </text> tag (inline markup included) and collapses
 * every run of whitespace into a single space.
 *
 * @param $node Node exposing asXML().
 *
 * @return string|string[]|null The normalised inner content, '' when the node has
 *                              no <text>...</text> pair; null only if preg_replace fails.
 */
private function getNodeContent($node)
{
    $xml = $node->asXML();

    // The "s" modifier lets "." match newlines: without it a node whose text spans
    // several lines did not match at all and its content was silently dropped,
    // even though the whitespace collapse below is there to flatten such content.
    if (! is_string($xml) || ! preg_match('/<text.*?>(.*?)<\/text>/s', $xml, $matches)) {
        return '';
    }

    return preg_replace('!\s+!', ' ', $matches[1]);
}
/**
 * Extract the numbering if exists from the string
 *
 * Two anchored patterns are tried in order:
 *   1. one or more "<alphanumerics>." / "<alphanumerics>)" groups followed by a
 *      space or a letter, e.g. "1.", "1.2.", "a)";
 *   2. digits and dots ending in a digit, e.g. "1.1", "2.3.4".
 * The numbering is the first capture group, i.e. the whole marker. The previous
 * implementation took the LAST capture group, which is the character after the
 * marker for pattern 1 (so it was never returned) and the marker minus its
 * final digit for pattern 2 ("1.1" became "1.").
 *
 * Only the leading marker is removed from the content; later occurrences of the
 * same text inside the row are left untouched.
 *
 * NOTE(review): the patterns are loose - pattern 1 also matches a leading word
 * followed by "." and pattern 2 matches any number with two or more digits.
 *
 * @param $content Row text; must start with the marker (no leading whitespace or tags).
 *
 * @return array ['content' => '<p>...</p>', 'numbering' => string] - numbering is ''
 *               when no marker longer than one character was found.
 */
private function extractNumbering($content)
{
    $patterns = [
        '/^(([a-zA-Z0-9]+[.\)])+)([ ]|[a-z]|[A-Z])/',
        '/^(([\d\.]+)\d)/',
    ];

    $numbering = '';

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $content, $groups)) {
            // Group 1 is the complete marker; a match on the first pattern
            // deliberately prevents the second one from being tried.
            $numbering = $groups[1];
            break;
        }
    }

    if (strlen($numbering) > 1) {
        // Both patterns are anchored at the start, so the marker is a prefix of $content.
        return [
            'content'   => '<p>'.trim(substr($content, strlen($numbering))).'</p>',
            'numbering' => $numbering,
        ];
    }

    return [
        'content'   => '<p>'.trim($content).'</p>',
        'numbering' => '',
    ];
}
/**
 * Build the structure as required by the editor and the gamification module
 *
 * Walks the flat row list and nests rows under the closest preceding row that
 * qualifies as their parent. For every not-yet-consumed row, the following rows
 * are attached (through handlePossibleChild()) for as long as they are further
 * to the right than the row ("left" is larger), or the row is a title and the
 * following row has no type. Scanning stops at the first row that does not
 * qualify, or at a title on a different "top" (unless the current row's content
 * consists of digits only). Rows flagged as header or footer are never emitted
 * and never attached.
 *
 * Example (numbering shown for illustration): the flat list
 *   1, 1.1, 1.1.1, 1.2, 2
 * becomes two top-level rows, "1" (with 1.1 > 1.1.1 and 1.2 nested) and "2".
 *
 * @param $elements List of rows as produced by handleElements().
 *
 * @return array Top-level rows with their descendants under "children".
 */
private function buildChildStructure($elements)
{
    $handled = [];   // set of consumed indexes, keyed by index for O(1) lookups
    $build = [];
    $total = count($elements);

    // The bound used to be count - 1, which meant the last row was never emitted
    // unless an earlier row had absorbed it as a child (a single-row document
    // produced an empty result).
    for ($i = 0; $i < $total; $i++) {
        $elements[ $i ] = $this->flagHeaderOrFooter($elements[ $i ]);

        if (isset($handled[ $i ]) || $this->isHeaderOrFooter($elements[ $i ])) {
            continue;
        }

        for ($j = $i + 1; $j < $total; $j++) {
            $elements[ $j ] = $this->flagHeaderOrFooter($elements[ $j ]);

            if (isset($handled[ $j ]) || $this->isHeaderOrFooter($elements[ $j ])) {
                continue;
            }

            // A title on another line starts a new section, unless the current
            // row's text is digits only.
            $currentText = trim(preg_replace("/[^0-9a-zA-Z]/", "", strip_tags($elements[ $i ][ 'content' ])));
            if ($elements[ $j ][ 'type' ] === 'title'
                && $elements[ $i ][ 'top' ] !== $elements[ $j ][ 'top' ]
                && ! ctype_digit($currentText)
            ) {
                break;
            }

            $isIndentedDeeper = $elements[ $i ][ 'left' ] < $elements[ $j ][ 'left' ];
            $isBodyOfTitle = $elements[ $i ][ 'type' ] == 'title' && is_null($elements[ $j ][ 'type' ]);

            if (! $isIndentedDeeper && ! $isBodyOfTitle) {
                break;
            }

            $elements[ $i ] = $this->handlePossibleChild($elements[ $i ], $elements[ $j ]);
            $handled[ $j ] = true;
        }

        if (! $this->isHeaderOrFooter($elements[ $i ])) {
            $build[] = $elements[ $i ];
        }

        $handled[ $i ] = true;
    }

    return $build;
}

/**
 * Flag a row without a type as page header (top < 100) or footer (top > 1150).
 *
 * @param $row Row array with at least a "top" key.
 *
 * @return mixed The row, with "type" set when it falls in one of the two bands.
 */
private function flagHeaderOrFooter($row)
{
    if (! isset($row[ 'type' ])) {
        if ($row[ 'top' ] < 100) {
            $row[ 'type' ] = 'header';
        } elseif ($row[ 'top' ] > 1150) {
            $row[ 'type' ] = 'footer';
        }
    }

    return $row;
}

/**
 * Tell whether a row has been flagged as page header or footer.
 *
 * @param $row Row array.
 *
 * @return bool
 */
private function isHeaderOrFooter($row)
{
    return isset($row[ 'type' ]) && in_array($row[ 'type' ], ['footer', 'header'], true);
}
/**
 * Handle each node child's
 *
 * Inserts $child into the subtree of $parent. The caller has already decided
 * that $child belongs somewhere below $parent; this method only picks the depth:
 *
 *   1
 *     1.1
 *       1.1.1   <- further right than the last child: descend into that child
 *     1.2       <- not further right than the last child: direct child of $parent
 *
 * Previously a child that was LESS indented than the last child (and did not hit
 * the title special case) matched no branch and was silently discarded, although
 * the caller had marked it as handled. Such a child is now appended to $parent,
 * so no row is lost.
 *
 * @param $parent Row array with "left" and "children" keys.
 * @param $child  Row array with a "left" key.
 *
 * @return mixed The parent with the child inserted.
 */
protected function handlePossibleChild($parent, $child)
{
    $lastIndex = count($parent[ 'children' ]) - 1;

    // First child, or not indented deeper than the previous child: direct child.
    if ($lastIndex < 0 || $child[ 'left' ] <= $parent[ 'children' ][ $lastIndex ][ 'left' ]) {
        $parent[ 'children' ][] = $child;

        return $parent;
    }

    // Indented deeper than the previous child: it is that child's descendant.
    $parent[ 'children' ][ $lastIndex ] = $this->handlePossibleChild($parent[ 'children' ][ $lastIndex ], $child);

    return $parent;
}
/**
 * Sets the title threshold
 *
 * Chooses the font value that marks a row as a title and stores it in
 * $titleFontThreshold. The first element that has a following element and
 * either contains a <b> child, sits at index 0, or has a smaller font value
 * than its successor wins. With a zero-based list that is always the first
 * element, provided the list holds at least two entries.
 *
 * Nothing happens when a threshold is already set.
 * NOTE(review): the threshold is never reset, so an instance reused for a second
 * document keeps the first document's value - confirm this is intended.
 *
 * @param $elements List of nodes exposing a "font" entry.
 */
protected function setTitleThreshold($elements)
{
    if (isset($this->titleFontThreshold)) {
        return;
    }

    $total = count($elements);

    foreach ($elements as $index => $element) {
        // The last element has no successor to compare against.
        if ($index + 1 >= $total) {
            continue;
        }

        $nextElement = $elements[ $index + 1 ];

        // The bold check used to read an undefined $current variable and could never be true.
        $isTitleCandidate = isset($element->b)
            || $index == 0
            || (! is_null($nextElement) && (int) $element[ 'font' ] < (int) $nextElement[ 'font' ]);

        if ($isTitleCandidate) {
            $this->titleFontThreshold = (int) $element[ 'font' ];

            // Once set, no later element can change it.
            return;
        }
    }
}
/**
 * Sets the header and footer threshold
 *
 * Scans consecutive pairs of elements and, as long as no threshold has been
 * stored yet, records the "font" of the first following element that has no
 * "type" and whose "top" is smaller than the "top" of the element before it.
 *
 * @param $elements List of elements exposing "top", "font" and optionally "type".
 */
protected function setHeaderFooterThreshold($elements)
{
    foreach ($elements as $position => $row) {
        $followingKey = $position + 1;

        // Nothing to compare against, or a threshold was already chosen.
        if (! isset($elements[ $followingKey ]) || isset($this->headerFontFooterThreshold)) {
            continue;
        }

        $following = $elements[ $followingKey ];
        $isUntyped = ! isset($following[ 'type' ]);

        if ($isUntyped && $row[ 'top' ] > $following[ 'top' ]) {
            $this->headerFontFooterThreshold = $following[ 'font' ];
        }
    }
}
}
|