You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
406 lines
14 KiB
406 lines
14 KiB
<?php
|
|
|
|
namespace App\Parser;
|
|
|
|
use Illuminate\Support\Facades\Log;
|
|
use Illuminate\Support\Facades\Storage;
|
|
use SimpleXMLElement;
|
|
|
|
class ParseXml
|
|
{
|
|
|
|
/**
 * Font id that marks a row as a title.
 *
 * handleElements() tags a row 'title' when its font equals this value.
 * Stays null until setTitleThreshold() has picked a font.
 *
 * @var int|null
 */
private $titleFontThreshold;

/**
 * Font id recorded by setHeaderFooterThreshold().
 *
 * Stays null until that method finds a matching element. The value is
 * stored without a cast, so its type follows the element's 'font' entry.
 *
 * @var int|null
 */
private $headerFontFooterThreshold;
|
|
|
|
|
|
/**
 * Create a parser whose font thresholds are both unset (null).
 */
public function __construct()
{
    $this->titleFontThreshold = $this->headerFontFooterThreshold = null;
}
|
|
|
|
|
|
/**
 * Load an XML document and convert its <text> nodes into the nested row structure.
 *
 * A string argument is treated as a path on the "contracts" storage disk; the
 * method polls for the file because it may not have been written yet. Any other
 * argument is passed to file_get_contents() unchanged.
 *
 * When the document cannot be loaded or parsed the failure is logged and an
 * empty array is returned.
 *
 * @param string|mixed $xmlFile Path on the "contracts" disk, or anything file_get_contents() accepts
 *
 * @return array Rows as produced by buildChildStructure(); empty on failure
 */
public function handle($xmlFile)
{
    if (is_string($xmlFile)) {
        try {
            $storageDisk = Storage::disk('contracts');

            // Wait for the file to appear, but give up after a bounded time so a
            // file that never arrives cannot block the calling process forever.
            $maxWaitSeconds = 60;
            for ($waited = 0; $waited < $maxWaitSeconds && ! $storageDisk->exists($xmlFile); $waited++) {
                sleep(1);
            }

            $file = $storageDisk->get($xmlFile);
        } catch (\Exception $exception) {
            Log::error('Failed to load the xml file '.$exception->getMessage());

            return [];
        }
    } else {
        $file = file_get_contents($xmlFile);
    }

    // get()/file_get_contents() may yield null/false instead of throwing.
    if (! is_string($file) || $file === '') {
        Log::error('Failed to load the xml file: no content could be read');

        return [];
    }

    $document = simplexml_load_string($file);
    if ($document === false) {
        Log::error('Failed to load the xml file: content is not well-formed xml');

        return [];
    }

    $textNodes = $document->xpath('//text');

    return $this->buildChildStructure($this->handleElements(is_array($textNodes) ? $textNodes : []));
}
|
|
|
|
|
|
|
|
/**
 * Collapse consecutive <text> nodes that belong together into single rows.
 *
 * Each SimpleXMLElement in the list starts a row; following nodes are appended
 * to it for as long as the merge condition of the inner loop holds. Entries
 * that are not SimpleXMLElement instances are skipped.
 *
 * Note: while merging, the 'top', 'height' and 'row_numbering' attributes are
 * written onto the node that started the row.
 *
 * @param SimpleXMLElement[]|mixed $element List of nodes (handle() passes the result of xpath('//text'))
 *
 * @return array One entry per row with keys type, content, numbering, top, height, left, font, children
 */
private function handleElements($element)
{
    $elements = is_array($element) ? $element : (array) $element;

    $this->setTitleThreshold($elements);

    $numberOfNodes = count($elements);
    $rows = [];

    for ($i = 0; $i < $numberOfNodes; $i++) {
        $current = $elements[ $i ];

        if (! ($current instanceof SimpleXMLElement)) {
            continue;
        }

        $content = $this->getNodeContent($current);
        $parentNumbering = [];

        // Merge condition, kept exactly as written. Its shape is:
        //   (next node exists AND (continuation heuristics OR next shares the row start's "top"))
        //   OR the next node is a lone "[".
        // The continuation heuristics require all of:
        //   - one of: next sits on the same/overlapping line, next starts lowercase,
        //     next does not start with ",", or the collected text ends lowercase;
        //   - the collected text (with "and"/"or" removed) does not end in ! . ? ; _ : )
        //   - the collected text does not match the dash-digits-dash pattern;
        //   - next does not start with "<b>" and the collected text ends lowercase.
        // NOTE(review): "next does not start with ','" is true for nearly every node,
        // which makes the first group almost always pass - confirm this is intended.
        while ($i + 1 <= $numberOfNodes && isset($elements[ $i + 1 ]) &&
            (((((((int) $elements[ $i + 1 ][ 'top' ] === (int) $current[ 'top' ]) || (int) $elements[ $i + 1 ][ 'top' ] <= ((int) $current[ 'top' ] + (int) $current[ 'height' ] + 3)) && (int) $current[ 'top' ] <= (int) $elements[ $i + 1 ][ 'top' ])
                || (ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0,1)))
                || (! in_array(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0, 1), [',']))
                || (ctype_lower(substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1))))
                && ! in_array(substr(trim(str_replace(['and','or'], '', $content)), strlen(trim(str_replace(['and', 'or'], '', $content))) - 1),['!', '.', '?', ';', '_', ':', ')'])
                && ! preg_match('/^.*?\-[^\d]*(\d+)[^\d]*\-.*$/',$content)
                && (substr(trim($this->getNodeContent($elements[ $i + 1 ])), 0,strlen('<b>')) !== '<b>'
                    && ctype_lower((substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1)))))
                || ((int) $elements[ $i ][ 'top' ] === (int) $elements[ $i + 1 ][ 'top' ]))
            || (isset($elements[ $i + 1 ]) && trim(strip_tags($this->getNodeContent($elements[ $i+1])))=='[')
        ) {
            // Reduce the first characters of the collected text to digits, dots and
            // ")" so a leading clause number can be recognised.
            $decoded = html_entity_decode($content);
            $withoutBraces = preg_replace("/\{.+/", "", $decoded);
            $dotted = preg_replace('/\)/', '.', $withoutBraces);
            $alphanumeric = trim(preg_replace('/[^A-Za-z0-9.)]/', '', $dotted));
            $numberingCandidate = preg_replace('/[^0-9\.)]/', '', substr($alphanumeric, 0, 5));

            // First try the pattern with two back-to-back numeric tokens, then the
            // single-token one.
            preg_match('/^([-+]?\d*\.?\d+)(?:[-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/', $numberingCandidate, $childNumbering);
            if (! $childNumbering) {
                preg_match('/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/', $numberingCandidate, $parentNumbering);
            }

            // Append the next node and let the row adopt its vertical position.
            $nextElement = $elements[ $i + 1 ];
            $nextElementContent = $this->getNodeContent($nextElement);
            $content .= ' '.$nextElementContent;
            $current[ 'top' ] = $nextElement[ 'top' ];
            $current[ 'height' ] = $nextElement[ 'height' ];

            if (count($parentNumbering)) {
                $current[ 'row_numbering' ] = $parentNumbering[ 0 ];
                $content = str_replace($current[ 'row_numbering' ], '', $content);
                $i++;

                break;
            } elseif ($childNumbering) {
                $current[ 'row_numbering' ] = $childNumbering[ 0 ];
                $content = str_replace($current[ 'row_numbering' ], '', $content);

                $plainContent = trim(strip_tags($content));
                $lastCharacter = substr($plainContent, strlen($plainContent) - 1);
                $nextFirstCharacter = substr(trim(strip_tags($nextElementContent)), 0, 1);

                // Stop merging when the text is non-empty and ends neither in one of
                // the listed characters nor in a lowercase letter.
                // NOTE(review): the last clause cannot be false (a character is never
                // both lowercase and "[" / "{"); kept to preserve behaviour.
                if (strlen($plainContent)
                    && ! in_array($lastCharacter, ['.', ':', '!', '?','[',','])
                    && ! ctype_lower($lastCharacter)
                    && (! ctype_lower($nextFirstCharacter) || ! in_array($nextFirstCharacter, ['[', '{']))
                ) {
                    $i++;

                    break;
                }
            }

            // A numbered row whose starting node holds only digits ends here.
            if (! empty($current[ 'row_numbering' ]) && ctype_digit(trim(preg_replace("/[^0-9a-zA-Z]/",
                    "", strip_tags($this->getNodeContent($elements[ $i ])))))) {
                $i++;

                break;
            }

            $i++;
        }

        $data = $this->extractNumbering($content);

        $rows[] = [
            'type'      => (int) $current[ 'font' ] === $this->titleFontThreshold ? 'title' : null,
            'content'   => $data[ 'content' ],
            'numbering' => (! empty($current[ 'row_numbering' ])) ? (int)$current[ 'row_numbering' ] : $data[ 'numbering' ],
            'top'       => (int) $current[ 'top' ],
            'height'    => (int) $current[ 'height' ],
            'left'      => (int) $current[ 'left' ],
            'font'      => (int) $current[ 'font' ],
            'children'  => [],
        ];
    }

    return $rows;
}
|
|
|
|
|
|
/**
 * Return the inner markup of a <text> node with whitespace runs collapsed.
 *
 * The node is serialised with asXML() and everything between the opening and
 * closing <text> tag is returned, inline tags included. Every run of whitespace
 * becomes a single space. A node without a closing tag yields an empty string.
 *
 * @param SimpleXMLElement $node
 *
 * @return string|null Collapsed content; null only if preg_replace() itself fails
 */
private function getNodeContent($node)
{
    // The "s" modifier lets "." match line breaks. Without it a node whose text
    // contains a newline never matched and its content was silently dropped.
    if (! preg_match('/<text.*?>(.*?)<\/text>/s', $node->asXML(), $matches)) {
        return '';
    }

    return preg_replace('!\s+!', ' ', $matches[ 1 ]);
}
|
|
|
|
|
|
/**
 * Split a leading clause number off a row's text.
 *
 * Two shapes are recognised at the very start of the string:
 *  - one or more alphanumeric tokens each closed by "." or ")", followed by a
 *    space or a letter (e.g. "1. ", "1.1. ", "a) ");
 *  - a dotted number ending in a digit (e.g. "1.1", "2.1.5"). A bare integer
 *    such as "30" is not treated as numbering.
 *
 * Only the leading prefix is removed; the same characters later in the text
 * are left alone. A numbering of a single character is ignored.
 *
 * NOTE(review): the first shape accepts any alphanumeric token, so text that
 * starts with "e.g. " is also read as numbering - confirm this is acceptable.
 *
 * @param string $content
 *
 * @return array ['content' => text wrapped in <p>…</p>, 'numbering' => string, '' when none]
 */
private function extractNumbering($content)
{
    $regexOne = '/^(([a-zA-Z0-9]+[.\)])+)([ ]|[a-z]|[A-Z])/';
    $regexTwo = '/^(([\d\.]+)\d)/';

    // Group 1 is the complete numbering in both patterns. The last group is the
    // character after it (regexOne) or the numbering minus its final digit (regexTwo).
    $numbering = '';
    if (preg_match($regexOne, $content, $matches)) {
        $numbering = trim($matches[ 1 ]);
    } elseif (preg_match($regexTwo, $content, $matches) && strpos($matches[ 1 ], '.') !== false) {
        $numbering = trim($matches[ 1 ]);
    }

    if (strlen($numbering) > 1) {
        return [
            // Both patterns are anchored at the start, so the numbering is a prefix.
            'content'   => '<p>'.trim(substr($content, strlen($numbering))).'</p>',
            'numbering' => $numbering,
        ];
    }

    return [
        'content'   => '<p>'.trim($content).'</p>',
        'numbering' => '',
    ];
}
|
|
|
|
|
|
/**
 * Nest the flat row list into the parent/children structure required by the
 * editor and the gamification module.
 *
 * Rows are visited in order. Each row not yet attached to an earlier row becomes
 * a top-level entry and absorbs the rows that follow it while they are either
 * indented further to the right or, for a title row, untyped. Absorbing stops at
 * the next title on a different line (unless the current row's text is only
 * digits) or at the first row that does not qualify.
 *
 * Untyped rows with top < 100 are marked 'header' and those with top > 1150
 * 'footer'; both are left out of the result.
 *
 * @param array $elements Rows as produced by handleElements()
 *
 * @return array Top-level rows with their descendants under 'children'
 */
private function buildChildStructure($elements)
{
    // Vertical limits of the page body.
    // NOTE(review): presumably page coordinates of the converter output - confirm.
    $headerBelowTop = 100;
    $footerAboveTop = 1150;

    // Mark an untyped row as header/footer according to its vertical position.
    $classify = function ($row) use ($headerBelowTop, $footerAboveTop) {
        if (! isset($row[ 'type' ])) {
            if ($row[ 'top' ] < $headerBelowTop) {
                $row[ 'type' ] = 'header';
            } elseif ($row[ 'top' ] > $footerAboveTop) {
                $row[ 'type' ] = 'footer';
            } else {
                $row[ 'type' ] = null;
            }
        }

        return $row;
    };

    $isPageFurniture = function ($row) {
        return $row[ 'type' ] === 'header' || $row[ 'type' ] === 'footer';
    };

    $handled = []; // index => true, for constant-time lookups
    $build = [];
    $total = count($elements);

    // The bound is $total, not $total - 1: the last row must be visited too,
    // otherwise a trailing top-level row (or a single-row document) is lost.
    for ($i = 0; $i < $total; $i++) {
        $elements[ $i ] = $classify($elements[ $i ]);

        if (isset($handled[ $i ]) || $isPageFurniture($elements[ $i ])) {
            continue;
        }

        for ($j = $i + 1; $j < $total; $j++) {
            $elements[ $j ] = $classify($elements[ $j ]);

            if (isset($handled[ $j ]) || $isPageFurniture($elements[ $j ])) {
                continue;
            }

            // A title on another line closes the current section, unless the
            // current row's text consists of digits only.
            if ($elements[ $j ][ 'type' ] === 'title'
                && $elements[ $i ][ 'top' ] !== $elements[ $j ][ 'top' ]
                && ! ctype_digit(trim(preg_replace("/[^0-9a-zA-Z]/", "", strip_tags($elements[ $i ][ 'content' ]))))
            ) {
                break;
            }

            $isIndentedFurther = $elements[ $i ][ 'left' ] < $elements[ $j ][ 'left' ];
            $isBodyOfTitle = $elements[ $i ][ 'type' ] === 'title' && is_null($elements[ $j ][ 'type' ]);

            if (! $isIndentedFurther && ! $isBodyOfTitle) {
                break;
            }

            $elements[ $i ] = $this->handlePossibleChild($elements[ $i ], $elements[ $j ]);
            $handled[ $j ] = true;
        }

        $build[] = $elements[ $i ];
        $handled[ $i ] = true;
    }

    return $build;
}
|
|
|
|
|
|
/**
 * Attach a row to a parent at the nesting depth implied by its indentation.
 *
 * The row's 'left' is compared with the parent's most recently added child:
 *  - further right: the row belongs below that child, so the method recurses into it;
 *  - otherwise: the row is appended as a direct child of the parent.
 *
 * The second case covers rows at the same indent as the last child, body rows
 * of a title, and rows indented less than the last child. The latter used to
 * be discarded, which lost their text.
 *
 * @param array $parent Row with a 'children' list
 * @param array $child  Row to attach
 *
 * @return array The parent with the child attached somewhere in its subtree
 */
protected function handlePossibleChild($parent, $child)
{
    $childCount = count($parent[ 'children' ]);

    if ($childCount > 0) {
        $lastIndex = $childCount - 1;
        $lastParentChild = $parent[ 'children' ][ $lastIndex ];

        // Deeper than the last child: it is a child or grandchild of that child.
        if ($child[ 'left' ] > $lastParentChild[ 'left' ]) {
            $parent[ 'children' ][ $lastIndex ] = $this->handlePossibleChild($lastParentChild, $child);

            return $parent;
        }
    }

    $parent[ 'children' ][] = $child;

    return $parent;
}
|
|
|
|
|
|
/**
 * Choose the font id that identifies title rows, unless one is already set.
 *
 * The elements are scanned in order and the font of the first qualifying
 * element is stored as an int. An element qualifies when it has a successor
 * and it has a <b> child, or its key is 0, or its font id is lower than the
 * successor's.
 *
 * NOTE(review): for a plain list the "key is 0" rule always selects the first
 * element, so the other two rules never decide - confirm this is intended.
 *
 * A threshold set by an earlier call is kept, and a list with fewer than two
 * elements leaves it untouched.
 *
 * @param array $elements Nodes or rows exposing a 'font' entry
 */
protected function setTitleThreshold($elements)
{
    if (isset($this->titleFontThreshold)) {
        return;
    }

    $total = count($elements);

    foreach ($elements as $index => $element) {
        if ($index + 1 >= $total) {
            continue;
        }

        $nextElement = $elements[ $index + 1 ];

        // The bold check reads $element; it previously read an undefined $current.
        if (isset($element->b)
            || $index == 0
            || (! is_null($nextElement) && (int) $element[ 'font' ] < (int) $nextElement[ 'font' ])
        ) {
            $this->titleFontThreshold = (int) $element[ 'font' ];

            return;
        }
    }
}
|
|
|
|
|
|
/**
 * Record the font of the first untyped element that sits above its predecessor.
 *
 * Walks the list pairwise. When the following element has no 'type' and its
 * 'top' is smaller than the current element's, that following element's font
 * is stored. Nothing changes once a value has been stored.
 *
 * @param array $elements Rows or nodes exposing 'top', 'font' and optionally 'type'
 */
protected function setHeaderFooterThreshold($elements)
{
    foreach ($elements as $position => $row) {
        // A stored threshold is final; the remaining pairs cannot change it.
        if (isset($this->headerFontFooterThreshold)) {
            return;
        }

        if (! isset($elements[ $position + 1 ])) {
            continue;
        }

        $following = $elements[ $position + 1 ];

        if (isset($following[ 'type' ])) {
            continue;
        }

        if ($row[ 'top' ] > $following[ 'top' ]) {
            $this->headerFontFooterThreshold = $following[ 'font' ];
        }
    }
}
|
|
|
|
}
|