Repository for the search-and-displace ingest module, which takes ODF, DOCX and PDF documents and transforms them into Markdown (.md) for use with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

406 lines
14 KiB

<?php
namespace App\Parser;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use SimpleXMLElement;
class ParseXml
{
/**
 * Font value that marks a row as a title: handleElements() tags a row as 'title'
 * when the int-cast 'font' of the row equals this value.
 * Null until setTitleThreshold() assigns it.
 *
 * @var int|null
 */
private $titleFontThreshold;
/**
 * Font value recorded by setHeaderFooterThreshold() (stored without an int cast).
 * Null until that method assigns it.
 *
 * @var int|null
 */
private $headerFontFooterThreshold;
/**
 * Create a parser whose two font thresholds are both unset (null).
 *
 * The thresholds are assigned later by setTitleThreshold() and
 * setHeaderFooterThreshold().
 */
public function __construct()
{
    $this->titleFontThreshold = null;
    $this->headerFontFooterThreshold = null;
}
/**
 * Load an XML document and convert its <text> nodes into the nested row structure.
 *
 * A string argument is treated as a path on the "contracts" storage disk; the method
 * polls once per second until that path exists and then reads it. Any other argument
 * is passed to file_get_contents().
 *
 * @param mixed $xmlFile Path on the "contracts" disk (string) or a value readable by file_get_contents()
 *
 * @return mixed The array built by buildChildStructure(); an empty array when the
 *               document could not be read or parsed (the failure is logged).
 */
public function handle($xmlFile)
{
    if (is_string($xmlFile)) {
        try {
            $storageDisk = Storage::disk('contracts');
            // NOTE(review): this wait has no upper bound; if the file is never written
            // the call blocks forever. Consider a timeout once the producer's timing is known.
            while (! $storageDisk->exists($xmlFile)) {
                //Sleep if file not yet written
                sleep(1);
            }
            $file = $storageDisk->get($xmlFile);
        } catch (\Exception $exception) {
            Log::error('Failed to load the xml file '.$exception->getMessage());

            // Nothing was read, so there is nothing to parse.
            return [];
        }
    } else {
        $file = file_get_contents($xmlFile);
    }

    // file_get_contents() / the storage driver can hand back false or null instead of content.
    if (! is_string($file) || $file === '') {
        Log::error('Failed to load the xml file: no content could be read');

        return [];
    }

    // simplexml_load_string() returns false on malformed XML; calling xpath() on that would be fatal.
    $document = simplexml_load_string($file);
    if ($document === false) {
        Log::error('Failed to load the xml file: the content is not valid XML');

        return [];
    }

    $textNodes = $document->xpath('//text');
    if (! is_array($textNodes)) {
        $textNodes = [];
    }

    return $this->buildChildStructure($this->handleElements($textNodes));
}
/**
 * Turn a list of <text> nodes into flat "row" arrays, merging nodes that appear to
 * continue the same paragraph.
 *
 * Each entry that is a SimpleXMLElement starts a row; following nodes are appended to
 * that row while the continuation condition of the inner while-loop holds. Entries that
 * are not SimpleXMLElement instances are skipped. Every produced row has the keys
 * type, content, numbering, top, height, left, font and children (children is always
 * an empty array here).
 *
 * Side effects: calls setTitleThreshold() and writes 'top', 'height' and
 * 'row_numbering' onto the current node while merging.
 *
 * @param $element array of nodes, or a single value that is cast to an array
 *
 * @return mixed list of row arrays
 */
private function handleElements($element)
{
if (is_array($element)) {
$elements = $element;
} else {
$elements = (array) $element;
}
// The title font must be known before rows are typed below.
$this->setTitleThreshold($elements);
$numberOfNodes = count($elements);
$rows = [];
for ($i = 0; $i < $numberOfNodes; $i++) {
$current = $elements[ $i ];
$listContent = [];
if ($current instanceof SimpleXMLElement) {
$content = $this->getNodeContent($current);
$parentNumbering = [];
// Continuation test: keep merging the next node into the current row while either
//   (a) a next node exists AND
//         ( [ ( the next node is on the same line or within height + 3 below the current one
//               OR the next text starts with a lowercase letter
//               OR the next text does not start with ','
//               OR the current text ends with a lowercase letter )
//             AND the current content, with every 'and' / 'or' substring removed, does not
//                 end with one of ! . ? ; _ : )
//             AND the current content does not match the dash-digits-dash pattern
//             AND the next node does not start with <b>
//             AND the current text ends with a lowercase letter ]
//           OR the next node has the same 'top' as node $i ), or
//   (b) the next node's text is exactly '['.
// NOTE(review): "does not start with ','" is true for almost every node, so the first
// OR-group rarely filters anything - confirm this is intended.
while ($i + 1 <= $numberOfNodes && isset($elements[ $i + 1 ]) &&
(((((((int) $elements[ $i + 1 ][ 'top' ] === (int) $current[ 'top' ]) || (int) $elements[ $i + 1 ][ 'top' ] <= ((int) $current[ 'top' ] + (int) $current[ 'height' ] + 3)) && (int) $current[ 'top' ] <= (int) $elements[ $i + 1 ][ 'top' ])
|| (ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0,1)))
|| (! in_array(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0, 1), [',']))
|| (ctype_lower(substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1))))
&& ! in_array(substr(trim(str_replace(['and','or'], '', $content)), strlen(trim(str_replace(['and', 'or'], '', $content))) - 1),['!', '.', '?', ';', '_', ':', ')'])
&& ! preg_match('/^.*?\-[^\d]*(\d+)[^\d]*\-.*$/',$content)
&& (substr(trim($this->getNodeContent($elements[ $i + 1 ])), 0,strlen('<b>')) !== '<b>'
&& ctype_lower((substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1)))))
|| ((int) $elements[ $i ][ 'top' ] === (int) $elements[ $i + 1 ][ 'top' ]))
|| (isset($elements[ $i + 1 ]) && trim(strip_tags($this->getNodeContent($elements[ $i+1])))=='[')
) {
// Look for a numeric prefix in the content gathered so far. The content is entity-decoded,
// cut at the first '{', ')' is turned into '.', reduced to letters/digits/'.'/')', limited
// to its first five characters and finally reduced to digits/'.'/')'.
// This first pattern needs two consecutive numeric groups (presumably multi-level
// numbering such as "1.1" - TODO confirm).
preg_match('/^([-+]?\d*\.?\d+)(?:[-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/',
preg_replace('/[^0-9\.)]/', '', substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '',
preg_replace('/\)/', '.', preg_replace("/\{.+/", "", html_entity_decode($content))))),
0, 5)), $childNumbering);
if (! $childNumbering) {
// Fallback: a single numeric group on the same cleaned prefix.
preg_match('/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/', preg_replace('/[^0-9\.)]/', '',
substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '',
preg_replace('/\)/', '.', preg_replace("/\{.+/", "", html_entity_decode($content))))),
0, 5)), $parentNumbering);
}
// Append the next node's content and let the row take over its vertical position.
$nextElement = $elements[ $i + 1 ];
$nextElementContent = $this->getNodeContent($nextElement);
$content .= ' '.$nextElementContent;
$current[ 'top' ] = $nextElement[ 'top' ];
$current[ 'height' ] = $nextElement[ 'height' ];
if (count($parentNumbering)) {
// Single-group numbering found: store it, remove every occurrence of it from the
// content and stop merging after consuming the appended node.
$current[ 'row_numbering' ] = $parentNumbering[ 0 ];
$content = str_replace($current[ 'row_numbering' ], '', $content);
$i++;
break;
} elseif ($childNumbering) {
$current[ 'row_numbering' ] = $childNumbering[ 0 ];
$content = str_replace($current[ 'row_numbering' ], '', $content);
// Stop merging when the text is non-empty and ends neither with one of . : ! ? [ ,
// nor with a lowercase letter.
// NOTE(review): the last parenthesised test (first character of the appended node is
// "not lowercase OR not '[' / '{'") appears to be always true - confirm intent.
if (strlen(trim(strip_tags($content))) && ! in_array(substr(trim(strip_tags($content)),
strlen(trim(strip_tags($content))) - 1),
['.', ':', '!', '?','[',',']) && !ctype_lower(substr(trim(strip_tags($content)),
strlen(trim(strip_tags($content)))-1)) && (!ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[$i+1]))),
0, 1)) || !in_array(substr(trim(strip_tags($this->getNodeContent($elements[$i+1]))), 0, 1),
['[', '{']))) {
$i++;
break;
}
}
// A numbered row whose node $i contains only digits (after dropping non-alphanumerics)
// also ends the merge.
if( ! empty($current[ 'row_numbering' ]) && ctype_digit(trim(preg_replace("/[^0-9a-zA-Z]/",
"", strip_tags($this->getNodeContent($elements[$i])))))){
$i++;
break;
}
// The row keeps the 'font' of its first node; it is not updated from the appended node.
$i++;
continue;
}
// Split a leading numbering off the (possibly merged) content and wrap it in <p>.
$data = $this->extractNumbering($content);
$content = [
'type' => (int) $current[ 'font' ] === $this->titleFontThreshold ? 'title' : null,
'content' => $data[ 'content' ],
// NOTE(review): the int cast reduces a value such as "2.1" to 2 - confirm this is intended.
'numbering' => (! empty($current[ 'row_numbering' ])) ? (int)$current[ 'row_numbering' ] : $data[ 'numbering' ],
'top' => (int) $current[ 'top' ],
'height' => (int) $current[ 'height' ],
'left' => (int) $current[ 'left' ],
'font' => (int) $current[ 'font' ],
'children' => $listContent
];
$rows[] = $content;
}
}
return $rows;
}
/**
 * Return the inner markup of a <text> node with whitespace runs collapsed.
 *
 * The node is serialised with asXML() and the first "<text ...>...</text>" match is
 * taken; when nothing matches an empty string is used. Every run of whitespace in the
 * result is replaced by a single space.
 *
 * @param $node object exposing asXML()
 *
 * @return string|string[]|null
 */
private function getNodeContent($node)
{
    $inner = '';

    $found = preg_match_all("/<text.*?>(.*?)<\/text>/", $node->asXML(), $captures);
    if ($found && $captures[ 1 ]) {
        $inner = $captures[ 1 ][ 0 ];
    }

    return preg_replace('!\s+!', ' ', $inner);
}
/**
 * Split a leading numbering token off the content and wrap the remainder in <p>.
 *
 * Two patterns are tried in order, and the second only when the first does not match:
 *  1. one or more "<alphanumerics>." / "<alphanumerics>)" groups followed by a space
 *     or a letter, e.g. "1. Scope", "a) item";
 *  2. digits and dots ending in a digit, e.g. "1.1 Definitions".
 *
 * The numbering is the first capture group of the matching pattern, i.e. the complete
 * token. Only the leading occurrence is removed from the content, so later references
 * to the same number inside the text are left intact. A numbering of a single character
 * is ignored.
 *
 * NOTE(review): pattern 1 also accepts any leading word directly followed by '.' or ')'
 * (e.g. "Note. ..."); tighten it if that produces false positives.
 *
 * @param $content
 *
 * @return array ['content' => string, 'numbering' => string]
 */
private function extractNumbering($content)
{
    $patterns = [
        '/^(([a-zA-Z0-9]+[.\)])+)([ ]|[a-z]|[A-Z])/',
        '/^(([\d\.]+)\d)/',
    ];

    $numbering = '';
    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $content, $groups)) {
            // Group 1 is the whole numbering token; the trailing groups are only a
            // fragment of it or the character that follows it.
            $numbering = trim($groups[ 1 ]);
            break;
        }
    }

    if (strlen($numbering) > 1) {
        // Both patterns are anchored at the start, so the token is a prefix of the content.
        $remainder = (string) substr($content, strlen($numbering));

        return [
            'content'   => '<p>'.trim($remainder).'</p>',
            'numbering' => $numbering
        ];
    }

    return [
        'content'   => '<p>'.trim($content).'</p>',
        'numbering' => ''
    ];
}
/**
 * Build the structure as required by the editor and the gamification module.
 *
 * Flat rows are nested by indentation: a row collects the rows that follow it as
 * children (via handlePossibleChild()) as long as they are further to the right, or as
 * long as the row is a title and the following row has no type. Collection stops at the
 * next title on a different line (unless the current row's content is purely digits) or
 * at the first row that does not qualify.
 *
 * Rows without a type whose 'top' is below 100 are treated as headers, and those above
 * 1150 as footers; both are left out of the result.
 *
 * Every row, including the last one, is visited as a potential top-level entry, so a
 * trailing sibling or a single-row document is not lost.
 *
 * @param $elements list of row arrays as produced by handleElements()
 *
 * @return array
 */
private function buildChildStructure($elements)
{
    $total = count($elements);

    // Classify headers and footers once, up front, for rows that have no type yet.
    for ($k = 0; $k < $total; $k++) {
        if (isset($elements[ $k ][ 'type' ])) {
            continue;
        }
        if ($elements[ $k ][ 'top' ] < 100) {
            $elements[ $k ][ 'type' ] = 'header';
        } elseif ($elements[ $k ][ 'top' ] > 1150) {
            $elements[ $k ][ 'type' ] = 'footer';
        }
    }

    // Indexes already attached to a parent, kept as array keys for constant-time lookup.
    $consumed = [];
    $build = [];
    //   0   1    2     3    4     5   6
    //   1  1.1 1.1.1  1.2 1.2.1  1.3 1.3.1 2 3 4 4.1 4.2 5 6
    for ($i = 0; $i < $total; $i++) {
        if (isset($consumed[ $i ])) {
            continue;
        }
        if (isset($elements[ $i ][ 'type' ]) && in_array($elements[ $i ][ 'type' ], ['footer', 'header'], true)) {
            continue;
        }

        for ($j = $i + 1; $j < $total; $j++) {
            if (isset($consumed[ $j ])) {
                continue;
            }
            // Headers and footers in between are skipped without ending the group.
            if (isset($elements[ $j ][ 'type' ]) && in_array($elements[ $j ][ 'type' ], ['footer', 'header'], true)) {
                continue;
            }
            // The next title on another line closes the group, except when the current row
            // holds nothing but digits.
            if ($elements[ $j ][ 'type' ] === 'title'
                && $elements[ $i ][ 'top' ] !== $elements[ $j ][ 'top' ]
                && ! ctype_digit(trim(preg_replace("/[^0-9a-zA-Z]/", "", strip_tags($elements[ $i ][ 'content' ]))))
            ) {
                break;
            }

            $isIndented = $elements[ $i ][ 'left' ] < $elements[ $j ][ 'left' ];
            $isBodyUnderTitle = $elements[ $i ][ 'type' ] == 'title' && is_null($elements[ $j ][ 'type' ]);
            if (! $isIndented && ! $isBodyUnderTitle) {
                break;
            }

            $elements[ $i ] = $this->handlePossibleChild($elements[ $i ], $elements[ $j ]);
            $consumed[ $j ] = true;
        }

        $build[] = $elements[ $i ];
    }

    return $build;
}
/**
 * Attach $child somewhere below $parent and return the updated parent.
 *
 * Placement is decided by the 'left' offset:
 *  - the parent has no children yet: the child becomes its first child;
 *  - the child is further right than the parent's last child: it is placed (recursively)
 *    below that last child;
 *  - otherwise: it becomes a direct child of the parent. This includes a child that is
 *    further left than the last child, so it is kept instead of being discarded.
 *
 * Example of the intended nesting:
 *   1
 *     1.1
 *       1.1.1
 *   2
 *
 * @param $parent row array with a 'children' list
 * @param $child  row array
 *
 * @return mixed the parent row with the child attached
 */
protected function handlePossibleChild($parent, $child)
{
    $childCount = count($parent[ 'children' ]);
    if ($childCount === 0) {
        $parent[ 'children' ][] = $child;

        return $parent;
    }

    // Children are only ever appended, so the last one sits at index count - 1.
    $lastIndex = $childCount - 1;
    $lastParentChild = $parent[ 'children' ][ $lastIndex ];

    // Deeper than the last child: it is that child's descendant.
    if ($child[ 'left' ] > $lastParentChild[ 'left' ]) {
        $parent[ 'children' ][ $lastIndex ] = $this->handlePossibleChild($lastParentChild, $child);

        return $parent;
    }

    // Same level as, or shallower than, the last child: keep it as a direct child.
    $parent[ 'children' ][] = $child;

    return $parent;
}
/**
 * Set the title font threshold, once per parser instance.
 *
 * Does nothing when the threshold is already set. Otherwise the elements are scanned
 * in order, and the first element that has a successor and is either at index 0,
 * contains a <b> child, or has a smaller font value than its successor provides the
 * threshold (its 'font' cast to int). Because index 0 always qualifies, a zero-based
 * list with at least two elements takes the threshold from its first element.
 *
 * @param $elements list of nodes/rows exposing a 'font' value
 */
protected function setTitleThreshold($elements)
{
    if (isset($this->titleFontThreshold)) {
        return;
    }

    $total = count($elements);
    foreach ($elements as $index => $element) {
        // The last element has no successor to compare with.
        if ($index + 1 >= $total) {
            continue;
        }
        $nextElement = isset($elements[ $index + 1 ]) ? $elements[ $index + 1 ] : null;

        // The bold check reads the element being inspected; it previously referenced an
        // undefined variable and could never be true.
        if (isset($element->b)
            || $index == 0
            || (! is_null($nextElement) && (int) $element[ 'font' ] < (int) $nextElement[ 'font' ])
        ) {
            $this->titleFontThreshold = (int) $element[ 'font' ];

            return;
        }
    }
}
/**
 * Record the header/footer font threshold if it has not been recorded yet.
 *
 * Walks the elements pairwise. While the threshold is unset, the first pair in which
 * the following element has no 'type' and the current element's 'top' is greater than
 * the following element's 'top' stores the following element's 'font' as the threshold.
 *
 * @param $elements
 */
protected function setHeaderFooterThreshold($elements)
{
    foreach ($elements as $position => $row) {
        // Skip when there is no successor or the threshold is already known.
        if (! isset($elements[ $position + 1 ]) || isset($this->headerFontFooterThreshold)) {
            continue;
        }

        $following = $elements[ $position + 1 ];
        if (isset($following[ 'type' ])) {
            continue;
        }
        if (! ($row[ 'top' ] > $following[ 'top' ])) {
            continue;
        }

        $this->headerFontFooterThreshold = $following[ 'font' ];
    }
}
}