Repository for the search-and-displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) for use with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

406 lines
14 KiB

namespace App\Parser;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use SimpleXMLElement;
class ParseXml
/**
 * Font value that marks a row as a title; null until a threshold has been set.
 *
 * @var int|null
 */
private $titleFontThreshold;

/**
 * Font value recorded for header/footer detection; null until a threshold has been set.
 *
 * @var int|null
 */
private $headerFontFooterThreshold;
/**
 * ParseXml constructor.
 *
 * Both font thresholds start out unset.
 */
public function __construct()
{
    $this->titleFontThreshold = null;
    $this->headerFontFooterThreshold = null;
}
/**
 * Handle xml files.
 *
 * Loads the XML document, collects every <text> node and converts the nodes
 * into the nested row structure produced by buildChildStructure().
 *
 * A string argument is treated as a path on the "contracts" storage disk;
 * anything else is handed to file_get_contents(). When the document cannot
 * be loaded or parsed the problem is logged and an empty structure is
 * returned instead of dereferencing a missing document.
 *
 * @param mixed $xmlFile Path on the "contracts" disk, or a value accepted by file_get_contents()
 * @return mixed Nested rows, or an empty array when nothing could be loaded/parsed
 */
public function handle($xmlFile)
{
    $file = null;

    if (is_string($xmlFile)) {
        try {
            $storageDisk = Storage::disk('contracts');
            while (! $storageDisk->exists($xmlFile)) {
                // Sleep if file not yet written.
                // NOTE(review): the polling interval is an assumption, and the wait is
                // unbounded - consider a timeout if the producer can fail.
                sleep(1);
            }
            $file = $storageDisk->get($xmlFile);
        } catch (\Exception $exception) {
            Log::error('Failed to load the xml file '.$exception->getMessage());
        }
    } else {
        $file = file_get_contents($xmlFile);
    }

    // Loading failed (logged above) or produced nothing: there is no document to parse.
    if (! is_string($file) || $file === '') {
        return [];
    }

    $document = simplexml_load_string($file);
    if ($document === false) {
        Log::error('Failed to parse the xml file');

        return [];
    }

    // xpath() yields a non-array value on error; normalise that to "no nodes".
    $textNodes = $document->xpath('//text');

    return $this->buildChildStructure($this->handleElements(is_array($textNodes) ? $textNodes : []));
}
* @param $element
* @return mixed
// Converts the given XML node(s) into a flat list of row arrays.
//
// The argument is used as-is when it is an array and cast to an array otherwise.
// For every SimpleXMLElement the node content is read through getNodeContent(),
// following nodes are merged into it while the long heuristic condition below
// holds, and finally a row with the keys type / content / numbering / top /
// height / left / font / children is appended to $rows, which is returned.
//
// NOTE(review): this listing appears to have lost its brace-only lines (and possibly
// other very short statements), so the exact nesting of the blocks below cannot
// be read from here - confirm against the original file before changing logic.
private function handleElements($element)
if (is_array($element)) {
$elements = $element;
} else {
$elements = (array) $element;
//dd(!in_array(trim(last(explode(' ', strip_tags('modify or make additions to the {P1_Name} Software, except to the extent permitted by law; or')))),['and','or']),trim(last(explode(' ', strip_tags('modify or make additions to the {P1_Name} Software, except to the extent permitted by law; or')))));
$numberOfNodes = count($elements);
$rows = [];
for ($i = 0; $i < $numberOfNodes; $i++) {
$current = $elements[ $i ];
$listContent = [];
if ($current instanceof SimpleXMLElement) {
$content = $this->getNodeContent($current);
//if(strpos($content,'Provided that the Customer has continued to pay ')!==false){
// dd(($i + 1 <= $numberOfNodes && isset($elements[ $i + 1 ]) && (((int) $elements[ $i + 1 ][ 'top' ] === (int) $current[ 'top' ]) || (int) $elements[ $i + 1 ][ 'top' ] <= ((int) $current[ 'top' ] + (int) $current[ 'height' ] + 3)) && (int) $current[ 'top' ] <= (int) $elements[ $i + 1 ][ 'top' ])
// || (isset($elements[ $i + 1 ]) && ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0,1))), substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0,1))));
$parentNumbering = [];
// Merge loop: as long as a next node exists and the position / punctuation
// heuristics below hold, the next node's content is appended to $content.
// The heuristics compare the 'top' and 'height' attributes of both nodes and
// inspect the first character of the next node and the last character of the
// current content. A next node consisting only of '[' also satisfies the condition.
// NOTE(review): no statement advancing $i is visible inside this loop; it was
// presumably on one of the lost lines - confirm.
while ($i + 1 <= $numberOfNodes && isset($elements[ $i + 1 ]) &&
(((((((int) $elements[ $i + 1 ][ 'top' ] === (int) $current[ 'top' ]) || (int) $elements[ $i + 1 ][ 'top' ] <= ((int) $current[ 'top' ] + (int) $current[ 'height' ] + 3)) && (int) $current[ 'top' ] <= (int) $elements[ $i + 1 ][ 'top' ])
|| (ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0,1)))
|| (! in_array(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0, 1), [',']))
|| (ctype_lower(substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1))))
&& ! in_array(substr(trim(str_replace(['and','or'], '', $content)), strlen(trim(str_replace(['and', 'or'], '', $content))) - 1),['!', '.', '?', ';', '_', ':', ')'])
&& ! preg_match('/^.*?\-[^\d]*(\d+)[^\d]*\-.*$/',$content)
&& (substr(trim($this->getNodeContent($elements[ $i + 1 ])), 0,strlen('<b>')) !== '<b>'
&& ctype_lower((substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1)))))
|| ((int) $elements[ $i ][ 'top' ] === (int) $elements[ $i + 1 ][ 'top' ]))
|| (isset($elements[ $i + 1 ]) && trim(strip_tags($this->getNodeContent($elements[ $i+1])))=='[')
) {
// dd($parentNumbering,$content);
// NOTE(review): the opening of the next statement is not visible in this listing.
// The surviving tail cleans the first 5 characters of the decoded content and
// ends with $childNumbering, presumably the matches array of a preg_match()
// call - confirm against the original file.
preg_replace('/[^0-9\.)]/', '', substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '',
preg_replace('/\)/', '.', preg_replace("/\{.+/", "", html_entity_decode($content))))),
0, 5)), $childNumbering);
// Without a child numbering, look for a leading (optionally signed / exponent)
// number in the same cleaned 5-character prefix and store it in $parentNumbering.
if (! $childNumbering) {
preg_match('/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/', preg_replace('/[^0-9\.)]/', '',
substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '',
preg_replace('/\)/', '.', preg_replace("/\{.+/", "", html_entity_decode($content))))),
0, 5)), $parentNumbering);
//if($childNumbering && strpos($childNumbering[0],"2.1.5")!==false){
// dd(11,$content,$elements[$i],$i,$i+1);
// Append the next node's content and adopt its 'top' / 'height', so that the
// merged row carries the position of the node merged last.
$nextElement = $elements[ $i + 1 ];
$nextElementContent = $this->getNodeContent($nextElement);
$content .= ' '.$nextElementContent;
$current[ 'top' ] = $nextElement[ 'top' ];
$current[ 'height' ] = $nextElement[ 'height' ];
// Remember the detected numbering on the node and remove it from the text.
// str_replace() removes every occurrence of the numbering, not only the leading one.
if (count($parentNumbering)) {
$current[ 'row_numbering' ] = $parentNumbering[ 0 ];
$content = str_replace($current[ 'row_numbering' ], '', $content);
} elseif ($childNumbering) {
$current[ 'row_numbering' ] = $childNumbering[ 0 ];
$content = str_replace($current[ 'row_numbering' ], '', $content);
// The check below looks at the last character of the merged content and the first
// character of the next node.
// NOTE(review): the statement(s) it guards are not visible here.
if (strlen(trim(strip_tags($content))) && ! in_array(substr(trim(strip_tags($content)),
strlen(trim(strip_tags($content))) - 1),
['.', ':', '!', '?','[',',']) && !ctype_lower(substr(trim(strip_tags($content)),
strlen(trim(strip_tags($content)))-1)) && (!ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[$i+1]))),
0, 1)) || !in_array(substr(trim(strip_tags($this->getNodeContent($elements[$i+1]))), 0, 1),
['[', '{']))) {
// True when a numbering was recorded and the node at $i consists only of digits
// once every non-alphanumeric character is removed.
// NOTE(review): the body of this check is not visible either.
if( ! empty($current[ 'row_numbering' ]) && ctype_digit(trim(preg_replace("/[^0-9a-zA-Z]/",
"", strip_tags($this->getNodeContent($elements[$i])))))){
//$current[ 'font' ] = $nextElement[ 'font' ];
// Build the row. 'type' is 'title' only when the font equals the title threshold
// (strict comparison, so it never matches while the threshold is still null).
// NOTE(review): the (int) cast on 'row_numbering' keeps only the leading integer
// part of a dotted numbering such as "2.1" - confirm that this is intended.
$data = $this->extractNumbering($content);
$content = [
'type' => (int) $current[ 'font' ] === $this->titleFontThreshold ? 'title' : null,
'content' => $data[ 'content' ],
'numbering' => (! empty($current[ 'row_numbering' ])) ? (int)$current[ 'row_numbering' ] : $data[ 'numbering' ],
'top' => (int) $current[ 'top' ],
'height' => (int) $current[ 'height' ],
'left' => (int) $current[ 'left' ],
'font' => (int) $current[ 'font' ],
'children' => $listContent
$rows[] = $content;
return $rows;
/**
 * Returns the xml node content.
 *
 * Serialises the node, takes everything between the first <text ...> opening
 * tag and the following </text> (inline markup such as <b> is kept) and
 * collapses every run of whitespace into a single space.
 *
 * The pattern uses the "s" modifier so that content spanning several lines is
 * captured too; without it such a node produced an empty string even though
 * the whitespace collapsing below is meant to deal with line breaks.
 *
 * @param SimpleXMLElement $node
 * @return string|null Node content, '' when no <text> element is found
 */
private function getNodeContent($node)
{
    $xml = $node->asXML();

    if (! is_string($xml) || ! preg_match('/<text.*?>(.*?)<\/text>/s', $xml, $matches)) {
        return '';
    }

    return preg_replace('!\s+!', ' ', $matches[1]);
}
/**
 * Extract the numbering if exists from the string.
 *
 * Two leading formats are recognised, tried in this order:
 *  - one or more alphanumeric tokens each closed by "." or ")" and followed by
 *    a space or a letter, e.g. "1. ", "a) ", "1.2. ";
 *  - a dotted number ending in a digit, e.g. "1.2", "1.2.3".
 *
 * The numbering is always the first capture group. Taking the last group
 * returned the single character after the numbering for the first pattern
 * (so nothing was ever detected) and cut the final digit off for the second.
 * Only the leading numbering is removed from the content; identical text
 * further into the string is left alone.
 *
 * NOTE(review): the first pattern also matches a leading word that ends in "."
 * or ")" (for example "Definitions. "); tighten the pattern if that is unwanted.
 *
 * @param string $content
 * @return array Keys 'content' (text wrapped in <p>...</p>) and 'numbering' ('' when none found)
 */
private function extractNumbering($content)
{
    $patterns = [
        '/^(([a-zA-Z0-9]+[.\)])+)([ ]|[a-z]|[A-Z])/',
        '/^(([\d\.]+)\d)/',
    ];

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $content, $matches)) {
            // Group 1 starts at offset 0, so cutting its length strips exactly the prefix.
            $numbering = $matches[1];

            return [
                'content' => '<p>'.trim(substr($content, strlen($numbering))).'</p>',
                'numbering' => $numbering,
            ];
        }
    }

    return [
        'content' => '<p>'.trim($content).'</p>',
        'numbering' => '',
    ];
}
* Build the structure as required by the editor and the gamification module
* @param $elements
* @return array
// Turns the flat row list into a nested list.
//
// Rows whose 'type' is not set are tagged 'header' when 'top' is below 100 and
// 'footer' when it is above 1150. For a row $i, the following rows $j are
// attached through handlePossibleChild() when $j lies further right ('left')
// or when $i is a title and $j has no type; attached rows are remembered in
// $alreadyHandledIndexes. Rows that are neither header nor footer are appended
// to $build, which is returned.
//
// NOTE(review): this listing appears to have lost its brace-only lines and the
// bodies of several `if` statements (presumably continue / break), so the exact
// control flow cannot be read from here - confirm against the original file.
// NOTE(review): the outer loop stops at count($elements) - 1, so the final row is
// only ever visited as $j and is never appended to $build on its own - looks like
// an off-by-one; confirm.
private function buildChildStructure($elements)
$alreadyHandledIndexes = [];
$build = [];
// 0 1 2 3 4 5 6
// 1 1.1 1.1.1 1.2 1.2.1 1.3 1.3.1 2 3 4 4.1 4.2 5 6
for ($i = 0; $i < count($elements) - 1; $i++) {
// Tag the candidate parent as header / footer by its vertical position.
if (! isset($elements[ $i ][ 'type' ])) {
if ($elements[ $i ][ 'top' ] < 100) {
$elements[ $i ][ 'type' ] = 'header';
} elseif ($elements[ $i ][ 'top' ] > 1150) {
$elements[ $i ][ 'type' ] = 'footer';
// Checks for rows already attached to a parent and for header / footer rows;
// the statements they guard are not visible here.
if (in_array($i, $alreadyHandledIndexes)) {
if (isset($elements[ $i ][ 'type' ]) && in_array($elements[ $i ][ 'type' ], ['footer', 'header'])) {
// Scan the rows after $i for possible children.
for ($j = $i + 1; $j < count($elements); $j++) {
if (! isset($elements[ $j ][ 'type' ])) {
if ($elements[ $j ][ 'top' ] < 100) {
$elements[ $j ][ 'type' ] = 'header';
} elseif ($elements[ $j ][ 'top' ] > 1150) {
$elements[ $j ][ 'type' ] = 'footer';
if (in_array($j, $alreadyHandledIndexes)) {
if (isset($elements[ $j ][ 'type' ]) && in_array($elements[ $j ][ 'type' ], ['footer', 'header'])) {
// True when $j is a title on a different line than $i and $i's content is not
// purely numeric; the guarded statement is not visible here.
if ($elements[ $j ][ 'type' ] === 'title' && $elements[ $i ][ 'top' ] !== $elements[ $j ][ 'top' ] && ! ctype_digit(trim(preg_replace("/[^0-9a-zA-Z]/",
"", strip_tags($elements[ $i ][ 'content' ]))))) {
// $j becomes a descendant of $i when it is indented further, or when $i is a
// title and $j has no type.
if ($elements[ $i ][ 'left' ] < $elements[ $j ][ 'left' ] || ($elements[ $i ][ 'type' ] == 'title' && is_null($elements[ $j ][ 'type' ]))) {
$elements[ $i ] = $this->handlePossibleChild($elements[ $i ], $elements[ $j ]);
$alreadyHandledIndexes[] = $j;
} else {
// Header / footer rows are left out of the result.
if (! in_array($elements[ $i ][ 'type' ], ['header', 'footer'])) {
$build[] = $elements[ $i ];
$alreadyHandledIndexes[] = $i;
return $build;
/**
 * Attach a node to a parent, as a direct child or deeper in its last branch.
 *
 *   1
 *     1.1
 *       1.1.1
 *   2
 *
 * When the node is indented further ('left') than the parent's last child it
 * is handed down to that child recursively. In every other case it becomes a
 * direct child of the parent. This includes a node positioned left of the
 * last child, which used to be dropped silently although the caller already
 * counts it as handled.
 *
 * @param array $parent Row with at least the keys 'left' and 'children'
 * @param array $child  Row with at least the key 'left'
 * @return mixed The parent row with the child attached
 */
protected function handlePossibleChild($parent, $child)
{
    // 'children' is only ever appended to, so the last child sits at count - 1.
    $lastIndex = count($parent['children']) - 1;

    if ($lastIndex >= 0 && $child['left'] > $parent['children'][$lastIndex]['left']) {
        // Deeper than the last child: child or grandchild of that branch.
        $parent['children'][$lastIndex] = $this->handlePossibleChild($parent['children'][$lastIndex], $child);

        return $parent;
    }

    $parent['children'][] = $child;

    return $parent;
}
/**
 * Sets the title threshold.
 *
 * Walks the elements until a threshold has been stored. An element provides
 * the threshold (its 'font' as int) when it has a <b> child, when it is the
 * element at index 0, or when its font value is lower than that of the
 * element following it. The last element is never considered because it has
 * no successor to compare with.
 *
 * The <b> check reads the element being iterated; it used to read an
 * undefined variable and therefore could never be true.
 *
 * @param array $elements Elements exposing 'font' (SimpleXMLElement nodes or row arrays)
 */
protected function setTitleThreshold($elements)
{
    $total = count($elements);

    foreach ($elements as $index => $element) {
        if (isset($this->titleFontThreshold)) {
            // A threshold is only ever set once.
            return;
        }
        if ($index + 1 >= $total) {
            continue;
        }

        $nextElement = $elements[$index + 1];

        if (isset($element->b)
            || $index === 0
            || (! is_null($nextElement) && (int) $element['font'] < (int) $nextElement['font'])
        ) {
            $this->titleFontThreshold = (int) $element['font'];
        }
    }
}
* Set's the header and footer threshold
* @param $elements
protected function setHeaderFooterThreshold($elements)
foreach ($elements as $index => $element) {
if (isset($elements[ $index + 1 ]) && ! isset($this->headerFontFooterThreshold)) {
$nextElement = $elements[ $index + 1 ];
if (! isset($nextElement[ 'type' ]) && $element[ 'top' ] > $nextElement[ 'top' ]) {
$this->headerFontFooterThreshold = $nextElement[ 'font' ];
} else {