Repository for the search-and-displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) for use with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

670 lines
32 KiB

<?php
namespace App\Parser;
use Illuminate\Support\Facades\Log;
class ParseHtmlArray
{
/**
 * Load a JSON document export from disk and convert it into the structured
 * paragraph list produced by handle().
 *
 * The file is expected to contain a JSON array. A trailing comma left after
 * the last object of an array is removed before decoding, because
 * json_decode() rejects it.
 *
 * @param string $filePath Path of the JSON file to read.
 *
 * @return array|null The structured paragraphs, or null when the file is
 *                    missing, unreadable or does not decode to an array
 *                    (an error is logged in each of those cases).
 */
public function fromFile($filePath)
{
    if (! file_exists($filePath)) {
        Log::error('The given file does not exist!');

        return null;
    }

    $fileContent = file_get_contents($filePath);
    if ($fileContent === false) {
        Log::error('The given file could not be read!');

        return null;
    }

    // Drop a trailing comma between the last "}" and the closing "]".
    // A raw line break cannot occur inside a JSON string, so requiring one
    // keeps this from touching string contents while also covering CRLF
    // line endings and indented closing brackets.
    $normalized = preg_replace('/\},[ \t]*\r?\n\s*\]/', "}\n]", $fileContent);
    if ($normalized !== null) {
        $fileContent = $normalized;
    }

    $decoded = json_decode($fileContent, true);
    if (! is_array($decoded)) {
        Log::error('The given file does not contain a valid JSON array!');

        return null;
    }

    return $this->handle($decoded);
}
/**
 * Convert a decoded document (a list of sections) into the structured
 * paragraph list.
 *
 * Each section is flattened with handleTestHtml(); the flattened paragraphs
 * of all sections are concatenated in order and passed to buildTheStructure().
 *
 * @param array $docxAsHtmlArray Decoded document, one entry per section.
 *
 * @return array The entries returned by buildTheStructure().
 */
public function handle($docxAsHtmlArray)
{
    $chunks = [];
    foreach ($docxAsHtmlArray as $section) {
        $chunks[] = $this->handleTestHtml($section);
    }

    // Merge once at the end instead of re-copying the growing result on
    // every iteration. $chunks is a plain list, so unpacking it is safe.
    $paragraphs = $chunks ? array_merge(...$chunks) : [];

    return $this->buildTheStructure($paragraphs);
}
/**
 * Group a flat list of paragraphs into parent entries with nested children.
 *
 * The list is walked once. A paragraph that looks like a heading (starts with
 * <b>, contains exactly one <b>, carries a leading number, or has almost no
 * long words) becomes a parent; the paragraphs following it are attached to it
 * through handlePossibleChild() until one of the break rules below decides the
 * next paragraph starts a new entry. Detected leading numbers are stored under
 * the 'numbering' key of the paragraph.
 *
 * @param array $data Flat list of paragraphs, each an array with a 'content' key.
 *
 * @return array Parent entries (with 'children' / 'numbering' where detected)
 *               whose content is not empty after stripping tags.
 */
private function buildTheStructure($data)
{
    $response = [];
    // Set of consumed indices (index => true). The previous version appended
    // indices to a list and then tested them with array_key_exists(), which
    // checks positions rather than values.
    $alreadyHandled = [];
    $numbers = [];
    // Headings that always start a new entry. Built once: the list does not
    // depend on the loop. Values are compared as written.
    // NOTE(review): the entries containing "\’" and "\U2019" hold a literal
    // backslash sequence (single-quoted strings) and so probably never match
    // real content - confirm the intended spelling before changing them.
    $breakPoints = [
        'TERMS OF THE {P1_Pros}',
        'TERMS AND CONDITIONS',
        'BACKGROUND',
        'OPERATIVE PROVISIONS',
        'Products and/or Services',
        'PAYMENT',
        'GRANT OF LICENCE',
        'TERM OF LICENCE AGREEMENT',
        'ROYALTY',
        'PAYMENT',
        'PERFORMANCE TARGETS',
        'STATIONERY',
        'QUALITY CONTROL',
        'THE DISTRIBUTOR\'S OBLIGATIONS',
        'NON SOLICITATION',
        'SALE OF BUSINESS',
        'TERMINATION OF AGREEMENT',
        'CONDITIONS FOLLOWING TERMINATION',
        'RESTRAINT',
        'TIME OF ESSENCE AND NOTICES',
        'INTERPRETATION',
        'ARBITRATION',
        'DOMICILIUM AND REGISTERED OFFICE',
        'USE OF TRADE MARKS, TRADE NAME, GOODWILL AND KNOW-HOW',
        'GENERAL',
        'DESCRIPTION OF {P2_NAME} INFORMATION',
        'PAYMENT OF FEES',
        'SUPPLIER\'S STATUS',
        'SUPPLIER\’S OBLIGATIONS',
        'DEFINITIONS AND INTERPRETATION',
        'DEFINITIONS',
        'CONFIDENTIALITY',
        'TERMINATION',
        'RESTRICTIVE COVENANTS AND INTELLECTUAL PROPERTY',
        'DETAILS AND IDENTITY OF CONSULTANT',
        'ANTI-BRIBERY',
        'ASSIGNMENT SCHEDULE',
        'SCHEDULE 1',
        '{P1_NAME}\'S LIABILITY',
        'DURATION OF AGREEMENT AND SUPPLY',
        'SUPPLY OF HARDWARE',
        'SUPPLY OF SOFTWARE AND DOCUMENTATION',
        'SUPPLY OF SUPPORT SERVICES',
        'INTELLECTUAL PROPERTY RIGHTS',
        'THE CONTRACT',
        '{P1_NAME}\U2019S LIABILITY',
        'UPDATES',
        'TERMS OF THE {P1_NAME} PRODUCTS.',
        'CUSTOMER RESPONSIBILITIES',
        'EXHIBIT A',
        'EXHIBIT A-1',
        'EXHIBIT A-2',
        'WARRANTIES',
        'EXIT, TERMINATION AND SUSPENSION',
        'EXHIBIT B',
        'EXHIBIT B-1',
        'EXHIBIT B-2',
        'COUNTERPARTS',
        'LICENSE GRANT',
        'INDEMNIFICATION BY CUSTOMER',
        'TERMS OF THE {P1_NAME} PRODUCTS',
        'TERMS OF CLOUD SERVICE',
        'INDEMNIFICATION BY CUSTOMER',
        'TERMINATION',
        'TERMS OF THE {P1_PROS}',
        'SUPPORT',
        'SUB CONTRACTING AND THIRD PARTY RECOMMENDATIONS',
        'LICENCE AND ACCESS TO SOFTWARE AND HARDWARE',
        'DECLARATION OF NON-LIAISON AND ANTI-CORRUPTION COMMITMENT',
        '{P1_NAME}\'S DUTIES'
    ];
    $total = count($data);
    for ($i = 0; $i < $total; $i++) {
        if (isset($alreadyHandled[ $i ])) {
            continue;
        }
        $parent = $data[ $i ];
        // Extract a leading number from the first 5 cleaned characters of the content.
        preg_match('/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/', preg_replace('/[^0-9\.)]/', '',
            substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '', preg_replace('/\)/', '.',
                preg_replace("/\{.+/", "", html_entity_decode($data[ $i ][ 'content' ]))))), 0, 5)),
            $parentNumbering);
        if ($parentNumbering && count($numbers) == 0 && last($parentNumbering) < 5) {
            $numbers[] = $parentNumbering[ 0 ];
            $data[ $i ][ 'numbering' ] = rtrim($parentNumbering[ 0 ], '.');
        } elseif ($parentNumbering && count($numbers) > 0 && $parentNumbering[ 0 ] >= last($numbers)) {
            $numbers[] = $parentNumbering[ 0 ];
            $data[ $i ][ 'numbering' ] = rtrim($parentNumbering[ 0 ], '.');
        }
        // A paragraph may own children when it starts with bold, or has exactly
        // one bold run / a leading number and is not empty, or has fewer than
        // two words once words of four or more letters are removed.
        if ((strpos($parent[ 'content' ], "<b>") === 0 || (substr_count($parent[ 'content' ],
                    "<b>") == 1 || $parentNumbering) && strlen(trim(strip_tags($parent[ 'content' ]))) > 0) || (str_word_count(preg_replace('/[A-Za-z]{4,}/',
                '', strip_tags($data[ $i ][ 'content' ]))) < 2)) {
            $childNumbers = [];
            $j = $i + 1;
            // Only scan for children when a following, non-empty paragraph exists.
            if (isset($data[ $j ]) && strlen($data[ $j ][ 'content' ])) {
                for (; $j < $total; $j++) {
                    // Skip paragraphs that consist of a single non-breaking space.
                    // The previous comparison used '\u00a0' in single quotes, which
                    // is a literal six-character string and never matched.
                    if ($data[ $j ][ 'content' ] == "\xC2\xA0") {
                        $alreadyHandled[ $j ] = true;
                    }
                    if (isset($alreadyHandled[ $j ])) {
                        continue;
                    }
                    // Snapshot taken before this pass assigns a numbering to $data[$j].
                    $child = $data[ $j ];
                    preg_match('/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/',
                        substr(trim(urldecode(str_replace(['<b>', '</b>'], '',
                            strip_tags($data[ $j ][ 'content' ])))), 0, 5), $childNumbering);
                    if ($childNumbering && ! preg_match("/[a-z]/i", rtrim(trim($childNumbering[ 0 ])))) {
                        if ($childNumbering && count($childNumbers) == 0 && trim($childNumbering[ 0 ]) < 5) {
                            $childNumbers[] = trim($childNumbering[ 0 ]);
                            $data[ $j ][ 'numbering' ] = rtrim(trim($childNumbering[ 0 ]), '.');
                        } elseif ($childNumbering && count($childNumbers) > 0 && trim($childNumbering[ 0 ]) >= last($childNumbers)) {
                            $childNumbers[] = trim($childNumbering[ 0 ]);
                            $data[ $j ][ 'numbering' ] = rtrim(trim($childNumbering[ 0 ]), '.');
                        } elseif ($childNumbering && trim($childNumbering[ 0 ]) < 100) {
                            $childNumbers[] = trim($childNumbering[ 0 ]);
                            $data[ $j ][ 'numbering' ] = rtrim(trim($childNumbering[ 0 ]), '.');
                        }
                    }
                    // --- Rules that end the current parent before looking at attachment ---
                    if (empty(trim($data[ $i ][ 'content' ])) && isset($data[ $j ][ 'numbering' ])) {
                        break;
                    }
                    if ($this->paragraphBrake($data[ $j ], $breakPoints)) {
                        break;
                    }
                    if (substr(trim(str_replace(array_merge([')'], $childNumbering), '', $data[ $j ][ 'content' ])),
                            0, 3) == '<b>' && str_word_count(strip_tags(str_replace(array_merge([')'],
                            $childNumbering), '',
                            $data[ $j ][ 'content' ]))) == str_word_count($this->getTextBetweenTags(str_replace(array_merge([')',],
                            $childNumbering), '', $data[ $j ][ 'content' ]),
                            'b')) && (isset($data[ $j + 1 ]) && ((ctype_upper(substr($data[ $j + 1 ][ 'content' ],
                                    0,
                                    1)) || (isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ]) && $data[ $j ][ 'numbering' ] - $data[ $i ][ 'numbering' ] == 1))))) {
                        break;
                    }
                    if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ! isset($data[ $i ][ 'numbering' ]) && ctype_upper(str_replace(' ',
                            '', $data[ $j ][ 'content' ])) && str_word_count($data[ $j ][ 'content' ]) >= 1) {
                        break;
                    }
                    if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ! isset($data[ $i ][ 'numbering' ]) && ctype_upper(str_replace([
                            '<b>',
                            '</b>',
                            last($childNumbering),
                            last($childNumbering),
                            ')',
                            '.'
                        ], '', trim(str_replace(' ', '',
                            $data[ $j ][ 'content' ])))) && str_word_count($data[ $j ][ 'content' ]) >= 1) {
                        break;
                    }
                    if (isset($data[ $i ][ 'children' ]) && isset($data[ $i ][ 'numbering' ]) && count($data[ $i ][ 'children' ]) && isset($data[ $j ][ 'numbering' ]) && isset(last($data[ $i ][ 'children' ])[ 'numbering' ]) && ($data[ $j ][ 'numbering' ] - last($data[ $i ][ 'children' ])[ 'numbering' ] !== 1 && $data[ $i ][ 'numbering' ] < $data[ $j ][ 'numbering' ]) && ! in_array(substr(strip_tags(last($data[ $i ][ 'children' ])[ 'content' ]),
                            strlen(strip_tags(last($data[ $i ][ 'children' ])[ 'content' ])) - 1),
                            [':', '-']) && ! strpos($data[ $j ][ 'numbering' ], '.')) {
                        break;
                    }
                    // The parent itself is a known heading.
                    if (in_array(strtoupper(trim(str_replace([
                        '<b>',
                        '</b>',
                        last($parentNumbering),
                        last($parentNumbering),
                        ')',
                        '.'
                    ], '', strip_tags($data[ $i ][ 'content' ])))), $breakPoints)) {
                        if ((! isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ]) && (substr($data[ $i ][ 'content' ],
                                    0,
                                    3) != '<b>') || (str_word_count(strip_tags($data[ $i ][ 'content' ])) != str_word_count($this->getTextBetweenTags($data[ $i ][ 'content' ],
                                    'b'))))) {
                            if (! in_array($data[ $i ][ 'content' ], $breakPoints)) {
                                break;
                            }
                        }
                    }
                    // The candidate child is a known heading.
                    if (in_array(strtoupper(trim(str_replace([
                        '<b>',
                        '</b>',
                        last($childNumbering),
                        last($childNumbering),
                        ')',
                        '.'
                    ], '', strip_tags($data[ $j ][ 'content' ])))), $breakPoints)) {
                        break;
                    }
                    // --- Attachment rules: each branch either attaches $j to $i or ends the parent ---
                    if (in_array(substr(strip_tags($data[ $j ][ 'content' ]),
                        strlen(strip_tags($data[ $j ][ 'content' ])) - 1), [':', '-'])) {
                        $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                        $alreadyHandled[ $j ] = true;
                    } elseif (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ctype_lower(substr(last($data[ $i ][ 'children' ])[ 'content' ],
                            strlen(last($data[ $i ][ 'children' ])[ 'content' ]) - 1)) && ctype_lower(substr(trim($data[ $j ][ 'content' ]),
                            0, 1))) {
                        $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                        $alreadyHandled[ $j ] = true;
                    } elseif (str_word_count(preg_replace('/[A-Za-z]{4,}/', '',
                            strip_tags($data[ $j ][ 'content' ]))) < 3 && strlen(strip_tags($data[ $j ][ 'content' ])) && ! isset($data[ $j ][ 'numbering' ]) && ctype_upper(substr($data[ $j ][ 'content' ],
                            0, 1)) && str_word_count($data[ $j ][ 'content' ]) < 10) {
                        if (isset($data[ $i ][ 'children' ]) && ! in_array(substr(trim(last($data[ $i ][ 'children' ])[ 'content' ]),
                                strlen(trim(last($data[ $i ][ 'children' ])[ 'content' ])) - 1),
                                ['!', '.', '?', '_', '}'])) {
                            $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                            $alreadyHandled[ $j ] = true;
                        } else {
                            break;
                        }
                    } elseif (str_word_count(preg_replace('/[A-Za-z]{4,}/', '',
                            strip_tags($data[ $i ][ 'content' ]))) < 2 && strlen(strip_tags($data[ $i ][ 'content' ]))) {
                        if (isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ]) && is_numeric($data[ $j ][ 'numbering' ]) && abs($data[ $j ][ 'numbering' ] - $data[ $i ][ 'numbering' ]) == 1 && str_word_count($data[ $j ]
                            [ 'content' ]) < 6) {
                            break;
                        }
                        if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ((str_word_count($data[ $j ]
                                    [ 'content' ]) < 6) || (substr_count($data[ $j ][ 'content' ],
                                        '<b>') == 1 && substr_count(last($data[ $i ][ 'children' ])[ 'content' ],
                                        '<b>') == 0 && ! isset(last($data[ $i ][ 'children' ])[ 'numbering' ]))) && ctype_upper((substr($data[ $j ][ 'content' ],
                                0, 1)))) {
                            break;
                        }
                        if (isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ]) && $data[ $j ][ 'numbering' ] + 1 == $data[ $i ][ 'numbering' ] && str_word_count($data[ $j ][ 'content' ]) < 6) {
                            break;
                        }
                        if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ! isset($data[ $i ][ 'numbering' ]) && ! isset(last($data[ $i ][ 'children' ])[ 'numbering' ]) && isset($data[ $j ][ 'numbering' ])) {
                            break;
                        }
                        $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                        $alreadyHandled[ $j ] = true;
                    } elseif (! in_array(trim(strtolower(strip_tags($data[ $j ][ 'content' ]))),
                            ['definitions']) && ! ctype_space($data[ $j ][ 'content' ]) && strlen(trim(strip_tags($data[ $j ][ 'content' ]))) && ! isset($data[ $i ][ 'numbering' ]) && ! isset($data[ $j ][ 'numbering' ])) {
                        $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                        $alreadyHandled[ $j ] = true;
                    } elseif (isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ])) {
                        // Both paragraphs are numbered.
                        if (is_numeric($data[ $j ][ 'numbering' ]) && is_numeric($data[ $i ][ 'numbering' ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) == 1 && str_word_count($data[ $j ][ 'content' ]) < str_word_count($data[ $i ][ 'content' ])) {
                            break;
                        }
                        if (is_numeric($data[ $j ][ 'numbering' ]) && abs($data[ $j ][ 'numbering' ] - $data[ $i ][ 'numbering' ]) === 1 && (isset($data[ $i ][ 'children' ]) && (! (isset(last($data[ $i ][ 'children' ])[ 'numbering' ])) || (isset(last($data[ $i ][ 'children' ])[ 'numbering' ]) && abs(last($data[ $i ][ 'children' ])[ 'numbering' ] - $data[ $j ][ 'numbering' ]) !== 1))) && str_word_count($data[ $j ][ 'content' ]) < 8) {
                            break;
                        }
                        if (substr_count($data[ $j ][ 'numbering' ], '.') > substr_count($data[ $i ][ 'numbering' ],
                                '.') && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) < 1) {
                            $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                            $alreadyHandled[ $j ] = true;
                        } elseif (((float) $data[ $j ][ 'numbering' ] > (float) $data[ $i ][ 'numbering' ] && substr_count($data[ $j ][ 'content' ],
                                    '<b>') == 0 && substr_count($data[ $i ][ 'content' ],
                                    '<b>') == 1) || (substr_count($data[ $i ][ 'content' ],
                                    "<b>") == 1 && (substr_count($data[ $j ][ 'content' ],
                                        '<b>') == 0 || substr_count($data[ $j ][ 'content' ], '<b>')) > 1)) {
                            $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                            $alreadyHandled[ $j ] = true;
                        } elseif (substr_count($data[ $i ][ 'content' ],
                                '<b>') == 1 && str_word_count($data[ $j ][ 'content' ]) > 6 && isset($data[ $j ][ 'numbering' ])) {
                            // (A leftover dd() debug trap keyed on one specific sentence was removed here.)
                            if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ])) {
                                $lastParentChild = last($data[ $i ][ 'children' ]);
                                if (isset($lastParentChild[ 'numbering' ]) && abs($lastParentChild[ 'numbering' ] - $data[ $j ][ 'numbering' ]) === 1 && (substr_count($data[ $j ][ 'content' ],
                                            '<b>') == 1)) {
                                    break;
                                }
                            }
                            $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                            $alreadyHandled[ $j ] = true;
                        } elseif (isset($data[ $i ][ 'numbering' ]) && abs($data[ $i ][ 'numbering' ] - $data[ $j ][ 'numbering' ]) === 1 && str_word_count($data[ $j ][ 'content' ]) >= 6) {
                            $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                            $alreadyHandled[ $j ] = true;
                        } elseif (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && isset($data[ $j ][ 'numbering' ]) && isset(last($data[ $i ][ 'children' ])[ 'numbering' ]) && abs((float) $data[ $j ][ 'numbering' ] - (float) last($data[ $i ][ 'children' ])[ 'numbering' ]) == (float) 1) {
                            $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                            $alreadyHandled[ $j ] = true;
                        } elseif (isset($data[ $i ][ 'numbering' ]) && abs($data[ $i ][ 'numbering' ] - $data[ $j ][ 'numbering' ]) == 0 && str_word_count($data[ $j ][ 'content' ]) >= 6) {
                            $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                            $alreadyHandled[ $j ] = true;
                        } else {
                            break;
                        }
                    } elseif (isset($data[ $i ][ 'numbering' ]) && ! isset($data[ $j ][ 'numbering' ]) && str_word_count($data[ $j ][ 'content' ]) > 6) {
                        if (substr_count($data[ $j ][ 'content' ],
                                "<b>") == 1 && strpos(strtolower($data[ $i ][ 'content' ]),
                                'definition') === false) {
                            break;
                        }
                        $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                        $alreadyHandled[ $j ] = true;
                    } elseif (empty($data[ $j ][ 'content' ]) && (isset($data[ $j + 1 ]) && isset($data[ $j - 1 ]) && isset($data[ $i ][ 'children' ]))) {
                        // An empty paragraph ends the parent. The former inner condition
                        // required the content to be non-empty as well, so it could never
                        // be true and always fell through to this break.
                        break;
                    } elseif (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && isset($data[ $j ][ 'numbering' ])) {
                        $lastParentChild = last($data[ $i ][ 'children' ]);
                        if (isset($lastParentChild[ 'numbering' ]) && isset($child[ 'numbering' ]) && substr_count($lastParentChild[ 'numbering' ],
                                '.') > substr_count($data[ $j ][ 'numbering' ], '.')) {
                            // The numbering returns to a shallower level than the last child.
                            // This used to hit a dd() debug call; end the parent instead so
                            // the paragraph is processed as its own entry.
                            // NOTE(review): confirm that breaking (rather than attaching)
                            // is the intended handling of this case.
                            break;
                        } else {
                            $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
                            $alreadyHandled[ $j ] = true;
                        }
                    } else {
                        break;
                    }
                }
            }
        }
        // Keep the entry only when it has visible text.
        if (strlen(trim(strip_tags($data[ $i ][ 'content' ])))) {
            $response[] = $data[ $i ];
        }
        $alreadyHandled[ $i ] = true;
    }
    return $response;
}
/**
 * Attach $child to $parent and return the updated parent.
 *
 * Depending on punctuation, letter case and numbering of the parent's last
 * child, the new paragraph is appended as a sibling, nested under the last
 * child, or concatenated to the last child's text.
 *
 * @param array $parent Paragraph with 'content' and optional 'children' / 'numbering'.
 * @param array $child  Paragraph with 'content' and optional 'numbering'.
 *
 * @return array The updated parent; $child itself when the parent has no
 *               content; the unchanged parent when the child has no content.
 */
private function handlePossibleChild($parent, $child)
{
    if (empty($parent[ 'content' ]) && ! empty($child[ 'content' ])) {
        return $child;
    }
    if (empty($child[ 'content' ])) {
        return $parent;
    }
    // First child: always appended. The child content is known to be non-empty
    // here, so every branch of the former if/elseif chain did the same thing.
    if (! isset($parent[ 'children' ]) || count($parent[ 'children' ]) == 0) {
        $parent[ 'children' ] = [$child];

        return $parent;
    }
    $lastIndex = count($parent[ 'children' ]) - 1;
    $lastParentChild = last($parent[ 'children' ]);
    // The last child ends with ':' and the new paragraph continues it
    // (starts lower-case, or starts with a digit and has more than five words).
    if ($lastParentChild && substr($lastParentChild[ 'content' ],
            strlen($lastParentChild[ 'content' ]) - 1) === ':' && ((ctype_lower(substr($child[ 'content' ], 0,
                1)) || (ctype_digit(substr($child[ 'content' ], 0,
                    1)) && str_word_count($child[ 'content' ]) > 5)))) {
        $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);
        if (isset($lastParentChild[ 'numbering' ]) && isset($child[ 'numbering' ]) && $child[ 'numbering' ] - 1 == $lastParentChild[ 'numbering' ]) {
            // Consecutive numbering: keep the paragraph as a sibling instead.
            $parent[ 'children' ][] = $child;
        } else {
            $parent[ 'children' ][ $lastIndex ] = $lastParentChild;
        }
        return $parent;
    }
    // The new paragraph has a longer numbering string than the last child.
    if (isset($lastParentChild[ 'numbering' ]) && isset($child[ 'numbering' ]) && strlen($child[ 'numbering' ]) > strlen($lastParentChild[ 'numbering' ])) {
        if (isset($parent[ 'children' ]) && isset(last($parent[ 'children' ])[ 'numbering' ]) && $child[ 'numbering' ]) {
            if (is_numeric($child[ 'numbering' ]) && abs($child[ 'numbering' ] - $lastParentChild[ 'numbering' ]) === 1) {
                $parent[ 'children' ][] = $child;
                return $parent;
            }
        }
        // Same number of dots: same nesting level, so append as a sibling.
        if (isset($child[ 'numbering' ]) && isset($lastParentChild[ 'numbering' ]) && substr_count($lastParentChild[ 'numbering' ],
                '.') == substr_count($child[ 'numbering' ], '.')) {
            $parent[ 'children' ][] = $child;
            return $parent;
        }
        $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);
        $parent[ 'children' ][ $lastIndex ] = $lastParentChild;
        return $parent;
    }
    if (! in_array(substr(trim(str_replace(['and', 'or'], '', $lastParentChild[ 'content' ])),
            strlen(trim(str_replace(['and', 'or'], '', $lastParentChild[ 'content' ]))) - 1),
            ['!', '.', '?', ';', '_', ':']) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
                1)) || ((ctype_upper(substr(trim($child[ 'content' ]), 0,
                    1)) && ! isset($child[ 'numbering' ]))))) {
        // The last child does not end a sentence: treat the paragraph as its
        // continuation. (A leftover dd() debug trap was removed here.)
        $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
        $parent[ 'children' ][ $lastIndex ] = $lastParentChild;
        return $parent;
    } elseif (! in_array(substr(trim($parent[ 'content' ]), strlen(trim($parent[ 'content' ])) - 1),
            ['!', '.', '?', ';']) && ctype_lower(substr(trim($lastParentChild[ 'content' ]),
            strlen(trim($lastParentChild[ 'content' ])) - 1)) && ctype_lower(substr(trim($child[ 'content' ]), 0,
            1))) {
        $parent[ 'children' ][] = $child;
    } elseif (! in_array(substr(trim(str_replace(['and', 'or'], '', $lastParentChild[ 'content' ])),
            strlen(trim(str_replace(['and', 'or'], '', $lastParentChild[ 'content' ]))) - 1), [
            '!',
            '.',
            '?',
            ';',
            '_',
            ':'
        ]) && isset($lastParentChild[ 'numbering' ]) && isset($child[ 'numbering' ]) && $lastParentChild[ 'numbering' ] > $child[ 'numbering' ]) {
        // Nest under the last child.
        $lastParentChild[ 'children' ][] = $child;
        $parent[ 'children' ][ $lastIndex ] = $lastParentChild;
    } else {
        $parent[ 'children' ][] = $child;
    }
    return $parent;
}
/**
 * Flatten one section of the decoded document into a list of paragraphs.
 *
 * Every item of the section is converted with buildParagraphs(). An item that
 * has exactly one element, itself an array, is treated as a wrapper: the
 * method recurses into it and returns that result immediately.
 *
 * NOTE(review): the early return drops paragraphs collected so far, and an
 * item that yields a single paragraph replaces the collected list instead of
 * extending it - confirm against the input format whether this is intended.
 *
 * @param array $array Section to flatten; its items are passed to count().
 *
 * @return array List of ['content' => string] paragraphs.
 */
public function handleTestHtml($array)
{
    $collected = [];
    foreach ($array as $node) {
        $wrapsSingleArray = count($node) == 1 && is_array(last($node));
        if ($wrapsSingleArray) {
            return $this->handleTestHtml($node);
        }

        $paragraphs = $this->buildParagraphs($node);
        if (! isset($collected[ 'content' ]) && count($paragraphs) > 1) {
            $collected = array_merge($collected, $paragraphs);
            continue;
        }
        if ($paragraphs) {
            $collected = $paragraphs;
        }
    }

    return $collected;
}
/**
 * Turn a (possibly nested) array of HTML fragments into a flat list of
 * cleaned paragraphs.
 *
 * For every non-blank string fragment: whitespace runs are collapsed to a
 * single space, <font> tags are removed, elements whose body is only blank
 * entities / "<br />" are dropped, "<b> </b>" is removed and HTML entities
 * are decoded. Nested arrays are flattened recursively, in order.
 *
 * @param array $paragraphs Fragments (strings) and/or nested arrays of fragments.
 *
 * @return array List of ['content' => string] entries; fragments that are
 *               empty after cleaning are skipped.
 */
private function buildParagraphs($paragraphs)
{
    $result = [];
    // Iterate by value so arrays with non-sequential or string keys are
    // handled too; the former index-based loop assumed keys 0..n-1.
    foreach ($paragraphs as $paragraph) {
        if (is_array($paragraph)) {
            $result = array_merge($result, $this->buildParagraphs($paragraph));
            continue;
        }
        if (! strlen($paragraph) || ctype_space($paragraph)) {
            continue;
        }
        $cleanHtml = trim(str_replace('<b> </b>', '',
            preg_replace('/<([^>\s]+)[^>]*>(?:\s*(?:<br \/>|&nbsp;|&thinsp;|&ensp;|&emsp;|&#8201;|&#8194;|&#8195;)\s*)*<\/\1>/',
                '', preg_replace('/(<font[^>]*>)|(<\/font>)/', '', preg_replace('/\s+/S', " ", $paragraph)))));
        if (! empty($cleanHtml)) {
            $result[] = ['content' => html_entity_decode($cleanHtml, ENT_COMPAT | ENT_HTML401, 'UTF-8')];
        }
    }
    return $result;
}
/**
 * Return the text enclosed by the given HTML tag.
 *
 * <u> and </u> are stripped from the input first. The pattern is greedy, so
 * when the tag occurs several times the text of the last occurrence is the
 * one captured.
 *
 * @param string $string  HTML fragment to search.
 * @param string $tagname Tag name without angle brackets, e.g. 'b'.
 *
 * @return string The captured text, or '' when the tag pair is not found.
 */
private function getTextBetweenTags($string, $tagname)
{
    $withoutUnderline = str_replace(['<u>', '</u>'], '', $string);
    preg_match("/<$tagname ?.*>(.*)<\/$tagname>/", $withoutUnderline, $groups);

    // The pattern has a single capture group, so it is also the last match entry.
    return $groups ? $groups[ 1 ] : '';
}
/**
 * Tell whether a paragraph opens with a bold heading listed in $breakPoints.
 *
 * When the paragraph carries a 'numbering', every '.' and ')' is removed from
 * the content and then the first occurrence of the numbering is removed.
 * The text before the single closing </b> (with <b> removed and trimmed) is
 * then compared, as written, with the break points.
 *
 * Only the first occurrence of the numbering is stripped: removing every
 * occurrence also mangled headings that contain the same digits
 * (e.g. "1) <b>SCHEDULE 1</b>"), so they could never match.
 *
 * @param array $paragraph   Paragraph with 'content' and optional 'numbering'.
 * @param array $breakPoints Heading texts that start a new entry.
 *
 * @return bool True when the content has exactly one </b> and the heading in
 *              front of it is one of the break points.
 */
private function paragraphBrake($paragraph, array $breakPoints)
{
    $content = $paragraph[ 'content' ];
    if (isset($paragraph[ 'numbering' ])) {
        $content = str_replace(['.', ')'], '', $content);
        $numbering = (string) $paragraph[ 'numbering' ];
        $position = $numbering === '' ? false : strpos($content, $numbering);
        if ($position !== false) {
            $content = substr_replace($content, '', $position, strlen($numbering));
        }
    }
    if (substr_count($content, '</b>') !== 1) {
        return false;
    }
    $heading = explode('</b>', $content)[ 0 ];
    $heading = trim(str_replace('<b>', '', trim($heading)));

    return in_array($heading, $breakPoints, true);
}
}