You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
670 lines
32 KiB
670 lines
32 KiB
<?php
|
|
|
|
namespace App\Parser;
|
|
|
|
use Illuminate\Support\Facades\Log;
|
|
|
|
class ParseHtmlArray
|
|
{
|
|
|
|
/**
 * Read a JSON export from disk and convert it into the nested paragraph structure.
 *
 * The file is expected to hold a JSON array. A trailing comma left after the
 * last object of an array (`},` followed by a line break and `]`) is removed
 * before decoding, because json_decode() rejects it.
 *
 * Every failure (missing file, unreadable file, content that does not decode
 * to an array) is logged and yields null instead of being passed on to handle().
 *
 * @param string $filePath Path of the JSON file to parse.
 *
 * @return array|null The structure built by handle(), or null on failure.
 */
public function fromFile($filePath)
{
    if (! file_exists($filePath)) {
        Log::error('The given file dose not exists!');

        return null;
    }

    $fileContent = file_get_contents($filePath);
    if ($fileContent === false) {
        Log::error('The given file could not be read!');

        return null;
    }

    // Drop the trailing comma in "},<newline>]". The match must span a real line
    // break, which cannot occur inside a JSON string value, so only structural
    // commas are touched. The original whitespace is kept via the capture group.
    $withoutTrailingComma = preg_replace('/\},(\s*\n\s*)\]/', '}$1]', $fileContent);
    if ($withoutTrailingComma !== null) {
        $fileContent = $withoutTrailingComma;
    }

    $decoded = json_decode($fileContent, true);
    if (! is_array($decoded)) {
        Log::error('The given file does not contain a valid JSON array: '.json_last_error_msg());

        return null;
    }

    return $this->handle($decoded);
}
|
|
|
|
|
|
|
|
|
|
/**
 * Flatten every top-level entry of the decoded document into one paragraph
 * list and build the parent/children structure from it.
 *
 * @param array $docxAsHtmlArray Decoded document; each entry is passed to handleTestHtml().
 *
 * @return array Result of buildTheStructure() for the merged paragraph list.
 */
public function handle($docxAsHtmlArray)
{
    $chunks = [];

    foreach ($docxAsHtmlArray as $section) {
        $chunks[] = $this->handleTestHtml($section);
    }

    // The leading empty array keeps the call valid when there are no chunks at all.
    $paragraphs = array_merge([], ...$chunks);

    return $this->buildTheStructure($paragraphs);
}
|
|
|
|
|
|
/**
 * Fold a flat list of paragraphs into a list of parents with nested children.
 *
 * Each unhandled paragraph that looks like a heading (starts with <b>, has a
 * single <b> or a leading number, or consists of very few short words) becomes
 * a parent. The paragraphs that follow are attached to it through
 * handlePossibleChild() until one of the many "break" heuristics decides that a
 * new section starts. Paragraphs whose tag-stripped content is empty are not
 * emitted.
 *
 * Changes compared with the previous revision:
 *  - debugging dd() calls that aborted the request were removed;
 *  - handled paragraphs are tracked in an index-keyed set (the old code appended
 *    indexes to a list but then tested list positions with array_key_exists());
 *  - the loop-invariant break-point list and count($data) are computed once.
 *
 * @param array $data List of ['content' => string] entries in document order.
 *
 * @return array Top-level entries, each optionally carrying 'numbering' and 'children'.
 */
private function buildTheStructure($data)
{
    // Known section headings that always terminate the current parent.
    // NOTE(review): array_change_key_case() only affects string keys, so on this
    // list it is a no-op and the values keep their original case. It is kept so
    // that the comparisons below behave exactly as before.
    $breakPoints = array_change_key_case([
        'TERMS OF THE {P1_Pros}',
        'TERMS AND CONDITIONS',
        'BACKGROUND',
        'OPERATIVE PROVISIONS',
        'Products and/or Services',
        'PAYMENT',
        'GRANT OF LICENCE',
        'TERM OF LICENCE AGREEMENT',
        'ROYALTY',
        'PAYMENT',
        'PERFORMANCE TARGETS',
        'STATIONERY',
        'QUALITY CONTROL',
        'THE DISTRIBUTOR\'S OBLIGATIONS',
        'NON SOLICITATION',
        'SALE OF BUSINESS',
        'TERMINATION OF AGREEMENT',
        'CONDITIONS FOLLOWING TERMINATION',
        'RESTRAINT',
        'TIME OF ESSENCE AND NOTICES',
        'INTERPRETATION',
        'ARBITRATION',
        'DOMICILIUM AND REGISTERED OFFICE',
        'USE OF TRADE MARKS, TRADE NAME, GOODWILL AND KNOW-HOW',
        'GENERAL',
        'DESCRIPTION OF {P2_NAME} INFORMATION',
        'PAYMENT OF FEES',
        'SUPPLIER\'S STATUS',
        'SUPPLIER\’S OBLIGATIONS',
        'DEFINITIONS AND INTERPRETATION',
        'DEFINITIONS',
        'CONFIDENTIALITY',
        'TERMINATION',
        'RESTRICTIVE COVENANTS AND INTELLECTUAL PROPERTY',
        'DETAILS AND IDENTITY OF CONSULTANT',
        'ANTI-BRIBERY',
        'ASSIGNMENT SCHEDULE',
        'SCHEDULE 1',
        '{P1_NAME}\'S LIABILITY',
        'DURATION OF AGREEMENT AND SUPPLY',
        'SUPPLY OF HARDWARE',
        'SUPPLY OF SOFTWARE AND DOCUMENTATION',
        'SUPPLY OF SUPPORT SERVICES',
        'INTELLECTUAL PROPERTY RIGHTS',
        'THE CONTRACT',
        '{P1_NAME}\U2019S LIABILITY',
        'UPDATES',
        'TERMS OF THE {P1_NAME} PRODUCTS.',
        'CUSTOMER RESPONSIBILITIES',
        'EXHIBIT A',
        'EXHIBIT A-1',
        'EXHIBIT A-2',
        'WARRANTIES',
        'EXIT, TERMINATION AND SUSPENSION',
        'EXHIBIT B',
        'EXHIBIT B-1',
        'EXHIBIT B-2',
        'COUNTERPARTS',
        'LICENSE GRANT',
        'INDEMNIFICATION BY CUSTOMER',
        'TERMS OF THE {P1_NAME} PRODUCTS',
        'TERMS OF CLOUD SERVICE',
        'INDEMNIFICATION BY CUSTOMER',
        'TERMINATION',
        'TERMS OF THE {P1_PROS}',
        'SUPPORT',
        'SUB CONTRACTING AND THIRD PARTY RECOMMENDATIONS',
        'LICENCE AND ACCESS TO SOFTWARE AND HARDWARE',
        'DECLARATION OF NON-LIAISON AND ANTI-CORRUPTION COMMITMENT',
        '{P1_NAME}\'S DUTIES'
    ], CASE_UPPER);

    // Leading number such as "2", "2.1" or ".5", optionally with an exponent.
    $numberPattern = '/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/';

    $response = [];
    $alreadyHandled = []; // set of paragraph indexes: index => true
    $numbers = [];
    $total = count($data);

    for ($i = 0; $i < $total; $i++) {
        if (isset($alreadyHandled[$i])) {
            continue;
        }

        $parent = $data[$i];

        // Extract the parent numbering from the first 5 significant characters:
        // cut everything from the first "{" on, turn ")" into ".", keep only
        // letters/digits/dots/parentheses, then keep only digits and dots.
        $parentText = preg_replace("/\{.+/", "", html_entity_decode($data[$i]['content']));
        $parentText = preg_replace('/\)/', '.', $parentText);
        $parentText = preg_replace('/[^A-Za-z0-9.)]/', '', $parentText);
        $parentPrefix = preg_replace('/[^0-9\.)]/', '', substr(trim($parentText), 0, 5));
        preg_match($numberPattern, $parentPrefix, $parentNumbering);

        // Accept the number when it starts the sequence (< 5) or does not go
        // backwards relative to the last accepted top-level number.
        if ($parentNumbering && (
            (count($numbers) == 0 && last($parentNumbering) < 5)
            || (count($numbers) > 0 && $parentNumbering[0] >= last($numbers))
        )) {
            $numbers[] = $parentNumbering[0];
            $data[$i]['numbering'] = rtrim($parentNumbering[0], '.');
        }

        // A paragraph may own children when it starts with bold, or has exactly
        // one bold / a number and visible text, or has fewer than two short words.
        if ((strpos($parent['content'], "<b>") === 0 || (substr_count($parent['content'], "<b>") == 1 || $parentNumbering) && strlen(trim(strip_tags($parent['content']))) > 0) || (str_word_count(preg_replace('/[A-Za-z]{4,}/', '', strip_tags($data[$i]['content']))) < 2)) {
            $childNumbers = [];
            $j = $i + 1;

            // Only look for children when a non-empty next paragraph exists.
            if (isset($data[$j]) && strlen($data[$j]['content'])) {
                for ($j; $j < $total; $j++) {
                    // NOTE(review): in single quotes this is the literal six-character
                    // text \u00a0, not a non-breaking space; left unchanged — confirm
                    // which one the input actually contains.
                    if ($data[$j]['content'] == '\u00a0') {
                        $alreadyHandled[$j] = true;
                    }
                    if (isset($alreadyHandled[$j])) {
                        continue;
                    }

                    preg_match(
                        $numberPattern,
                        substr(trim(urldecode(str_replace(['<b>', '</b>'], '', strip_tags($data[$j]['content'])))), 0, 5),
                        $childNumbering
                    );

                    if ($childNumbering && ! preg_match("/[a-z]/i", rtrim(trim($childNumbering[0])))) {
                        $childNumber = trim($childNumbering[0]);

                        if ((count($childNumbers) == 0 && $childNumber < 5)
                            || (count($childNumbers) > 0 && $childNumber >= last($childNumbers))
                            || $childNumber < 100
                        ) {
                            $childNumbers[] = $childNumber;
                            $data[$j]['numbering'] = rtrim($childNumber, '.');
                        }
                    }

                    if (empty(trim($data[$i]['content'])) && isset($data[$j]['numbering'])) {
                        break;
                    }

                    if ($this->paragraphBrake($data[$j], $breakPoints)) {
                        break;
                    }

                    // Fully bold candidate followed by a capitalised paragraph, or
                    // numbered exactly one above the parent: it is a new heading.
                    $strippedChild = str_replace(array_merge([')'], $childNumbering), '', $data[$j]['content']);
                    if (substr(trim($strippedChild), 0, 3) == '<b>'
                        && str_word_count(strip_tags($strippedChild)) == str_word_count($this->getTextBetweenTags($strippedChild, 'b'))
                        && (isset($data[$j + 1]) && ((ctype_upper(substr($data[$j + 1]['content'], 0, 1))
                            || (isset($data[$i]['numbering']) && isset($data[$j]['numbering']) && $data[$j]['numbering'] - $data[$i]['numbering'] == 1))))
                    ) {
                        break;
                    }

                    // Un-numbered parent that already has children meets an all-caps line.
                    if (isset($data[$i]['children']) && count($data[$i]['children']) && ! isset($data[$i]['numbering']) && ctype_upper(str_replace(' ', '', $data[$j]['content'])) && str_word_count($data[$j]['content']) >= 1) {
                        break;
                    }

                    // Same check, ignoring bold tags, the numbering and punctuation.
                    if (isset($data[$i]['children']) && count($data[$i]['children']) && ! isset($data[$i]['numbering']) && ctype_upper(str_replace([
                        '<b>',
                        '</b>',
                        last($childNumbering),
                        last($childNumbering),
                        ')',
                        '.'
                    ], '', trim(str_replace(' ', '', $data[$j]['content'])))) && str_word_count($data[$j]['content']) >= 1) {
                        break;
                    }

                    // Numbered parent with numbered children: a whole-number candidate
                    // that does not directly continue the last child ends the parent,
                    // unless that last child ends with ":" or "-".
                    if (isset($data[$i]['children']) && isset($data[$i]['numbering']) && count($data[$i]['children']) && isset($data[$j]['numbering']) && isset(last($data[$i]['children'])['numbering']) && ($data[$j]['numbering'] - last($data[$i]['children'])['numbering'] !== 1 && $data[$i]['numbering'] < $data[$j]['numbering']) && ! in_array(substr(strip_tags(last($data[$i]['children'])['content']), -1), [':', '-']) && ! strpos($data[$j]['numbering'], '.')) {
                        break;
                    }

                    // Parent text is a known break point.
                    if (in_array(strtoupper(trim(str_replace([
                        '<b>',
                        '</b>',
                        last($parentNumbering),
                        last($parentNumbering),
                        ')',
                        '.'
                    ], '', strip_tags($data[$i]['content'])))), $breakPoints)) {
                        if ((! isset($data[$i]['numbering']) && isset($data[$j]['numbering']) && (substr($data[$i]['content'], 0, 3) != '<b>') || (str_word_count(strip_tags($data[$i]['content'])) != str_word_count($this->getTextBetweenTags($data[$i]['content'], 'b'))))) {
                            if (! in_array($data[$i]['content'], $breakPoints)) {
                                break;
                            }
                        }
                    }

                    // Candidate text is a known break point.
                    if (in_array(strtoupper(trim(str_replace([
                        '<b>',
                        '</b>',
                        last($childNumbering),
                        last($childNumbering),
                        ')',
                        '.'
                    ], '', strip_tags($data[$j]['content'])))), $breakPoints)) {
                        break;
                    }

                    if (in_array(substr(strip_tags($data[$j]['content']), -1), [':', '-'])) {
                        // Candidate introduces a list ("...:" / "...-").
                        $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                        $alreadyHandled[$j] = true;
                    } elseif (isset($data[$i]['children']) && count($data[$i]['children']) && ctype_lower(substr(last($data[$i]['children'])['content'], -1)) && ctype_lower(substr(trim($data[$j]['content']), 0, 1))) {
                        // Last child ends and candidate starts in lower case: a continuation.
                        $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                        $alreadyHandled[$j] = true;
                    } elseif (str_word_count(preg_replace('/[A-Za-z]{4,}/', '', strip_tags($data[$j]['content']))) < 3 && strlen(strip_tags($data[$j]['content'])) && ! isset($data[$j]['numbering']) && ctype_upper(substr($data[$j]['content'], 0, 1)) && str_word_count($data[$j]['content']) < 10) {
                        // Short capitalised un-numbered line: keep it only when the last
                        // child does not look finished.
                        if (isset($data[$i]['children']) && ! in_array(substr(trim(last($data[$i]['children'])['content']), -1), ['!', '.', '?', '_', '}'])) {
                            $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                            $alreadyHandled[$j] = true;
                        } else {
                            break;
                        }
                    } elseif (str_word_count(preg_replace('/[A-Za-z]{4,}/', '', strip_tags($data[$i]['content']))) < 2 && strlen(strip_tags($data[$i]['content']))) {
                        // Parent is a short heading.
                        if (isset($data[$i]['numbering']) && isset($data[$j]['numbering']) && is_numeric($data[$j]['numbering']) && abs($data[$j]['numbering'] - $data[$i]['numbering']) == 1 && str_word_count($data[$j]['content']) < 6) {
                            break;
                        }
                        if (isset($data[$i]['children']) && count($data[$i]['children']) && ((str_word_count($data[$j]['content']) < 6) || (substr_count($data[$j]['content'], '<b>') == 1 && substr_count(last($data[$i]['children'])['content'], '<b>') == 0 && ! isset(last($data[$i]['children'])['numbering']))) && ctype_upper((substr($data[$j]['content'], 0, 1)))) {
                            break;
                        }
                        if (isset($data[$i]['numbering']) && isset($data[$j]['numbering']) && $data[$j]['numbering'] + 1 == $data[$i]['numbering'] && str_word_count($data[$j]['content']) < 6) {
                            break;
                        }
                        if (isset($data[$i]['children']) && count($data[$i]['children']) && ! isset($data[$i]['numbering']) && ! isset(last($data[$i]['children'])['numbering']) && isset($data[$j]['numbering'])) {
                            break;
                        }

                        $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                        $alreadyHandled[$j] = true;
                    } elseif (! in_array(trim(strtolower(strip_tags($data[$j]['content']))), ['definitions']) && ! ctype_space($data[$j]['content']) && strlen(trim(strip_tags($data[$j]['content']))) && ! isset($data[$i]['numbering']) && ! isset($data[$j]['numbering'])) {
                        // Neither side is numbered and the candidate has visible text.
                        $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                        $alreadyHandled[$j] = true;
                    } elseif (isset($data[$i]['numbering']) && isset($data[$j]['numbering'])) {
                        // Both sides are numbered.
                        if (is_numeric($data[$j]['numbering']) && is_numeric($data[$i]['numbering']) && ((float) $data[$j]['numbering'] - (float) $data[$i]['numbering']) == 1 && str_word_count($data[$j]['content']) < str_word_count($data[$i]['content'])) {
                            break;
                        }

                        if (is_numeric($data[$j]['numbering']) && abs($data[$j]['numbering'] - $data[$i]['numbering']) === 1 && (isset($data[$i]['children']) && (! (isset(last($data[$i]['children'])['numbering'])) || (isset(last($data[$i]['children'])['numbering']) && abs(last($data[$i]['children'])['numbering'] - $data[$j]['numbering']) !== 1))) && str_word_count($data[$j]['content']) < 8) {
                            break;
                        }

                        if (substr_count($data[$j]['numbering'], '.') > substr_count($data[$i]['numbering'], '.') && ((float) $data[$j]['numbering'] - (float) $data[$i]['numbering']) < 1) {
                            // Deeper numbering within the same integer part: a sub-clause.
                            $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                            $alreadyHandled[$j] = true;
                        // NOTE(review): in the last group "(a == 0 || b) > 1" compares a
                        // boolean with 1 and is therefore never true; probably meant
                        // "b > 1". Kept verbatim to avoid changing classification.
                        } elseif (((float) $data[$j]['numbering'] > (float) $data[$i]['numbering'] && substr_count($data[$j]['content'], '<b>') == 0 && substr_count($data[$i]['content'], '<b>') == 1) || (substr_count($data[$i]['content'], "<b>") == 1 && (substr_count($data[$j]['content'], '<b>') == 0 || substr_count($data[$j]['content'], '<b>')) > 1)) {
                            $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                            $alreadyHandled[$j] = true;
                        } elseif (substr_count($data[$i]['content'], '<b>') == 1 && str_word_count($data[$j]['content']) > 6 && isset($data[$j]['numbering'])) {
                            if (isset($data[$i]['children']) && count($data[$i]['children'])) {
                                $lastParentChild = last($data[$i]['children']);
                                if (isset($lastParentChild['numbering']) && abs($lastParentChild['numbering'] - $data[$j]['numbering']) === 1 && (substr_count($data[$j]['content'], '<b>') == 1)) {
                                    break;
                                }
                            }

                            $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                            $alreadyHandled[$j] = true;
                        } elseif (isset($data[$i]['numbering']) && abs($data[$i]['numbering'] - $data[$j]['numbering']) === 1 && str_word_count($data[$j]['content']) >= 6) {
                            $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                            $alreadyHandled[$j] = true;
                        } elseif (isset($data[$i]['children']) && count($data[$i]['children']) && isset($data[$j]['numbering']) && isset(last($data[$i]['children'])['numbering']) && abs((float) $data[$j]['numbering'] - (float) last($data[$i]['children'])['numbering']) == (float) 1) {
                            $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                            $alreadyHandled[$j] = true;
                        } elseif (isset($data[$i]['numbering']) && abs($data[$i]['numbering'] - $data[$j]['numbering']) == 0 && str_word_count($data[$j]['content']) >= 6) {
                            $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                            $alreadyHandled[$j] = true;
                        } else {
                            break;
                        }
                    } elseif (isset($data[$i]['numbering']) && ! isset($data[$j]['numbering']) && str_word_count($data[$j]['content']) > 6) {
                        // Numbered parent, long un-numbered candidate.
                        if (substr_count($data[$j]['content'], "<b>") == 1 && strpos(strtolower($data[$i]['content']), 'definition') === false) {
                            break;
                        }

                        $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                        $alreadyHandled[$j] = true;
                    } elseif (empty($data[$j]['content']) && (isset($data[$j + 1]) && isset($data[$j - 1]) && isset($data[$i]['children']))) {
                        // Empty candidate between two paragraphs. The previous revision had
                        // an extra branch here that required the same content to be
                        // non-empty, so it could never run; breaking was the only outcome.
                        break;
                    } elseif (isset($data[$i]['children']) && count($data[$i]['children']) && isset($data[$j]['numbering'])) {
                        // Un-numbered parent with children meets a numbered candidate.
                        // The previous revision aborted with dd() when the candidate was
                        // numbered less deeply than the last child; it is now attached
                        // like any other candidate and handlePossibleChild() picks the level.
                        $data[$i] = $this->handlePossibleChild($data[$i], $data[$j]);
                        $alreadyHandled[$j] = true;
                    } else {
                        break;
                    }
                }
            }
        }

        // Emit the parent only when it has visible text.
        if (strlen(trim(strip_tags($data[$i]['content'])))) {
            $response[] = $data[$i];
        }

        $alreadyHandled[$i] = true;
    }

    return $response;
}
|
|
|
|
|
|
/**
 * Attach $child to $parent, choosing between a new child, a nested child of the
 * parent's last child, or a text continuation of that last child.
 *
 * The decision uses the numbering of both nodes and the punctuation/casing at
 * the end of the last child and at the start of the new one. A debugging dd()
 * call that aborted the request for one specific sentence has been removed.
 *
 * @param array $parent Node with 'content' and optionally 'numbering' / 'children'.
 * @param array $child  Node with 'content' and optionally 'numbering'.
 *
 * @return array The updated parent; the child itself when the parent has no
 *               content; the untouched parent when the child has no content.
 */
private function handlePossibleChild($parent, $child)
{
    if (empty($parent['content']) && ! empty($child['content'])) {
        return $child;
    }
    if (empty($child['content'])) {
        return $parent;
    }

    // First child. The child content is known to be non-empty here, so it is
    // always appended (the previous revision had three branches that all did this).
    if (! isset($parent['children']) || count($parent['children']) == 0) {
        $parent['children'] = [$child];

        return $parent;
    }

    $lastParentChild = last($parent['children']);
    $lastIndex = count($parent['children']) - 1;

    // The last child ends with ":" and the new one starts in lower case, or with
    // a digit and more than five words: it belongs underneath the last child.
    if ($lastParentChild && substr($lastParentChild['content'], -1) === ':' && ((ctype_lower(substr($child['content'], 0, 1)) || (ctype_digit(substr($child['content'], 0, 1)) && str_word_count($child['content']) > 5)))) {
        $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);

        if (isset($lastParentChild['numbering']) && isset($child['numbering']) && $child['numbering'] - 1 == $lastParentChild['numbering']) {
            $parent['children'][] = $child;
        } else {
            $parent['children'][$lastIndex] = $lastParentChild;
        }

        return $parent;
    }

    // The new numbering is textually longer than the last child's numbering.
    if (isset($lastParentChild['numbering']) && isset($child['numbering']) && strlen($child['numbering']) > strlen($lastParentChild['numbering'])) {
        if (isset($parent['children']) && isset(last($parent['children'])['numbering']) && $child['numbering']) {
            if (is_numeric($child['numbering']) && abs($child['numbering'] - $lastParentChild['numbering']) === 1) {
                $parent['children'][] = $child;

                return $parent;
            }
        }

        // Same number of dots: same level, so it is a sibling of the last child.
        if (isset($child['numbering']) && isset($lastParentChild['numbering']) && substr_count($lastParentChild['numbering'], '.') == substr_count($child['numbering'], '.')) {
            $parent['children'][] = $child;

            return $parent;
        }

        // Otherwise nest it beneath the last child.
        $parent['children'][$lastIndex] = $this->handlePossibleChild($lastParentChild, $child);

        return $parent;
    }

    // Last characters, used to judge whether a sentence is finished. "and"/"or"
    // are removed from the last child before looking at its final character.
    $lastChildEnding = substr(trim(str_replace(['and', 'or'], '', $lastParentChild['content'])), -1);
    $lastChildIsOpen = ! in_array($lastChildEnding, ['!', '.', '?', ';', '_', ':']);
    $childStart = substr(trim($child['content']), 0, 1);

    if ($lastChildIsOpen && (ctype_lower($childStart) || ((ctype_upper($childStart) && ! isset($child['numbering']))))) {
        // The last child is unfinished and the new text continues it: join them.
        $lastParentChild['content'] .= ' '.$child['content'];
        $parent['children'][$lastIndex] = $lastParentChild;

        return $parent;
    }

    if (! in_array(substr(trim($parent['content']), -1), ['!', '.', '?', ';']) && ctype_lower(substr(trim($lastParentChild['content']), -1)) && ctype_lower($childStart)) {
        $parent['children'][] = $child;
    } elseif ($lastChildIsOpen && isset($lastParentChild['numbering']) && isset($child['numbering']) && $lastParentChild['numbering'] > $child['numbering']) {
        // Numbering restarted below the last child's number: nest it under the last child.
        $lastParentChild['children'][] = $child;
        $parent['children'][$lastIndex] = $lastParentChild;
    } else {
        $parent['children'][] = $child;
    }

    return $parent;
}
|
|
|
|
|
|
/**
 * Convert one decoded document entry into a flat list of paragraph nodes.
 *
 * An item that wraps exactly one nested array is unwrapped by recursing into it,
 * and that result is returned immediately. Otherwise the item is turned into
 * paragraphs: results with more than one paragraph are appended to what was
 * collected so far, while any other non-empty result replaces it.
 *
 * @param array $array Items of one document entry.
 *
 * @return array List of ['content' => string] nodes.
 */
public function handleTestHtml($array)
{
    $collected = [];

    foreach ($array as $entry) {
        $wrapsSingleArray = count($entry) == 1 && is_array(last($entry));
        if ($wrapsSingleArray) {
            return $this->handleTestHtml($entry);
        }

        $paragraphs = $this->buildParagraphs($entry);

        if (! isset($collected['content']) && count($paragraphs) > 1) {
            $collected = array_merge($collected, $paragraphs);
            continue;
        }

        if ($paragraphs) {
            $collected = $paragraphs;
        }
    }

    return $collected;
}
|
|
|
|
|
|
/**
 * Recursively flatten nested paragraph arrays into a list of cleaned nodes.
 *
 * Strings are cleaned by collapsing whitespace runs, removing <font> tags and
 * tags that contain nothing but whitespace or <br />, dropping "<b> </b>",
 * trimming and finally decoding HTML entities. Strings that are empty after
 * cleaning are skipped.
 *
 * The previous revision kept an $alreadyHandled list that was never written to,
 * so its skip check could never fire; it has been removed.
 *
 * @param array $paragraphs Zero-indexed list of strings and/or nested lists.
 *
 * @return array List of ['content' => string] nodes in input order.
 */
private function buildParagraphs($paragraphs)
{
    $result = [];
    $total = count($paragraphs);

    for ($i = 0; $i < $total; $i++) {
        $paragraph = $paragraphs[$i];

        if (is_array($paragraph)) {
            $result = array_merge($result, $this->buildParagraphs($paragraph));
            continue;
        }

        if (! strlen($paragraph) || ctype_space($paragraph)) {
            continue;
        }

        $collapsed = preg_replace('/\s+/S', " ", $paragraph);
        $withoutFont = preg_replace('/(<font[^>]*>)|(<\/font>)/', '', $collapsed);
        // NOTE(review): the alternatives after "<br \/>" appear as plain spaces in
        // this copy; they may originally have been different Unicode space
        // characters — confirm against the repository before relying on them.
        $withoutEmptyTags = preg_replace('/<([^>\s]+)[^>]*>(?:\s*(?:<br \/>| | | | | | | )\s*)*<\/\1>/', '', $withoutFont);
        $cleanHtml = trim(str_replace('<b> </b>', '', $withoutEmptyTags));

        if (! empty($cleanHtml)) {
            $result[] = ['content' => html_entity_decode($cleanHtml, ENT_COMPAT | ENT_HTML401, 'UTF-8')];
        }
    }

    return $result;
}
|
|
|
|
|
|
/**
 * Get the text enclosed by an HTML tag.
 *
 * <u> and </u> are removed from the input first. Matching is greedy, so when
 * the tag occurs several times on one line the text of the last occurrence is
 * returned.
 *
 * @param string $string  HTML fragment to search.
 * @param string $tagname Tag name without angle brackets, e.g. 'b'.
 *
 * @return string The captured text, or '' when the tag is not found.
 */
private function getTextBetweenTags($string, $tagname)
{
    $withoutUnderline = str_replace(['<u>', '</u>'], '', $string);

    $found = [];
    preg_match("/<$tagname ?.*>(.*)<\/$tagname>/", $withoutUnderline, $found);

    // The last entry is the single capture group whenever the pattern matched.
    return $found ? end($found) : '';
}
|
|
|
|
|
|
/**
 * Tell whether a paragraph opens with a bold heading that is a known break point.
 *
 * When the paragraph carries a 'numbering' value, every ".", ")" and every
 * occurrence of that numbering is removed from the content first. The check
 * only applies to content with exactly one closing </b>: the text before it,
 * without <b> and surrounding whitespace, must be one of $breakPoints.
 *
 * The previous revision also ran two preg_replace() calls whose results were
 * discarded; they had no effect and have been removed.
 *
 * @param array $paragraph   Node with 'content' and optionally 'numbering'.
 * @param array $breakPoints Heading texts that start a new section.
 *
 * @return bool True when the leading bold text is a break point.
 */
private function paragraphBrake($paragraph, array $breakPoints)
{
    $content = $paragraph['content'];

    if (isset($paragraph['numbering'])) {
        $content = str_replace(['.', ')', $paragraph['numbering']], '', $content);
    }

    if (substr_count($content, '</b>') !== 1) {
        return false;
    }

    $beforeClosingBold = explode('</b>', $content)[0];
    $heading = trim(str_replace('<b>', '', trim($beforeClosingBold)));

    return in_array($heading, $breakPoints);
}
|
|
|
|
}
|
|
|