You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
774 lines
29 KiB
774 lines
29 KiB
<?php
|
|
|
|
namespace App\Parser;
|
|
|
|
use Illuminate\Support\Facades\Log;
|
|
|
|
class ParseTextArray
|
|
{
|
|
|
|
/**
 * Clause headings that force a new top-level clause when they appear on a
 * numbered line.
 *
 * The constructor normalises every entry through nestedUppercase() (dots
 * removed, upper-cased), so case differences and duplicate entries in this
 * list are harmless.
 *
 * The strings are single-quoted: only \' and \\ are escape sequences, every
 * other backslash stays in the value literally.
 *
 * @var array
 */
private $breakPoints = [
    'TERMS OF THE {P1_Pros}',
    'TERMS AND CONDITIONS',
    'BACKGROUND',
    'OPERATIVE PROVISIONS',
    'Products and/or Services',
    'PAYMENT',
    'GRANT OF LICENCE',
    'TERM OF LICENCE AGREEMENT',
    'ROYALTY',
    'PAYMENT',
    'PERFORMANCE TARGETS',
    'STATIONERY',
    'QUALITY CONTROL',
    'THE DISTRIBUTOR\'S OBLIGATIONS',
    'NON SOLICITATION',
    'SALE OF BUSINESS',
    'TERMINATION OF AGREEMENT',
    'CONDITIONS FOLLOWING TERMINATION',
    'RESTRAINT',
    'TIME OF ESSENCE AND NOTICES',
    'INTERPRETATION',
    'ARBITRATION',
    'DOMICILIUM AND REGISTERED OFFICE',
    'USE OF TRADE MARKS, TRADE NAME, GOODWILL AND KNOW-HOW',
    'GENERAL',
    'DESCRIPTION OF {P2_NAME} INFORMATION',
    'PAYMENT OF FEES',
    'SUPPLIER\'S STATUS',
    // Kept as written: the backslash is part of this value.
    'SUPPLIER\’S OBLIGATIONS',
    // Same heading with a bare typographic apostrophe, which is what the
    // entry above cannot match.
    'SUPPLIER’S OBLIGATIONS',
    'DEFINITIONS AND INTERPRETATION',
    'DEFINITIONS',
    'CONFIDENTIALITY',
    'TERMINATION',
    'RESTRICTIVE COVENANTS AND INTELLECTUAL PROPERTY',
    'DETAILS AND IDENTITY OF CONSULTANT',
    'ANTI-BRIBERY',
    'ASSIGNMENT SCHEDULE',
    'SCHEDULE 1',
    '{P1_NAME}\'S LIABILITY',
    'DURATION OF AGREEMENT AND SUPPLY',
    'SUPPLY OF HARDWARE',
    'SUPPLY OF SOFTWARE AND DOCUMENTATION',
    'SUPPLY OF SUPPORT SERVICES',
    'INTELLECTUAL PROPERTY RIGHTS',
    'THE CONTRACT',
    // Kept as written: this is the literal six characters \U2019S, not an
    // apostrophe. NOTE(review): possibly captured from input that carried
    // escaped code points - confirm before removing.
    '{P1_NAME}\U2019S LIABILITY',
    // Same heading with the real U+2019 apostrophe.
    '{P1_NAME}’S LIABILITY',
    'UPDATES',
    'TERMS OF THE {P1_NAME} PRODUCTS.',
    'CUSTOMER RESPONSIBILITIES',
    'EXHIBIT A',
    'EXHIBIT A-1',
    'EXHIBIT A-2',
    'WARRANTIES',
    'EXIT, TERMINATION AND SUSPENSION',
    'EXHIBIT B',
    'EXHIBIT B-1',
    'EXHIBIT B-2',
    'COUNTERPARTS',
    'LICENSE GRANT',
    'INDEMNIFICATION BY CUSTOMER',
    'TERMS OF THE {P1_NAME} PRODUCTS',
    'TERMS OF CLOUD SERVICE',
    'INDEMNIFICATION BY CUSTOMER',
    'TERMINATION',
    'TERMS OF THE {P1_PROS}',
    'SUPPORT',
    'SUB CONTRACTING AND THIRD PARTY RECOMMENDATIONS',
    'LICENCE AND ACCESS TO SOFTWARE AND HARDWARE',
    'DECLARATION OF NON-LIAISON AND ANTI-CORRUPTION COMMITMENT',
    '{P1_NAME}\'S DUTIES',
    'ana are mere',
    'definitions',
    'fees',
    'ENGAGEMENT',
    'DUTIES',
    'TERMINATION',
    'STATEMENTS',
    'CONFIDENTIALITY',
    'Human rights',
    'Labour',
    'Environment',
    'Anti-corruption',
    'Services',
    'Scope of the Agreement',
    'Staffing Levels for Services',
    'Indemnification',
];

/**
 * Enables the PDF-specific handling: "Page ... of ..." lines are skipped and
 * wrapped lines are merged back into the preceding paragraph.
 *
 * @var bool
 */
private $pdf;
|
|
|
|
|
|
/**
 * Create a parser and normalise the hard-coded break points.
 *
 * @param bool $pdf Pass true when the text was extracted from a PDF.
 */
public function __construct($pdf = false)
{
    $this->pdf = $pdf;

    // Upper-case the headings once so later lookups compare like with like.
    $normalisedBreakPoints = $this->nestedUppercase($this->breakPoints);
    $this->breakPoints = $normalisedBreakPoints;
}
|
|
|
|
|
|
/**
 * Read a text file and parse its lines into a clause structure.
 *
 * @param string $filePath Path of the text file to parse.
 *
 * @return array|null The result of buildTheStructure(), or null (after
 *                    logging an error) when the file is missing or cannot
 *                    be read.
 */
public function fromFile($filePath)
{
    if (! file_exists($filePath)) {
        Log::error('The given file dose not exists!');

        return null;
    }

    $fileContent = file_get_contents($filePath);

    if ($fileContent === false) {
        Log::error('The given file could not be read!');

        return null;
    }

    // Split on any line-ending style. PHP_EOL describes the machine running
    // the code, not the file, so a CRLF file on Linux kept a trailing "\r"
    // and an LF file on Windows was never split at all.
    $lines = preg_split('/\r\n|\r|\n/', $fileContent);

    // array_filter() without a callback drops empty lines (and, as before,
    // a line that is exactly "0").
    return $this->buildTheStructure(array_filter($lines));
}
|
|
|
|
|
|
/**
 * Build the clause tree from the document lines.
 *
 * Each line not yet consumed becomes a top-level paragraph. The lines that
 * follow it are offered to it one by one until one of them starts a new
 * clause or does not belong to the paragraph. Attached lines are merged by
 * handlePossibleChild(), and the finished list is passed through
 * recheckClauses().
 *
 * @param array $textAsArray Lines of the document (keys are ignored).
 *
 * @return array
 */
private function buildTheStructure($textAsArray)
{
    $textAsArray = array_values($textAsArray);
    $countData = count($textAsArray);
    $response = [];

    // Consumed line indexes, stored as KEYS so membership is a real lookup.
    // The previous list-of-values was tested with array_key_exists(), which
    // only worked because consumed indexes happen to form a contiguous prefix.
    $alreadyHandled = [];

    for ($i = 0; $i < $countData; $i++) {
        if (isset($alreadyHandled[$i])) {
            continue;
        }

        $alreadyHandled[$i] = true;

        // Skip a page marker instead of abandoning the whole document, the
        // same way the inner loop treats it.
        if ($this->isPageMarker($textAsArray[$i])) {
            continue;
        }

        $parent = $this->describeLine($textAsArray[$i]);

        for ($j = $i + 1; $j < $countData; $j++) {
            if (isset($alreadyHandled[$j])) {
                continue;
            }

            if ($this->isPageMarker($textAsArray[$j])) {
                $alreadyHandled[$j] = true;

                continue;
            }

            $candidate = $this->describeLine($textAsArray[$j]);

            if (
                $this->startsNewClause($parent, $candidate) ||
                ! $this->belongsToParent($parent, $candidate, $textAsArray, $j)
            ) {
                break;
            }

            $parent = $this->handlePossibleChild($parent, $candidate);
            $alreadyHandled[$j] = true;
        }

        if (strlen($parent['content'])) {
            $response[] = $parent;
        }
    }

    return $this->recheckClauses($response);
}

/**
 * Tell whether a line is a "Page ... of ..." marker. Only active in PDF mode.
 *
 * @param string $line
 *
 * @return bool
 */
private function isPageMarker($line)
{
    return $this->pdf && strpos($line, 'Page') !== false && strpos($line, 'of') !== false;
}

/**
 * Turn a raw line into a paragraph: trimmed content, number of leading
 * whitespace characters and, when getNumbering() finds one, the numbering.
 *
 * @param string $line
 *
 * @return array
 */
private function describeLine($line)
{
    $described = [
        'content' => trim($line),
        'spaces'  => strlen($line) - strlen(ltrim($line)),
    ];

    if ($numbering = $this->getNumbering($line)) {
        $described['numbering'] = $numbering;

        // Remove only the first occurrence. str_replace() removed every
        // occurrence, so "1 pay within 10 days" lost the "1" inside "10".
        $position = strpos($described['content'], $numbering);
        if ($position !== false) {
            $described['content'] = substr_replace($described['content'], '', $position, strlen($numbering));
        }

        $described['content'] = trim(ltrim($described['content'], '.'));
    }

    return $described;
}

/**
 * Count the digit characters of a numbering string.
 *
 * @param string $numbering
 *
 * @return int
 */
private function digitCount($numbering)
{
    return count(array_filter(str_split($numbering), 'is_numeric'));
}

/**
 * Decide whether $candidate opens a new clause, i.e. collecting children
 * for $parent must stop.
 *
 * @param array $parent
 * @param array $candidate
 *
 * @return bool
 */
private function startsNewClause($parent, $candidate)
{
    $parentNumbered = $this->hasNumbering($parent);
    $candidateNumbered = $this->hasNumbering($candidate);

    if ($parentNumbered && $candidateNumbered) {
        // Same indentation and same numbering shape: a sibling.
        if (
            $candidate['spaces'] == $parent['spaces'] &&
            substr_count($parent['numbering'], '.') == substr_count($candidate['numbering'], '.') &&
            $this->digitCount($parent['numbering']) == $this->digitCount($candidate['numbering'])
        ) {
            return true;
        }

        // The numbering moved on by a whole unit or more.
        if (((float) $candidate['numbering'] - (float) $parent['numbering']) >= 1) {
            return true;
        }
    }

    // Unnumbered, unindented parent followed by an indented numbered line,
    // unless the parent introduces a list with a trailing colon.
    if (
        $candidateNumbered &&
        ! $parentNumbered &&
        ! $parent['spaces'] &&
        $candidate['spaces'] > $parent['spaces'] &&
        ! in_array(substr($parent['content'], -1), [':'])
    ) {
        return true;
    }

    // Hardcoded breakpoints.
    // NOTE(review): spaces are stripped from the content here but not from
    // $this->breakPoints, so multi-word headings cannot match - confirm
    // whether the third needle was meant to be a different character.
    if (
        $candidateNumbered &&
        in_array(strtoupper(str_replace(['.', "\t", " "], '', $candidate['content'])), $this->breakPoints)
    ) {
        return true;
    }

    // Hardcoded "Schedule break".
    if (
        ! $candidateNumbered &&
        strpos(substr(trim(strtolower(utf8_encode($candidate['content']))), 0, 10), 'schedule') !== false
    ) {
        return true;
    }

    if (
        ! $candidateNumbered &&
        strpos(substr(trim($candidate['content']), 0, 15), 'Exhibit') !== false &&
        ! in_array(substr(trim($candidate['content']), -1), ['.'])
    ) {
        return true;
    }

    if (strpos(substr(trim(strtolower($candidate['content'])), 0, 15), 'attachment') !== false) {
        return true;
    }

    // The last child is numbered "x.y" and the candidate is a plain "x".
    if ($candidateNumbered && $this->hasChild($parent)) {
        $lastChild = last($parent['children']);

        if (
            $this->hasNumbering($lastChild) &&
            is_numeric($lastChild['numbering']) &&
            strpos($lastChild['numbering'], '.') !== false &&
            is_numeric($candidate['numbering']) &&
            strpos($candidate['numbering'], '.') === false
        ) {
            return true;
        }
    }

    return false;
}

/**
 * Decide whether $candidate should be attached to $parent. The rules are
 * evaluated in order and the first one that matches wins.
 *
 * @param array $parent
 * @param array $candidate
 * @param array $lines All document lines (re-indexed).
 * @param int   $index Index of the candidate line inside $lines.
 *
 * @return bool
 */
private function belongsToParent($parent, $candidate, $lines, $index)
{
    // Deeper indentation, both sides non-empty.
    if ($candidate['spaces'] > $parent['spaces'] && strlen($parent['content']) && strlen($candidate['content'])) {
        return true;
    }

    if (
        isset($lines[$index + 1]) &&
        $this->paragraphBetweenClauses($parent, $candidate, array_slice($lines, $index + 1))
    ) {
        return true;
    }

    if (
        $this->hasChild($parent) &&
        $this->lastChildIsList($parent) &&
        ($parent['spaces'] == 0 || $parent['spaces'] > $candidate['spaces'])
    ) {
        return true;
    }

    // Same indentation but the candidate's numbering is deeper.
    if (
        $candidate['spaces'] == $parent['spaces'] &&
        isset($parent['numbering']) &&
        isset($candidate['numbering']) &&
        (
            substr_count($parent['numbering'], '.') < substr_count($candidate['numbering'], '.') ||
            $this->digitCount($parent['numbering']) < $this->digitCount($candidate['numbering'])
        )
    ) {
        return true;
    }

    $candidateFirst = substr($candidate['content'], 0, 1);

    if (
        $this->paragraphIsList($parent) &&
        (ctype_lower($candidateFirst) || in_array($candidateFirst, ['{', '•']))
    ) {
        return true;
    }

    if (
        $this->hasNumbering($parent) &&
        $this->hasNumbering($candidate) &&
        is_numeric($candidate['numbering']) &&
        strpos($candidate['numbering'], '.') !== false &&
        strpos($parent['numbering'], '.') === false &&
        ! is_int($candidate['numbering'] - $parent['numbering'])
    ) {
        return true;
    }

    if (
        $this->hasChild($parent) &&
        $candidate['spaces'] == $this->getLastChildForParagraph($parent)['spaces']
    ) {
        return true;
    }

    // utf8_encode() turns the lead byte of a UTF-8 curly quote into 'â'.
    // NOTE(review): utf8_encode() is deprecated as of PHP 8.2.
    if (
        strpos(strtolower($parent['content']), 'definitions and') !== false &&
        in_array(utf8_encode($candidateFirst), ['â', '"'])
    ) {
        return true;
    }

    if ($this->hasChild($parent) && $this->paragraphIsList($this->getLastChildFromParagraph($parent))) {
        return true;
    }

    return ($this->hasChild($parent) || $parent['spaces'] == $candidate['spaces']) &&
        ! $this->hasNumbering($this->getLastChildForParagraph($parent)) &&
        ! $this->hasNumbering($candidate);
}
|
|
|
|
|
|
/**
 * Recheck missed clauses and assign them to a parent where appropriate.
 *
 * A numbered clause adopts the clauses that follow it when the next clause
 * is either unnumbered, or has a multi-part numeric numbering while its own
 * numbering is numeric with at most one part. Adoption stops at the first
 * clause whose numbering is numeric with exactly one part. Clauses with
 * falsy content are dropped from the result.
 *
 * @param array $clauses List of paragraphs (0-based, sequential keys).
 *
 * @return array
 */
private function recheckClauses($clauses)
{
    $checkedClauses = [];
    $total = count($clauses);

    // Adopted clause indexes, stored as KEYS. The previous list-of-values
    // was tested with array_key_exists(), which checks keys, not values.
    $alreadyManaged = [];

    for ($i = 0; $i < $total; $i++) {
        if (isset($alreadyManaged[$i])) {
            continue;
        }

        $alreadyManaged[$i] = true;

        $current = $clauses[$i];
        $j = $i + 1;

        $adoptsFollowing = isset($clauses[$j]) &&
            $clauses[$j]['content'] &&
            $this->hasNumbering($current) &&
            (
                ! $this->hasNumbering($clauses[$j]) ||
                (
                    is_numeric($clauses[$j]['numbering']) &&
                    $this->numberingDepth($clauses[$j]['numbering']) > 1 &&
                    is_numeric($current['numbering']) &&
                    $this->numberingDepth($current['numbering']) <= 1
                )
            );

        if ($adoptsFollowing) {
            for (; $j < $total; $j++) {
                // The next single-part clause ends the adoption.
                if (
                    isset($clauses[$j]['numbering']) &&
                    is_numeric($clauses[$j]['numbering']) &&
                    $this->numberingDepth($clauses[$j]['numbering']) == 1
                ) {
                    break;
                }

                $current['children'][] = $clauses[$j];
                $alreadyManaged[$j] = true;
            }
        }

        if ($current['content']) {
            $checkedClauses[] = $current;
        }
    }

    return $checkedClauses;
}

/**
 * Number of non-empty dot-separated parts of a numbering string. A part
 * that is exactly "0" is discarded by array_filter() and is not counted.
 *
 * @param string $numbering
 *
 * @return int
 */
private function numberingDepth($numbering)
{
    return count(array_filter(explode('.', $numbering)));
}
|
|
|
|
|
|
/**
 * Attach $child to $parent and return the updated parent.
 *
 * Depending on indentation, numbering and punctuation the child text is
 * appended to the parent's content, appended to the parent's last child,
 * added as a new child, or nested under the last child (recursively).
 * A child with empty content leaves the parent untouched.
 *
 * @param array $parent Paragraph with 'content', 'spaces' and optionally
 *                      'numbering' / 'children'.
 * @param array $child  Paragraph of the same shape.
 *
 * @return array
 */
private function handlePossibleChild($parent, $child)
{
    if (empty($child['content'])) {
        return $parent;
    }

    $childText = trim($child['content']);
    $childFirst = substr($childText, 0, 1);
    $childLast = substr($childText, -1);
    $hasChildren = isset($parent['children']);

    // The child looks like the continuation of a wrapped line.
    $readsAsContinuation = ctype_lower($childFirst) ||
        in_array($childFirst, ['}', ')']) ||
        is_numeric($childFirst) ||
        in_array($childLast, ['.', ',', ':']);

    if ($this->pdf && ! $hasChildren) {
        $mergeIntoParent = ($readsAsContinuation || $child['spaces'] > $parent['spaces']) &&
            $this->endsMidSentence($parent['content']);

        if (! $mergeIntoParent) {
            $mergeIntoParent = $child['spaces'] >= $parent['spaces'] && ! $this->hasNumbering($child);
        }

        if ($mergeIntoParent) {
            $parent['content'] .= ' '.$child['content'];

            return $parent;
        }
    } elseif (
        $this->pdf &&
        $readsAsContinuation &&
        $this->endsMidSentence($this->getLastChildForParagraph($parent)['content']) &&
        strpos($child['content'], 'thirty') !== false &&
        // 'numbering' is absent on unnumbered parents; read it defensively.
        (isset($parent['numbering']) ? $parent['numbering'] : null) !== '1.'
    ) {
        // NOTE(review): document-specific special case keyed on the word
        // "thirty" - kept as found.
        $lastParentChild = last($parent['children']);
        $lastParentChild['content'] .= ' '.$child['content'];
        $parent['children'][count($parent['children']) - 1] = $lastParentChild;

        return $parent;
    }

    if (! $hasChildren) {
        $parent['children'][] = $child;

        return $parent;
    }

    $lastIndex = count($parent['children']) - 1;
    $lastParentChild = last($parent['children']);

    // The last child introduces a list (trailing ':') and the unnumbered
    // child looks like a list item.
    if (
        $this->lastChildIsList($parent) &&
        (
            ctype_lower($childFirst) ||
            in_array($childLast, [';']) ||
            strpos($child['content'], ':') !== false ||
            in_array(trim($childFirst), ['{', '('])
        ) &&
        ! $this->hasNumbering($child)
    ) {
        if (! isset($lastParentChild['children'])) {
            $lastParentChild['children'][] = $child;
        } elseif (
            ! in_array(substr(last($lastParentChild['children'])['content'], -1), ['.', ';', ',']) &&
            ! in_array($childFirst, ['(', '{', ':'])
        ) {
            // The previous item is unterminated: this line continues it.
            $lastParentChild['children'][count($lastParentChild['children']) - 1]['content'] .= ' '.$childText;
        } else {
            $lastParentChild['children'][] = $child;
        }

        $parent['children'][$lastIndex] = $lastParentChild;

        return $parent;
    }

    // Numbered last child ending in ':' followed by a numbering with more digits.
    if (
        $this->hasNumbering($lastParentChild) &&
        $this->hasNumbering($child) &&
        substr(trim($lastParentChild['content']), -1) == ':' &&
        count(array_filter(str_split($lastParentChild['numbering']), 'is_numeric')) <
            count(array_filter(str_split($child['numbering']), 'is_numeric'))
    ) {
        $lastParentChild['children'][] = $child;
        $parent['children'][$lastIndex] = $lastParentChild;

        return $parent;
    }

    if ($lastParentChild['spaces'] == $child['spaces']) {
        if ($this->hasNumbering($lastParentChild) && $this->hasNumbering($child)) {
            // Compare the part after the last dot. When there is no dot,
            // strrpos() yields false and the offset becomes 1.
            // NOTE(review): that looks unintended for dot-less numberings.
            $childOrdinal = (int) substr($child['numbering'], strrpos($child['numbering'], '.') + 1);
            $lastOrdinal = (int) substr(
                $lastParentChild['numbering'],
                strrpos($lastParentChild['numbering'], '.') + 1
            );

            // utf8_encode() maps the lead byte of a UTF-8 curly quote to 'â'.
            $bothStartQuoted = in_array(utf8_encode(substr($lastParentChild['content'], 0, 1)), ['â', '"', '{']) &&
                in_array(utf8_encode(substr($child['content'], 0, 1)), ['â', '"', '{', '•']);

            if ($childOrdinal - $lastOrdinal == 1 || $bothStartQuoted) {
                // Next sibling of the last child.
                $parent['children'][] = $child;
            } else {
                $lastParentChild['children'][] = $child;
                $parent['children'][$lastIndex] = $lastParentChild;
            }
        } else {
            $lastParentChild['content'] .= ' '.$child['content'];
            $parent['children'][$lastIndex] = $lastParentChild;
        }
    } elseif (
        // Grouping spelled out exactly as PHP parses the original expression
        // (&& binds tighter than ||).
        (
            ! $this->hasNumbering($child) &&
            ! in_array(substr(trim($lastParentChild['content']), 0, 1), ['.', ';', '}']) &&
            ctype_lower(substr(trim($lastParentChild['content']), -1))
        ) ||
        (
            in_array(substr(trim($lastParentChild['content']), -1), [',']) &&
            (ctype_lower($childFirst) || in_array($childFirst, ['{', '(', ')']))
        )
    ) {
        $lastParentChild['content'] .= ' '.$child['content'];
        $parent['children'][$lastIndex] = $lastParentChild;
    } else {
        if (in_array(substr(trim($this->getLastChildForParagraph($parent)['content']), -1), ['.', ';', '}'])) {
            // The deepest last paragraph is finished: nest under the last child.
            $lastParentChild['children'][] = $child;
        } else {
            $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);
        }

        $parent['children'][$lastIndex] = $lastParentChild;
    }

    return $parent;
}

/**
 * Tell whether a text does not end like a finished sentence: its last
 * character is one of } ) , " or is anything other than . : !
 *
 * @param string $text
 *
 * @return bool
 */
private function endsMidSentence($text)
{
    $tail = substr(trim($text), -1);

    return in_array($tail, ['}', ')', ',', '"']) ||
        ! in_array($tail, ['.', ':', '!']) ||
        ctype_lower($tail);
}
|
|
|
|
|
|
/**
 * Tell whether a paragraph introduces a list, i.e. its trimmed content
 * ends with a colon.
 *
 * @param array $paragraph
 *
 * @return bool
 */
private function paragraphIsList($paragraph)
{
    $text = trim($paragraph['content']);
    $finalChar = substr($text, -1);

    return $finalChar == ':';
}
|
|
|
|
|
|
/**
 * Tell whether the last direct child of a paragraph introduces a list,
 * i.e. that child's trimmed content ends with a colon.
 *
 * @param array $paragraph
 *
 * @return bool
 */
private function lastChildIsList($paragraph)
{
    // Nothing to inspect without children.
    if (! isset($paragraph['children'])) {
        return false;
    }

    $children = $paragraph['children'];
    $tail = end($children);

    return substr(trim($tail['content']), -1) == ':';
}
|
|
|
|
|
|
/**
 * Return the deepest last descendant of a paragraph, or the paragraph
 * itself when it has no children.
 *
 * @param array $paragraph
 *
 * @return mixed
 */
private function getLastChildForParagraph($paragraph)
{
    if (! $this->hasChild($paragraph)) {
        return $paragraph;
    }

    $children = $paragraph['children'];

    return $this->getLastChildFromParagraph(end($children));
}
|
|
|
|
|
|
/**
 * Tell whether a paragraph carries a 'children' entry.
 *
 * @param array $paragraph
 *
 * @return bool
 */
private function hasChild($paragraph)
{
    return isset($paragraph['children']);
}
|
|
|
|
|
|
/**
 * Extract the leading numbering (e.g. "1.", "2.3") from a raw line.
 *
 * Returns false when the line is null, starts with "(" or "{", is a
 * "Page ... of ..." line in PDF mode, or carries no numbering.
 *
 * @param string|null $paragraph Raw line.
 *
 * @return false|string
 */
private function getNumbering($paragraph)
{
    if (! isset($paragraph)) {
        return false;
    }

    $paragraphContent = trim($paragraph);

    if (in_array(substr($paragraphContent, 0, 1), ['(', '{'])) {
        return false;
    }

    if ($this->pdf && strpos($paragraphContent, 'Page') !== false && strpos($paragraphContent, 'of') !== false) {
        return false;
    }

    // Reduce the line to the digits, dots and ")" found among the first six
    // alphanumeric characters; text from the first "{" on is ignored and
    // ")" counts as a dot.
    $candidate = preg_replace("/\{.+/", "", trim($paragraphContent));
    $candidate = preg_replace('/\)/', '.', $candidate);
    $candidate = preg_replace('/[^A-Za-z0-9.)]/', '', $candidate);
    $candidate = substr(trim($candidate), 0, 6);
    $candidate = preg_replace('/[^0-9\.)]/', '', $candidate);

    preg_match('/^([-+]?\d*\.?\d+?\d*\.?\d+|\d+(\.?)*)(?:[eE]([-+]?\d+))?/', $candidate, $paragraphNumbering);

    if (! count($paragraphNumbering)) {
        return false;
    }

    $matched = $paragraphNumbering[0];
    $separators = [' ', "\t", '.', ')'];

    $accepted = in_array(substr($paragraphContent, strlen($matched), 1), $separators) ||
        in_array(substr($matched, -1), $separators) ||
        is_numeric($matched);

    if (! $accepted) {
        return false;
    }

    // Reject a number wrapped in parentheses. The check needs a character
    // BEFORE the match: with strpos() returning false or 0 the old offset
    // became -1 and substr() read the last character of the line instead.
    // NOTE(review): the closing-parenthesis offset (+1) only fits a
    // one-character match - confirm the intended behaviour.
    $locationOfNumbering = strpos($paragraphContent, $matched);

    if (
        $locationOfNumbering !== false &&
        $locationOfNumbering > 0 &&
        substr($paragraphContent, $locationOfNumbering - 1, 1) == '(' &&
        substr($paragraphContent, $locationOfNumbering + 1, 1) == ')'
    ) {
        return false;
    }

    return str_replace('..', '.', $matched);
}
|
|
|
|
|
|
/**
 * Tell whether an unnumbered paragraph sits between two related clauses.
 *
 * Only the first numbered line of $list is considered. The answer is true
 * when that line's last numbering part is one above the last part of
 * $first's deepest last descendant with the same number of dots, or when
 * the descendant's numbering contains more dots than that line's.
 *
 * @param array $first     Numbered paragraph currently being built.
 * @param array $paragraph Paragraph under test; must have no 'numbering'.
 * @param array $list      Raw lines that follow the paragraph.
 *
 * @return bool
 */
private function paragraphBetweenClauses($first, $paragraph, $list)
{
    if (! $this->hasNumbering($first) || isset($paragraph['numbering'])) {
        return false;
    }

    $anchor = $this->getLastChildFromParagraph($first);

    if (! isset($anchor['numbering'])) {
        return false;
    }

    $anchorParts = array_filter(explode('.', $anchor['numbering']));
    $anchorLastPart = end($anchorParts);
    $anchorDots = substr_count($anchor['numbering'], '.');

    foreach ($list as $line) {
        $lineNumbering = $this->getNumbering($line);

        if (! $lineNumbering) {
            continue;
        }

        $lineParts = array_filter(explode('.', $lineNumbering));
        $lineLastPart = end($lineParts);
        $lineDots = substr_count($lineNumbering, '.');

        // The first numbered line settles the answer either way.
        return ($lineLastPart - $anchorLastPart == 1 && $anchorDots == $lineDots) ||
            $anchorDots > $lineDots;
    }

    return false;
}
|
|
|
|
|
|
/**
 * Follow the last child at every level and return the deepest one; a
 * paragraph without children is returned as is.
 *
 * @param array $paragraph
 *
 * @return mixed
 */
private function getLastChildFromParagraph($paragraph)
{
    $node = $paragraph;

    while (isset($node['children'])) {
        $children = $node['children'];
        $node = end($children);
    }

    return $node;
}
|
|
|
|
|
|
/**
 * Append the content of $append to the deepest last descendant of
 * $paragraph (or to the paragraph itself when it has no children) and
 * return the updated paragraph.
 *
 * The branch for paragraphs with children used to return the deepest
 * descendant unchanged: nothing was appended and the caller received the
 * leaf instead of the paragraph.
 *
 * @param array $paragraph
 * @param array $append Paragraph whose 'content' is appended.
 *
 * @return array
 */
private function appendToLastChildFromParagraph($paragraph, $append)
{
    if (! empty($paragraph['children'])) {
        // Locate the key of the last child, then rebuild that child.
        end($paragraph['children']);
        $lastKey = key($paragraph['children']);

        $paragraph['children'][$lastKey] = $this->appendToLastChildFromParagraph(
            $paragraph['children'][$lastKey],
            $append
        );

        return $paragraph;
    }

    $paragraph['content'] .= ' '.$append['content'];

    return $paragraph;
}
|
|
|
|
|
|
/**
 * Tell whether a paragraph has a non-empty 'numbering' entry.
 *
 * @param array $paragraph
 *
 * @return bool
 */
private function hasNumbering($paragraph)
{
    // empty() is false only when the entry is set and truthy.
    return ! empty($paragraph['numbering']);
}
|
|
|
|
|
|
/**
 * Upper-case a string, or every string of a (nested) array, after
 * removing all dots. Array keys are preserved.
 *
 * @param array|string $value
 *
 * @return array|string
 */
private function nestedUppercase($value)
{
    if (! is_array($value)) {
        // Remove unwanted chars before upper-casing.
        $withoutDots = str_replace(['.'], '', $value);

        return strtoupper($withoutDots);
    }

    $converted = [];

    foreach ($value as $key => $item) {
        $converted[$key] = $this->nestedUppercase($item);
    }

    return $converted;
}
|
|
|
|
}
|
|
|