|
|
@ -125,7 +125,6 @@ class ParseTextArray |
|
|
|
} else { |
|
|
|
Log::error('The given file dose not exists!'); |
|
|
|
} |
|
|
|
|
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
@ -144,6 +143,7 @@ class ParseTextArray |
|
|
|
|
|
|
|
$alreadyHandled = []; |
|
|
|
$countData = count($textAsArray); |
|
|
|
|
|
|
|
for ($i = 0; $i < $countData; $i++) { |
|
|
|
if (array_key_exists($i, $alreadyHandled)) { |
|
|
|
continue; |
|
|
@ -154,16 +154,19 @@ class ParseTextArray |
|
|
|
'content' => trim($textAsArray[ $i ]), |
|
|
|
'spaces' => strlen($textAsArray[ $i ]) - strlen(ltrim($textAsArray[ $i ])) |
|
|
|
]; |
|
|
|
|
|
|
|
//Remove numbering from the paragraph content
|
|
|
|
if ($numbering = $this->getNumbering($textAsArray[ $i ])) { |
|
|
|
$data[ $i ][ 'numbering' ] = $numbering; |
|
|
|
$data[ $i ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $i ][ 'content' ]), '.')); |
|
|
|
} |
|
|
|
|
|
|
|
if ($this->pdf && strpos($textAsArray[ $i ], 'Page') !== false && strpos($textAsArray[ $i ], |
|
|
|
'of') !== false) { |
|
|
|
$alreadyHandled[] = $i; |
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
$j = $i + 1; |
|
|
|
|
|
|
|
if (isset($textAsArray[ $j ])) { |
|
|
@ -171,67 +174,91 @@ class ParseTextArray |
|
|
|
if (array_key_exists($j, $alreadyHandled)) { |
|
|
|
continue; |
|
|
|
} |
|
|
|
if ($this->pdf && isset($textAsArray[ $j ]) && strpos($textAsArray[ $j ], |
|
|
|
'Page') !== false && strpos($textAsArray[ $j ], 'of') !== false) { |
|
|
|
|
|
|
|
if ( |
|
|
|
$this->pdf && |
|
|
|
isset($textAsArray[ $j ]) && |
|
|
|
strpos($textAsArray[ $j ], 'Page') !== false && |
|
|
|
strpos($textAsArray[ $j ], 'of') !== false |
|
|
|
) { |
|
|
|
$alreadyHandled[] = $j; |
|
|
|
continue; |
|
|
|
|
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
//extract the content and count the number of the empty spaces from the beginning
|
|
|
|
// Extract the content and count the number of the empty spaces from the beginning
|
|
|
|
$data[ $j ] = [ |
|
|
|
'content' => trim($textAsArray[ $j ]), |
|
|
|
'spaces' => strlen($textAsArray[ $j ]) - strlen(ltrim($textAsArray[ $j ])) |
|
|
|
]; |
|
|
|
//Remove numbering from the paragraph content
|
|
|
|
|
|
|
|
// Remove numbering from the paragraph content
|
|
|
|
if ($numbering = $this->getNumbering($textAsArray[ $j ])) { |
|
|
|
$data[ $j ][ 'numbering' ] = $numbering; |
|
|
|
$data[ $j ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $j ][ 'content' ]), |
|
|
|
'.')); |
|
|
|
$data[ $j ][ 'content' ] = trim( |
|
|
|
ltrim(str_replace($numbering, '', $data[ $j ][ 'content' ]), '.') |
|
|
|
); |
|
|
|
} |
|
|
|
|
|
|
|
//break if both have numbering and the space is equal
|
|
|
|
if ($data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] && $this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && substr_count($data[ $i ][ 'numbering' ], |
|
|
|
'.') == substr_count($data[ $j ][ 'numbering' ], |
|
|
|
'.') && count(array_filter(str_split($data[ $i ][ 'numbering' ]), |
|
|
|
'is_numeric')) == count(array_filter(str_split($data[ $j ][ 'numbering' ]), |
|
|
|
'is_numeric'))) { |
|
|
|
// Break if both have numbering and the space is equal
|
|
|
|
if ( |
|
|
|
$data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] && |
|
|
|
$this->hasNumbering($data[ $j ]) && |
|
|
|
$this->hasNumbering($data[ $i ]) && |
|
|
|
substr_count($data[ $i ][ 'numbering' ], '.') == substr_count($data[ $j ][ 'numbering' ], '.') && |
|
|
|
count(array_filter(str_split($data[ $i ][ 'numbering' ]), 'is_numeric')) == count(array_filter(str_split($data[ $j ][ 'numbering' ]), 'is_numeric'))) { |
|
|
|
|
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
if ($this->hasNumbering($data[ $j ]) && ! $this->hasNumbering($data[ $i ]) && ! $data[ $i ][ 'spaces' ] && $data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] && ! in_array(substr($data[ $i ][ 'content' ], |
|
|
|
-1), [':'])) { |
|
|
|
|
|
|
|
if ( |
|
|
|
$this->hasNumbering($data[ $j ]) && |
|
|
|
! $this->hasNumbering($data[ $i ]) && |
|
|
|
! $data[ $i ][ 'spaces' ] && |
|
|
|
$data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] && |
|
|
|
! in_array(substr($data[ $i ][ 'content' ], -1), [':']) |
|
|
|
) { |
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
if ($this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1) { |
|
|
|
|
|
|
|
if ( |
|
|
|
$this->hasNumbering($data[ $j ]) && |
|
|
|
$this->hasNumbering($data[ $i ]) && |
|
|
|
((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1 |
|
|
|
) { |
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
if ($this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1) { |
|
|
|
if ( |
|
|
|
$this->hasNumbering($data[ $j ]) && |
|
|
|
$this->hasNumbering($data[ $i ]) && |
|
|
|
((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1 |
|
|
|
) { |
|
|
|
|
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
//Hardcoded breakpoints
|
|
|
|
if ($this->hasNumbering($data[ $j ]) && in_array(strtoupper(str_replace(['.', "\t", " "], '', |
|
|
|
$data[ $j ][ 'content' ])), $this->breakPoints)) { |
|
|
|
// Hardcoded breakpoints
|
|
|
|
if ( |
|
|
|
$this->hasNumbering($data[ $j ]) && |
|
|
|
in_array(strtoupper(str_replace(['.', "\t", " "], '', $data[ $j ][ 'content' ])), $this->breakPoints) |
|
|
|
) { |
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
//Hardcoded "Schedule break"
|
|
|
|
if (! $this->hasNumbering($data[ $j ]) && strpos(substr(trim(strtolower(utf8_encode($data[ $j ][ 'content' ]))), |
|
|
|
0, 10), 'schedule') !== false) { |
|
|
|
// Hardcoded "Schedule break"
|
|
|
|
if ( |
|
|
|
! $this->hasNumbering($data[ $j ]) && |
|
|
|
strpos(substr(trim(strtolower(utf8_encode($data[ $j ][ 'content' ]))), 0, 10), 'schedule') !== false |
|
|
|
) { |
|
|
|
break; |
|
|
|
} |
|
|
|
|
|
|
|
if (! $this->hasNumbering($data[ $j ]) && strpos(substr(trim($data[ $j ][ 'content' ]), 0, 15), |
|
|
|
'Exhibit') !== false && ! in_array(substr(trim($data[ $j ][ 'content' ]), -1), [ |
|
|
|
'.' |
|
|
|
])) { |
|
|
|
if ( |
|
|
|
! $this->hasNumbering($data[ $j ]) && |
|
|
|
strpos(substr(trim($data[ $j ][ 'content' ]), 0, 15), 'Exhibit') !== false && |
|
|
|
! in_array(substr(trim($data[ $j ][ 'content' ]), -1), ['.']) |
|
|
|
) { |
|
|
|
|
|
|
|
break; |
|
|
|
} |
|
|
|