breakPoints = $this->nestedUppercase($this->breakPoints);
        $this->pdf = $pdf;
    }

    /**
     * Read a text file and build the clause structure from its lines.
     *
     * Lines are split on any of "\r\n", "\r" or "\n" so the result does not
     * depend on the platform's PHP_EOL. Empty lines are dropped before parsing.
     *
     * @param string $filePath
     *
     * @return array|string the clause tree, or an empty string when the file
     *                      does not exist or cannot be read
     */
    public function fromFile($filePath)
    {
        if (! file_exists($filePath)) {
            Log::error('The given file does not exists!');

            return '';
        }

        $fileContent = file_get_contents($filePath);

        if ($fileContent === false) {
            Log::error('The given file could not be read!');

            return '';
        }

        // An explicit alternation is used instead of \R: in non-UTF-8 mode \R also
        // matches the 0x85 byte, which can occur inside multi-byte characters.
        $lines = preg_split('/\r\n|\r|\n/', $fileContent);

        return $this->buildTheStructure(array_filter($lines));
    }

    /**
     * Build the child structure and extract relevant data from the text content
     *
     * Every line becomes a paragraph array with the keys 'content' (trimmed text,
     * numbering removed), 'spaces' (count of leading whitespace characters) and,
     * when detected, 'numbering'. Following lines are attached to the current
     * paragraph through handlePossibleChild() until a boundary rule matches.
     *
     * @param $textAsArray
     *
     * @return array
     */
    private function buildTheStructure($textAsArray)
    {
        $textAsArray = array_values($textAsArray);
        $response = [];
        // Set of consumed line indexes, keyed by the index itself.
        $alreadyHandled = [];
        $countData = count($textAsArray);

        for ($i = 0; $i < $countData; $i++) {
            if (isset($alreadyHandled[$i])) {
                continue;
            }

            // A page marker is skipped, exactly like the inner loop does. Aborting
            // the whole loop here would drop every line of the document.
            if ($this->isPdfPageMarker($textAsArray[$i])) {
                $alreadyHandled[$i] = true;
                continue;
            }

            $current = $this->buildParagraphFromLine($textAsArray[$i]);

            for ($j = $i + 1; $j < $countData; $j++) {
                if (isset($alreadyHandled[$j])) {
                    continue;
                }

                if ($this->isPdfPageMarker($textAsArray[$j])) {
                    $alreadyHandled[$j] = true;
                    continue;
                }

                $candidate = $this->buildParagraphFromLine($textAsArray[$j]);

                if ($this->isClauseBoundary($current, $candidate)) {
                    break;
                }

                if (! $this->shouldAttachToParent($current, $candidate, $textAsArray, $j)) {
                    break;
                }

                $current = $this->handlePossibleChild($current, $candidate);
                $alreadyHandled[$j] = true;
            }

            if (strlen($current['content'])) {
                $response[] = $current;
            }

            $alreadyHandled[$i] = true;
        }

        return $this->recheckClauses($response);
    }

    /**
     * Tell whether a line is treated as a PDF page marker.
     *
     * Only applies in PDF mode; the line must contain both 'Page' and 'of'.
     *
     * @param string $line
     *
     * @return bool
     */
    private function isPdfPageMarker($line)
    {
        return $this->pdf
            && strpos($line, 'Page') !== false
            && strpos($line, 'of') !== false;
    }

    /**
     * Turn a raw line into a paragraph array.
     *
     * @param string $line
     *
     * @return array with 'content', 'spaces' and optionally 'numbering'
     */
    private function buildParagraphFromLine($line)
    {
        // Extract the content and count the number of the empty spaces from the beginning.
        $paragraph = [
            'content' => trim($line),
            'spaces'  => strlen($line) - strlen(ltrim($line)),
        ];

        // Remove numbering from the paragraph content
        if ($numbering = $this->getNumbering($line)) {
            $paragraph['numbering'] = $numbering;
            $paragraph['content'] = trim(ltrim(str_replace($numbering, '', $paragraph['content']), '.'));
        }

        return $paragraph;
    }

    /**
     * Tell whether $candidate must start a new top level paragraph instead of
     * being attached to $parent. The rules are evaluated in a fixed order.
     *
     * @param array $parent
     * @param array $candidate
     *
     * @return bool
     */
    private function isClauseBoundary($parent, $candidate)
    {
        // Break if both have numbering and the space is equal
        if (
            $candidate['spaces'] == $parent['spaces']
            && $this->hasNumbering($candidate)
            && $this->hasNumbering($parent)
            && substr_count($parent['numbering'], '.') == substr_count($candidate['numbering'], '.')
            && count(array_filter(str_split($parent['numbering']), 'is_numeric'))
                == count(array_filter(str_split($candidate['numbering']), 'is_numeric'))
        ) {
            return true;
        }

        // Numbered, indented line after an un-numbered, un-indented line that does not end with ':'
        if (
            $this->hasNumbering($candidate)
            && ! $this->hasNumbering($parent)
            && ! $parent['spaces']
            && $candidate['spaces'] > $parent['spaces']
            && ! in_array(substr($parent['content'], -1), [':'])
        ) {
            return true;
        }

        // The numbering jumps by at least one whole unit
        if (
            $this->hasNumbering($candidate)
            && $this->hasNumbering($parent)
            && ((float) $candidate['numbering'] - (float) $parent['numbering']) >= 1
        ) {
            return true;
        }

        // Hardcoded breakpoints
        // NOTE(review): the third needle contains a non-printing character before
        // the space (it looks like a zero width space) - keep it byte-for-byte.
        if (
            $this->hasNumbering($candidate)
            && in_array(strtoupper(str_replace(['.', "\t", "​ "], '', $candidate['content'])), $this->breakPoints)
        ) {
            return true;
        }

        // Hardcoded "Schedule break"
        if (
            ! $this->hasNumbering($candidate)
            && strpos(substr(trim(strtolower(utf8_encode($candidate['content']))), 0, 10), 'schedule') !== false
        ) {
            return true;
        }

        // Hardcoded "Exhibit" break, unless the line ends with a dot
        if (
            ! $this->hasNumbering($candidate)
            && strpos(substr(trim($candidate['content']), 0, 15), 'Exhibit') !== false
            && ! in_array(substr(trim($candidate['content']), -1), ['.'])
        ) {
            return true;
        }

        // Hardcoded "attachment" break
        if (strpos(substr(trim(strtolower($candidate['content'])), 0, 15), 'attachment') !== false) {
            return true;
        }

        // The last child is numbered with a dot and the candidate is numbered without one
        if ($this->hasNumbering($candidate) && $this->hasChild($parent)) {
            $lastChild = last($parent['children']);

            if (
                $this->hasNumbering($lastChild)
                && (is_numeric($lastChild['numbering']) && strpos($lastChild['numbering'], ".") !== false)
                && (is_numeric($candidate['numbering']) && strpos($candidate['numbering'], ".") === false)
            ) {
                return true;
            }
        }

        return false;
    }

    /**
     * Tell whether $candidate (the line at $index) has to be attached to $parent.
     * The rules are evaluated in a fixed order and the first match wins.
     *
     * @param array $parent
     * @param array $candidate
     * @param array $lines all raw lines, used to look at the lines after $index
     * @param int   $index position of the candidate inside $lines
     *
     * @return bool
     */
    private function shouldAttachToParent($parent, $candidate, $lines, $index)
    {
        // Deeper indentation and both paragraphs carry text
        if (
            $candidate['spaces'] > $parent['spaces']
            && strlen($parent['content'])
            && strlen($candidate['content'])
        ) {
            return true;
        }

        if (
            isset($lines[$index + 1])
            && $this->paragraphBetweenClauses($parent, $candidate, array_slice($lines, $index + 1))
        ) {
            return true;
        }

        if (
            $this->hasChild($parent)
            && $this->lastChildIsList($parent)
            && ($parent['spaces'] == 0 || $parent['spaces'] > $candidate['spaces'])
        ) {
            return true;
        }

        // Same indentation, but the candidate numbering has more dots or more digits
        if (
            $candidate['spaces'] == $parent['spaces']
            && isset($parent['numbering'])
            && isset($candidate['numbering'])
            && (
                substr_count($parent['numbering'], '.') < substr_count($candidate['numbering'], '.')
                || count(array_filter(str_split($parent['numbering']), 'is_numeric'))
                    < count(array_filter(str_split($candidate['numbering']), 'is_numeric'))
            )
        ) {
            return true;
        }

        // NOTE(review): substr(..., 0, 1) yields a single byte, so the multi-byte
        // '•' literal looks like it can never match here - confirm.
        if (
            $this->paragraphIsList($parent)
            && (
                ctype_lower(substr($candidate['content'], 0, 1))
                || in_array(substr($candidate['content'], 0, 1), ['{', '•'])
            )
        ) {
            return true;
        }

        if (
            $this->hasNumbering($parent)
            && $this->hasNumbering($candidate)
            && is_numeric($candidate['numbering'])
            && strpos($candidate['numbering'], ".") !== false
            && strpos($parent['numbering'], ".") === false
            && ! is_int($candidate['numbering'] - $parent['numbering'])
        ) {
            return true;
        }

        if (
            $this->hasChild($parent)
            && ($candidate['spaces'] == $this->getLastChildForParagraph($parent)['spaces'])
        ) {
            return true;
        }

        // Presumably utf8_encode() of the lead byte of a UTF-8 curly quote gives
        // 'â', which is how such quotes are detected here - TODO confirm.
        if (
            strpos(strtolower($parent['content']), 'definitions and') !== false
            && in_array(utf8_encode(substr($candidate['content'], 0, 1)), ['â', '"'])
        ) {
            return true;
        }

        if ($this->hasChild($parent) && $this->paragraphIsList($this->getLastChildFromParagraph($parent))) {
            return true;
        }

        return ($this->hasChild($parent) || $parent['spaces'] == $candidate['spaces'])
            && ! $this->hasNumbering($this->getLastChildForParagraph($parent))
            && ! $this->hasNumbering($candidate);
    }

    /**
     * Recheck missed clauses and assign them to a parent if is the case
     *
     * A numbered clause adopts the clauses that follow it when the next clause
     * has no numbering, or when the next clause has a multi level numeric
     * numbering while the current one has a single level numeric numbering.
     * Adoption stops at the next clause with a single level numeric numbering.
     *
     * @param $clauses
     *
     * @return array
     */
    private function recheckClauses($clauses)
    {
        $checkedClauses = [];
        // Set of consumed clause indexes, keyed by the index itself.
        $alreadyManaged = [];
        $total = count($clauses);

        for ($i = 0; $i < $total; $i++) {
            if (isset($alreadyManaged[$i])) {
                continue;
            }

            $clause = $clauses[$i];
            $j = $i + 1;

            $adoptsFollowing = isset($clauses[$j])
                && $clauses[$j]['content']
                && $this->hasNumbering($clause)
                && (
                    ! $this->hasNumbering($clauses[$j])
                    || (
                        is_numeric($clauses[$j]['numbering'])
                        && count(array_filter(explode('.', $clauses[$j]['numbering']))) > 1
                        && is_numeric($clauses[$i]['numbering'])
                        && count(array_filter(explode('.', $clauses[$i]['numbering']))) <= 1
                    )
                );

            if ($adoptsFollowing) {
                for (; $j < $total; $j++) {
                    // Stop at the next single level numeric clause
                    if (
                        isset($clauses[$j]['numbering'])
                        && is_numeric($clauses[$j]['numbering'])
                        && count(array_filter(explode('.', $clauses[$j]['numbering']))) == 1
                    ) {
                        break;
                    }

                    $clause['children'][] = $clauses[$j];
                    $alreadyManaged[$j] = true;
                }
            }

            $alreadyManaged[$i] = true;

            if ($clause['content']) {
                $checkedClauses[] = $clause;
            }
        }

        return $checkedClauses;
    }

    /**
     * Build the child structure based on the spaces before the text
     *
     * Depending on the heuristics below the child is either merged into the text
     * of the parent (or of its last child), appended as a new child of the
     * parent, or pushed further down into the last child.
     *
     * @param $parent
     * @param $child
     *
     * @return mixed the updated parent
     */
    private function handlePossibleChild($parent, $child)
    {
        if (empty($child['content'])) {
            return $parent;
        }

        $childText = trim($child['content']);
        $childFirst = substr($childText, 0, 1);
        $childLast = substr($childText, -1);

        // Last character of the deepest last child; this is the parent itself
        // when the parent has no children.
        $tailLast = substr(trim($this->getLastChildForParagraph($parent)['content']), -1);

        $tailAllowsContinuation = in_array($tailLast, ['}', ')', ',', '"'])
            || ! in_array($tailLast, ['.', ':', '!'])
            || ctype_lower($tailLast);

        if (
            $this->pdf
            && ! isset($parent['children'])
            && (
                ctype_lower($childFirst)
                || in_array($childFirst, ['}', ')'])
                || is_numeric($childFirst)
                || in_array($childLast, ['.', ',', ':'])
                || ((! in_array($childLast, ['.', ',', ':'])) && $child['spaces'] > $parent['spaces'])
            )
            && $tailAllowsContinuation
        ) {
            // The child is a wrapped line of the parent text
            $parent['content'] .= ' '.$child['content'];

            return $parent;
        } elseif (
            $this->pdf
            && isset($parent['children'])
            && (
                ctype_lower($childFirst)
                || in_array($childFirst, ['}', ')'])
                || is_numeric($childFirst)
                || in_array($childLast, ['.', ',', ':'])
            )
            && $tailAllowsContinuation
        ) {
            // Hardcoded special case. The parent may have no numbering at all,
            // which is why the key is read with a fallback.
            if (strpos($child['content'], 'thirty') !== false && ($parent['numbering'] ?? null) !== '1.') {
                $lastParentChild = last($parent['children']);
                $lastParentChild['content'] .= ' '.$child['content'];
                $parent['children'][count($parent['children']) - 1] = $lastParentChild;

                return $parent;
            }
        } elseif (
            $this->pdf
            && ! isset($parent['children'])
            && $child['spaces'] >= $parent['spaces']
            && ! $this->hasNumbering($child)
        ) {
            // The parent has no children in this branch, so the text is merged into the parent
            $parent['content'] .= ' '.$child['content'];

            return $parent;
        }

        if (! isset($parent['children'])) {
            $parent['children'][] = $child;

            return $parent;
        }

        $lastIndex = count($parent['children']) - 1;
        $lastParentChild = last($parent['children']);

        if (
            $this->lastChildIsList($parent)
            && (
                ctype_lower($childFirst)
                || in_array($childLast, [';'])
                || strpos($child['content'], ':') !== false
                || in_array($childFirst, ['{', '('])
            )
            && ! $this->hasNumbering($child)
        ) {
            if (! isset($lastParentChild['children'])) {
                $lastParentChild['children'][] = $child;
                $parent['children'][$lastIndex] = $lastParentChild;

                return $parent;
            }

            if (
                ! in_array(substr(last($lastParentChild['children'])['content'], -1), ['.', ';', ','])
                && ! in_array($childFirst, ['(', '{', ':'])
                && ! $this->hasNumbering($child)
            ) {
                // Continue the text of the last list item
                $lastParentChild['children'][count($lastParentChild['children']) - 1]['content']
                    .= ' '.trim($child['content']);
            } else {
                $lastParentChild['children'][] = $child;
            }

            $parent['children'][$lastIndex] = $lastParentChild;

            return $parent;
        }

        $lastChildText = trim($lastParentChild['content']);
        $bothNumbered = $this->hasNumbering($lastParentChild) && $this->hasNumbering($child);

        // A numbered last child ending with ':' takes a child whose numbering has more digits
        if (
            $bothNumbered
            && substr($lastChildText, -1) == ':'
            && count(array_filter(str_split($lastParentChild['numbering']), 'is_numeric'))
                < count(array_filter(str_split($child['numbering']), 'is_numeric'))
        ) {
            $lastParentChild['children'][] = $child;
            $parent['children'][$lastIndex] = $lastParentChild;

            return $parent;
        }

        if ($lastParentChild['spaces'] == $child['spaces']) {
            if ($bothNumbered) {
                // Compare the text that follows the last dot of each numbering
                $childStep = (int) substr($child['numbering'], strrpos($child['numbering'], '.') + 1);
                $lastStep = (int) substr(
                    $lastParentChild['numbering'],
                    strrpos($lastParentChild['numbering'], '.') + 1
                );

                $isNextSibling = ($childStep - $lastStep == 1)
                    || (
                        in_array(utf8_encode(substr($lastParentChild['content'], 0, 1)), ['â', '"', '{'])
                        && in_array(utf8_encode(substr($child['content'], 0, 1)), ['â', '"', '{', '•'])
                    );

                if ($isNextSibling) {
                    $parent['children'][] = $child;
                } else {
                    $lastParentChild['children'][] = $child;
                    $parent['children'][$lastIndex] = $lastParentChild;
                }
            } else {
                $lastParentChild['content'] .= ' '.$child['content'];
                $parent['children'][$lastIndex] = $lastParentChild;
            }
        } elseif (
            // NOTE(review): the parentheses only make the existing operator
            // precedence explicit; confirm this grouping is the intended one.
            (
                ! $this->hasNumbering($child)
                && ! in_array(substr($lastChildText, 0, 1), ['.', ';', '}'])
                && ctype_lower(substr($lastChildText, -1))
            )
            || (
                in_array(substr($lastChildText, -1), [','])
                && (ctype_lower($childFirst) || in_array($childFirst, ['{', '(', ')']))
            )
        ) {
            $lastParentChild['content'] .= ' '.$child['content'];
            $parent['children'][$lastIndex] = $lastParentChild;
        } else {
            if ($this->hasChild($parent) && in_array($tailLast, ['.', ';', '}'])) {
                $lastParentChild['children'][] = $child;
                $parent['children'][$lastIndex] = $lastParentChild;

                return $parent;
            }

            // Let the last child decide what to do with the new paragraph
            $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);
            $parent['children'][$lastIndex] = $lastParentChild;
        }

        return $parent;
    }

    /**
     * Check if paragraph is a list
     *
     * A paragraph counts as a list when its trimmed content ends with ':'.
     *
     * @param $paragraph
     *
     * @return bool
     */
    private function paragraphIsList($paragraph)
    {
        return substr(trim($paragraph['content']), -1) === ':';
    }

    /**
     * Check if last child from the paragraph is a list
     *
     * @param $paragraph
     *
     * @return bool
     */
    private function lastChildIsList($paragraph)
    {
        if (! $this->hasChild($paragraph)) {
            return false;
        }

        return $this->paragraphIsList(last($paragraph['children']));
    }

    /**
     * Get the deepest last child of a paragraph
     *
     * Returns the paragraph itself when it has no children.
     *
     * @param $paragraph
     *
     * @return mixed
     */
    private function getLastChildForParagraph($paragraph)
    {
        return $this->getLastChildFromParagraph($paragraph);
    }

    /**
     * Check if a paragraph has any child
     *
     * @param $paragraph
     *
     * @return bool
     */
    private function hasChild($paragraph)
    {
        return isset($paragraph['children']);
    }

    /**
     * Extract numbering from a given paragraph
     *
     * return false if has no numbering
     *
     * @param $paragraph
     *
     * @return false|mixed
     */
    private function getNumbering($paragraph)
    {
        if (! isset($paragraph)) {
            return false;
        }

        $paragraphContent = trim($paragraph);

        // Paragraphs that open with a bracket are never treated as numbered
        if (in_array(substr($paragraphContent, 0, 1), ['(', '{'])) {
            return false;
        }

        // Page markers of a PDF are not numbering
        if (
            $this->pdf
            && strpos($paragraphContent, 'Page') !== false
            && strpos($paragraphContent, 'of') !== false
        ) {
            return false;
        }

        // Drop everything from the first '{', turn ')' into '.', keep only
        // letters, digits, '.' and ')', look at the first 6 characters and then
        // keep only digits, '.' and ')'.
        $withoutBraces = preg_replace("/\{.+/", "", trim($paragraphContent));
        $closingAsDots = preg_replace('/\)/', '.', $withoutBraces);
        $compact = preg_replace('/[^A-Za-z0-9.)]/', '', $closingAsDots);
        $candidate = preg_replace('/[^0-9\.)]/', '', substr(trim($compact), 0, 6));

        preg_match(
            '/^([-+]?\d*\.?\d+?\d*\.?\d+|\d+(\.?)*)(?:[eE]([-+]?\d+))?/',
            $candidate,
            $paragraphNumbering
        );

        if (! count($paragraphNumbering)) {
            return false;
        }

        $numbering = $paragraphNumbering[0];

        $looksLikeNumbering = in_array(substr($paragraphContent, strlen($numbering), 1), [' ', "\t", '.', ')'])
            || in_array(substr($numbering, -1), [' ', "\t", '.', ')'])
            || is_numeric($numbering);

        if (! $looksLikeNumbering) {
            return false;
        }

        // Reject a number wrapped in parentheses, e.g. "(1)". The position has to
        // be a real offset greater than zero: with 0 or false the look-behind
        // would wrap around and inspect the last character of the paragraph.
        $locationOfNumbering = strpos($paragraphContent, $numbering);

        if (
            $locationOfNumbering !== false
            && $locationOfNumbering > 0
            && substr($paragraphContent, $locationOfNumbering - 1, 1) == '('
            && substr($paragraphContent, $locationOfNumbering + 1, 1) == ')'
        ) {
            return false;
        }

        return str_replace('..', '.', $numbering);
    }

    /**
     * Check if a paragraph is between clauses
     *
     * Only the first numbered line found in $list is inspected.
     *
     * @param $first
     * @param $paragraph
     * @param $list
     *
     * @return bool
     */
    private function paragraphBetweenClauses($first, $paragraph, $list)
    {
        if (! $this->hasNumbering($first) || isset($paragraph['numbering'])) {
            return false;
        }

        $firstNumberingString = $this->getLastChildFromParagraph($first);

        if (! isset($firstNumberingString['numbering'])) {
            return false;
        }

        $firstNumbering = last(array_filter(explode('.', $firstNumberingString['numbering'])));
        $firstDots = substr_count($firstNumberingString['numbering'], '.');

        foreach ($list as $lastParagraph) {
            $lastParagraphNumberingString = $this->getNumbering($lastParagraph);

            if (! $lastParagraphNumberingString) {
                continue;
            }

            $lastParagraphNumbering = last(array_filter(explode('.', $lastParagraphNumberingString)));
            $lastDots = substr_count($lastParagraphNumberingString, '.');

            // Either the next number on the same level, or a shallower level
            if ($lastParagraphNumbering - $firstNumbering == 1 && $firstDots == $lastDots) {
                return true;
            }

            return $firstDots > $lastDots;
        }

        return false;
    }

    /**
     * Walk down the last child of every level and return the deepest one
     *
     * Returns the paragraph itself when it has no children.
     *
     * @param $paragraph
     *
     * @return mixed
     */
    private function getLastChildFromParagraph($paragraph)
    {
        if (isset($paragraph['children'])) {
            return $this->getLastChildFromParagraph(last($paragraph['children']));
        }

        return $paragraph;
    }

    /**
     * Append the content of $append to the deepest last child of the paragraph
     *
     * When the paragraph has no children the text is appended to the paragraph
     * itself. The updated paragraph is returned in both cases.
     *
     * @param $paragraph
     * @param $append
     *
     * @return mixed
     */
    private function appendToLastChildFromParagraph($paragraph, $append)
    {
        if (isset($paragraph['children']) && count($paragraph['children'])) {
            $lastIndex = count($paragraph['children']) - 1;
            $paragraph['children'][$lastIndex] = $this->appendToLastChildFromParagraph(
                $paragraph['children'][$lastIndex],
                $append
            );

            return $paragraph;
        }

        $paragraph['content'] .= ' '.$append['content'];

        return $paragraph;
    }

    /**
     * Check if a paragraph has numbering
     *
     * @param $paragraph
     *
     * @return bool
     */
    private function hasNumbering($paragraph)
    {
        return ! empty($paragraph['numbering']);
    }

    /**
     * Uppercase all values in the array
     *
     * Dots are removed from every value; nested arrays are handled recursively.
     *
     * @param $value
     *
     * @return array|string
     */
    private function nestedUppercase($value)
    {
        if (is_array($value)) {
            return array_map([$this, 'nestedUppercase'], $value);
        }

        //remove unwanted chars
        return strtoupper(str_replace(['.'], '', $value));
    }
}