Repo for the search and displace ingest module, which takes ODF, DOCX and PDF files and transforms them into .md so they can be used with search and displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

774 lines
29 KiB

  1. <?php
  2. namespace App\Parser;
  3. use Illuminate\Support\Facades\Log;
  4. class ParseTextArray
  5. {
  6. /**
  7. * @var array
  8. */
  9. private $breakPoints = [
  10. 'TERMS OF THE {P1_Pros}',
  11. 'TERMS AND CONDITIONS',
  12. 'BACKGROUND',
  13. 'OPERATIVE PROVISIONS',
  14. 'Products and/or Services',
  15. 'PAYMENT',
  16. 'GRANT OF LICENCE',
  17. 'TERM OF LICENCE AGREEMENT',
  18. 'ROYALTY',
  19. 'PAYMENT',
  20. 'PERFORMANCE TARGETS',
  21. 'STATIONERY',
  22. 'QUALITY CONTROL',
  23. 'THE DISTRIBUTOR\'S OBLIGATIONS',
  24. 'NON SOLICITATION',
  25. 'SALE OF BUSINESS',
  26. 'TERMINATION OF AGREEMENT',
  27. 'CONDITIONS FOLLOWING TERMINATION',
  28. 'RESTRAINT',
  29. 'TIME OF ESSENCE AND NOTICES',
  30. 'INTERPRETATION',
  31. 'ARBITRATION',
  32. 'DOMICILIUM AND REGISTERED OFFICE',
  33. 'USE OF TRADE MARKS, TRADE NAME, GOODWILL AND KNOW-HOW',
  34. 'GENERAL',
  35. 'DESCRIPTION OF {P2_NAME} INFORMATION',
  36. 'PAYMENT OF FEES',
  37. 'SUPPLIER\'S STATUS',
  38. 'SUPPLIER\’S OBLIGATIONS',
  39. 'DEFINITIONS AND INTERPRETATION',
  40. 'DEFINITIONS',
  41. 'CONFIDENTIALITY',
  42. 'TERMINATION',
  43. 'RESTRICTIVE COVENANTS AND INTELLECTUAL PROPERTY',
  44. 'DETAILS AND IDENTITY OF CONSULTANT',
  45. 'ANTI-BRIBERY',
  46. 'ASSIGNMENT SCHEDULE',
  47. 'SCHEDULE 1',
  48. '{P1_NAME}\'S LIABILITY',
  49. 'DURATION OF AGREEMENT AND SUPPLY',
  50. 'SUPPLY OF HARDWARE',
  51. 'SUPPLY OF SOFTWARE AND DOCUMENTATION',
  52. 'SUPPLY OF SUPPORT SERVICES',
  53. 'INTELLECTUAL PROPERTY RIGHTS',
  54. 'THE CONTRACT',
  55. '{P1_NAME}\U2019S LIABILITY',
  56. 'UPDATES',
  57. 'TERMS OF THE {P1_NAME} PRODUCTS.',
  58. 'CUSTOMER RESPONSIBILITIES',
  59. 'EXHIBIT A',
  60. 'EXHIBIT A-1',
  61. 'EXHIBIT A-2',
  62. 'WARRANTIES',
  63. 'EXIT, TERMINATION AND SUSPENSION',
  64. 'EXHIBIT B',
  65. 'EXHIBIT B-1',
  66. 'EXHIBIT B-2',
  67. 'COUNTERPARTS',
  68. 'LICENSE GRANT',
  69. 'INDEMNIFICATION BY CUSTOMER',
  70. 'TERMS OF THE {P1_NAME} PRODUCTS',
  71. 'TERMS OF CLOUD SERVICE',
  72. 'INDEMNIFICATION BY CUSTOMER',
  73. 'TERMINATION',
  74. 'TERMS OF THE {P1_PROS}',
  75. 'SUPPORT',
  76. 'SUB CONTRACTING AND THIRD PARTY RECOMMENDATIONS',
  77. 'LICENCE AND ACCESS TO SOFTWARE AND HARDWARE',
  78. 'DECLARATION OF NON-LIAISON AND ANTI-CORRUPTION COMMITMENT',
  79. '{P1_NAME}\'S DUTIES',
  80. 'ana are mere',
  81. 'definitions',
  82. 'fees',
  83. 'ENGAGEMENT',
  84. 'DUTIES',
  85. 'TERMINATION',
  86. 'STATEMENTS',
  87. 'CONFIDENTIALITY',
  88. 'Human rights',
  89. 'Labour',
  90. 'Environment',
  91. 'Anti-corruption',
  92. 'Services',
  93. 'Scope of the Agreement',
  94. 'Staffing Levels for Services','Indemnification'
  95. ];
  96. /**
  97. * @var bool
  98. */
  99. private $pdf;
  100. /**
  101. * ParseTextArray constructor.
  102. *
  103. * @param bool $pdf
  104. */
  105. public function __construct($pdf = false)
  106. {
  107. $this->breakPoints = $this->nestedUppercase($this->breakPoints);
  108. $this->pdf = $pdf;
  109. }
  110. public function fromFile($filePath)
  111. {
  112. if (file_exists($filePath)) {
  113. $fileContent = file_get_contents($filePath);
  114. return $this->buildTheStructure(array_filter(explode(PHP_EOL, $fileContent)));
  115. } else {
  116. Log::error('The given file dose not exists!');
  117. }
  118. }
  119. /**
  120. * Build the child structure and extract relevant data from the text content
  121. *
  122. *
  123. * @param $textAsArray
  124. *
  125. * @return array
  126. */
  127. private function buildTheStructure($textAsArray)
  128. {
  129. $textAsArray = array_values($textAsArray);
  130. $response = [];
  131. $alreadyHandled = [];
  132. $countData = count($textAsArray);
  133. for ($i = 0; $i < $countData; $i++) {
  134. if (array_key_exists($i, $alreadyHandled)) {
  135. continue;
  136. }
  137. //extract the content and count the number of the empty spaces from the begining
  138. $data[ $i ] = [
  139. 'content' => trim($textAsArray[ $i ]),
  140. 'spaces' => strlen($textAsArray[ $i ]) - strlen(ltrim($textAsArray[ $i ]))
  141. ];
  142. //Remove numbering from the paragraph content
  143. if ($numbering = $this->getNumbering($textAsArray[ $i ])) {
  144. $data[ $i ][ 'numbering' ] = $numbering;
  145. $data[ $i ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $i ][ 'content' ]), '.'));
  146. }
  147. if ($this->pdf && strpos($textAsArray[ $i ], 'Page') !== false && strpos($textAsArray[ $i ],
  148. 'of') !== false) {
  149. $alreadyHandled[] = $i;
  150. break;
  151. }
  152. $j = $i + 1;
  153. if (isset($textAsArray[ $j ])) {
  154. for ($j; $j < $countData; $j++) {
  155. if (array_key_exists($j, $alreadyHandled)) {
  156. continue;
  157. }
  158. if (
  159. $this->pdf &&
  160. isset($textAsArray[ $j ]) &&
  161. strpos($textAsArray[ $j ], 'Page') !== false &&
  162. strpos($textAsArray[ $j ], 'of') !== false
  163. ) {
  164. $alreadyHandled[] = $j;
  165. continue;
  166. }
  167. // Extract the content and count the number of the empty spaces from the beginning
  168. $data[ $j ] = [
  169. 'content' => trim($textAsArray[ $j ]),
  170. 'spaces' => strlen($textAsArray[ $j ]) - strlen(ltrim($textAsArray[ $j ]))
  171. ];
  172. // Remove numbering from the paragraph content
  173. if ($numbering = $this->getNumbering($textAsArray[ $j ])) {
  174. $data[ $j ][ 'numbering' ] = $numbering;
  175. $data[ $j ][ 'content' ] = trim(
  176. ltrim(str_replace($numbering, '', $data[ $j ][ 'content' ]), '.')
  177. );
  178. }
  179. // Break if both have numbering and the space is equal
  180. if (
  181. $data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] &&
  182. $this->hasNumbering($data[ $j ]) &&
  183. $this->hasNumbering($data[ $i ]) &&
  184. substr_count($data[ $i ][ 'numbering' ], '.') == substr_count($data[ $j ][ 'numbering' ], '.') &&
  185. count(array_filter(str_split($data[ $i ][ 'numbering' ]), 'is_numeric')) == count(array_filter(str_split($data[ $j ][ 'numbering' ]), 'is_numeric'))) {
  186. break;
  187. }
  188. if (
  189. $this->hasNumbering($data[ $j ]) &&
  190. ! $this->hasNumbering($data[ $i ]) &&
  191. ! $data[ $i ][ 'spaces' ] &&
  192. $data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] &&
  193. ! in_array(substr($data[ $i ][ 'content' ], -1), [':'])
  194. ) {
  195. break;
  196. }
  197. if (
  198. $this->hasNumbering($data[ $j ]) &&
  199. $this->hasNumbering($data[ $i ]) &&
  200. ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1
  201. ) {
  202. break;
  203. }
  204. if (
  205. $this->hasNumbering($data[ $j ]) &&
  206. $this->hasNumbering($data[ $i ]) &&
  207. ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1
  208. ) {
  209. break;
  210. }
  211. // Hardcoded breakpoints
  212. if (
  213. $this->hasNumbering($data[ $j ]) &&
  214. in_array(strtoupper(str_replace(['.', "\t", ""], '', $data[ $j ][ 'content' ])), $this->breakPoints)
  215. ) {
  216. break;
  217. }
  218. // Hardcoded "Schedule break"
  219. if (
  220. ! $this->hasNumbering($data[ $j ]) &&
  221. strpos(substr(trim(strtolower(utf8_encode($data[ $j ][ 'content' ]))), 0, 10), 'schedule') !== false
  222. ) {
  223. break;
  224. }
  225. if (
  226. ! $this->hasNumbering($data[ $j ]) &&
  227. strpos(substr(trim($data[ $j ][ 'content' ]), 0, 15), 'Exhibit') !== false &&
  228. ! in_array(substr(trim($data[ $j ][ 'content' ]), -1), ['.'])
  229. ) {
  230. break;
  231. }
  232. if (strpos(substr(trim(strtolower($data[ $j ][ 'content' ])), 0, 15), 'attachment') !== false) {
  233. break;
  234. }
  235. if ($this->hasNumbering($data[ $j ]) && $this->hasChild($data[ $i ])) {
  236. if ($this->hasNumbering(last($data[ $i ][ 'children' ])) && (is_numeric(last($data[ $i ][ 'children' ])[ 'numbering' ]) && strpos(last($data[ $i ][ 'children' ])[ 'numbering' ],
  237. ".") !== false) && (is_numeric($data[ $j ][ 'numbering' ]) && strpos($data[ $j ][ 'numbering' ],
  238. ".") === false)) {
  239. break;
  240. }
  241. }
  242. if ($data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] && strlen($data[ $i ][ 'content' ]) && strlen($data[ $j ][ 'content' ])) {
  243. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  244. $alreadyHandled[] = $j;
  245. } elseif (isset($textAsArray[ $j + 1 ]) && $this->paragraphBetweenClauses($data[ $i ], $data[ $j ],
  246. array_slice($textAsArray, $j + 1))) {
  247. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  248. $alreadyHandled[] = $j;
  249. } elseif ($this->hasChild($data[ $i ]) && $this->lastChildIsList($data[ $i ]) && ($data[ $i ][ 'spaces' ] == 0 || $data[ $i ][ 'spaces' ] > $data[ $j ][ 'spaces' ])) {
  250. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  251. $alreadyHandled[] = $j;
  252. } elseif ($data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] && isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ]) && (substr_count($data[ $i ][ 'numbering' ],
  253. '.') < substr_count($data[ $j ][ 'numbering' ],
  254. '.') || count(array_filter(str_split($data[ $i ][ 'numbering' ]),
  255. 'is_numeric')) < count(array_filter(str_split($data[ $j ][ 'numbering' ]),
  256. 'is_numeric')))) {
  257. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  258. $alreadyHandled[] = $j;
  259. } else {
  260. if ($this->paragraphIsList($data[ $i ]) && (ctype_lower(substr($data[ $j ][ 'content' ], 0,
  261. 1)) || in_array(substr($data[ $j ][ 'content' ], 0, 1), ['{', '•']))) {
  262. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  263. $alreadyHandled[] = $j;
  264. } elseif ($this->hasNumbering($data[ $i ]) && $this->hasNumbering($data[ $j ]) && is_numeric($data[ $j ][ 'numbering' ]) && strpos($data[ $j ][ 'numbering' ],
  265. ".") !== false && strpos($data[ $i ][ 'numbering' ],
  266. ".") === false && ! is_int($data[ $j ][ 'numbering' ] - $data[ $i ][ 'numbering' ])) {
  267. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  268. $alreadyHandled[] = $j;
  269. } elseif ($this->hasChild($data[ $i ]) && ($data[ $j ][ 'spaces' ] == $this->getLastChildForParagraph($data[ $i ])[ 'spaces' ])) {
  270. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  271. $alreadyHandled[] = $j;
  272. } elseif (strpos(strtolower($data[ $i ][ 'content' ]),
  273. 'definitions and') !== false && in_array(utf8_encode(substr($data[ $j ][ 'content' ], 0,
  274. 1)), ['â', '"'])) {
  275. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  276. $alreadyHandled[] = $j;
  277. } elseif ($this->hasChild($data[ $i ]) && $this->paragraphIsList($this->getLastChildFromParagraph($data[ $i ]))) {
  278. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  279. $alreadyHandled[] = $j;
  280. } elseif (($this->hasChild($data[ $i ]) || $data[ $i ][ 'spaces' ] == $data[ $j ][ 'spaces' ]) && ! $this->hasNumbering($this->getLastChildForParagraph($data[ $i ])) && ! $this->hasNumbering($data[ $j ])) {
  281. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  282. $alreadyHandled[] = $j;
  283. } else {
  284. break;
  285. }
  286. }
  287. }
  288. }
  289. if (strlen($data[ $i ][ 'content' ])) {
  290. $response[] = $data[ $i ];
  291. }
  292. $alreadyHandled[] = $i;
  293. }
  294. return $this->recheckClauses($response);
  295. }
  296. /**
  297. * Recheck missed clauses and assign them to a parent if is the case
  298. *
  299. * @param $clauses
  300. *
  301. * @return array
  302. */
  303. private function recheckClauses($clauses)
  304. {
  305. $checkedClauses = [];
  306. $alreadyManaged = [];
  307. for ($i = 0; $i < count($clauses); $i++) {
  308. if (array_key_exists($i, $alreadyManaged)) {
  309. continue;
  310. }
  311. $data [ $i ] = $clauses[ $i ];
  312. $j = $i + 1;
  313. if (isset($clauses[ $j ]) && $clauses[ $j ][ 'content' ] && $this->hasNumbering($data[ $i ]) && ((! $this->hasNumbering($clauses[ $j ])) || (($this->hasNumbering($clauses[ $j ]) && is_numeric($clauses[ $j ][ 'numbering' ]) && count(array_filter(explode('.',
  314. $clauses[ $j ][ 'numbering' ]))) > 1 && is_numeric($clauses[ $i ][ 'numbering' ]) && count(array_filter(explode('.',
  315. $clauses[ $i ][ 'numbering' ]))) <= 1)))) {
  316. for ($j; $j < count($clauses); $j++) {
  317. if (isset($clauses[ $j ][ 'numbering' ]) && is_numeric($clauses[ $j ][ 'numbering' ]) && count(array_filter(explode('.',
  318. $clauses[ $j ][ 'numbering' ]))) == 1) {
  319. break;
  320. }
  321. $data[ $i ][ 'children' ][] = $clauses[ $j ];
  322. $alreadyManaged[] = $j;
  323. }
  324. }
  325. $alreadyManaged[] = $i;
  326. if ($data[ $i ][ 'content' ]) {
  327. $checkedClauses[] = $data[ $i ];
  328. }
  329. }
  330. return $checkedClauses;
  331. }
  332. /**
  333. * Build the child structure based on the spaces before the text
  334. *
  335. * @param $parent
  336. * @param $child
  337. *
  338. *
  339. * @return mixed
  340. */
  341. private function handlePossibleChild($parent, $child)
  342. {
  343. if (empty($child[ 'content' ])) {
  344. return $parent;
  345. }
  346. if ($this->pdf && ! isset($parent[ 'children' ]) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
  347. 1)) || in_array(substr(trim($child[ 'content' ]), 0, 1),
  348. ['}', ')']) || is_numeric(substr(trim($child[ 'content' ]), 0,
  349. 1)) || in_array(substr(trim($child[ 'content' ]), -1),
  350. ['.', ',', ':']) || (! in_array(substr(trim($child[ 'content' ]), -1), [
  351. '.',
  352. ',',
  353. ':'
  354. ])) && $child[ 'spaces' ] > $parent[ 'spaces' ]) && ((in_array(substr(trim($parent[ 'content' ]),
  355. -1), ['}', ')', ',', '"']) || ! in_array(substr(trim($parent[ 'content' ]), -1),
  356. ['.', ':', '!']) || ctype_lower(substr(trim($parent[ 'content' ]), -1))))) {
  357. //dd($parent,$child);
  358. $parent[ 'content' ] .= ' '.$child[ 'content' ];
  359. return $parent;
  360. } elseif ($this->pdf && isset($parent[ 'children' ]) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
  361. 1)) || in_array(substr(trim($child[ 'content' ]), 0, 1),
  362. ['}', ')']) || is_numeric(substr(trim($child[ 'content' ]), 0,
  363. 1)) || in_array(substr(trim($child[ 'content' ]), -1),
  364. ['.', ',', ':'])) && ((in_array(substr(trim($this->getLastChildForParagraph($parent)[ 'content' ]),
  365. -1), [
  366. '}',
  367. ')',
  368. ',',
  369. '"'
  370. ]) || ! in_array(substr(trim($this->getLastChildForParagraph($parent)[ 'content' ]), -1),
  371. ['.', ':', '!']) || ctype_lower(substr(trim($this->getLastChildForParagraph($parent)[ 'content' ]),
  372. -1))))) {
  373. if (strpos($child[ 'content' ], 'thirty') !== false && $parent[ 'numbering' ] !== '1.') {
  374. $lastParentChild = last($parent[ 'children' ]);
  375. $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
  376. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  377. return $parent;
  378. }
  379. } elseif ($this->pdf && ! isset($parent[ 'children' ]) && $child[ 'spaces' ] >= $parent[ 'spaces' ] && ! $this->hasNumbering($child)) {
  380. if ($this->hasChild($parent)) {
  381. $lastParentChild = $this->getLastChildForParagraph($parent);
  382. $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
  383. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  384. } else {
  385. $parent[ 'content' ] .= ' '.$child[ 'content' ];
  386. }
  387. return $parent;
  388. }
  389. if (! isset($parent[ 'children' ])) {
  390. $parent[ 'children' ][] = $child;
  391. return $parent;
  392. }
  393. $lastParentChild = last($parent[ 'children' ]);
  394. if ($this->lastChildIsList($parent) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
  395. 1)) || in_array(substr(trim($child[ 'content' ]), -1), [';']) || strpos($child[ 'content' ],
  396. ':') !== false || in_array(trim(substr(trim($child[ 'content' ]), 0, 1)),
  397. ['{', '('])) && ! $this->hasNumbering($child)) {
  398. if (! isset($lastParentChild[ 'children' ])) {
  399. $lastParentChild[ 'children' ][] = $child;
  400. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  401. return $parent;
  402. }
  403. if (isset($lastParentChild[ 'children' ]) && ! in_array(substr(last($lastParentChild[ 'children' ])[ 'content' ],
  404. -1), ['.', ';', ',']) && ! in_array(substr(trim($child[ 'content' ]), 0, 1),
  405. ['(', '{', ':']) && ! $this->hasNumbering($child)) {
  406. $lastParentChild[ 'children' ][ count($lastParentChild[ 'children' ]) - 1 ][ 'content' ] .= ' '.trim($child[ 'content' ]);
  407. } else {
  408. $lastParentChild[ 'children' ][] = $child;
  409. }
  410. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  411. return $parent;
  412. }
  413. if ($this->hasNumbering($lastParentChild) && $this->hasNumbering($child) && substr(trim($lastParentChild[ 'content' ]),
  414. -1) == ':' && count(array_filter(str_split($lastParentChild[ 'numbering' ]),
  415. 'is_numeric')) < count(array_filter(str_split($child[ 'numbering' ]), 'is_numeric'))) {
  416. $lastParentChild[ 'children' ][] = $child;
  417. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  418. return $parent;
  419. }
  420. if ($lastParentChild[ 'spaces' ] == $child[ 'spaces' ]) {
  421. if ($this->hasNumbering($lastParentChild) && $this->hasNumbering($child) && (in_array(substr(trim($lastParentChild[ 'content' ]),
  422. -1), ['.', ';']) || $this->hasNumbering($child))) {
  423. if (($this->hasNumbering($lastParentChild) && $this->hasNumbering($child) && ((int) substr($child[ 'numbering' ],
  424. strrpos($child[ 'numbering' ], '.') + 1) - (int) substr($lastParentChild[ 'numbering' ],
  425. strrpos($lastParentChild[ 'numbering' ],
  426. '.') + 1) == 1)) || (in_array(utf8_encode(substr($lastParentChild[ 'content' ], 0,
  427. 1)), ['â', '"', '{']) && in_array(utf8_encode(substr($child[ 'content' ], 0, 1)),
  428. ['â', '"', '{', '•']))) {
  429. $parent[ 'children' ][] = $child;
  430. } else {
  431. $lastParentChild[ 'children' ][] = $child;
  432. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  433. }
  434. } else {
  435. $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
  436. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  437. }
  438. } elseif (! $this->hasNumbering($child) && ! in_array(substr(trim($lastParentChild[ 'content' ]), 0, 1),
  439. ['.', ';', '}']) && (ctype_lower(substr(trim($lastParentChild[ 'content' ]),
  440. -1))) || in_array(substr(trim($lastParentChild[ 'content' ]), -1),
  441. [',']) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
  442. 1)) || in_array(substr(trim($child[ 'content' ]), 0, 1), ['{', '(', ')']))) {
  443. $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
  444. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  445. } else {
  446. if ($this->hasChild($parent) && in_array(substr(trim($this->getLastChildForParagraph($parent)[ 'content' ]),
  447. -1), ['.', ';', '}'])) {
  448. $lastParentChild[ 'children' ][] = $child;
  449. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  450. return $parent;
  451. }
  452. $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);
  453. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  454. }
  455. return $parent;
  456. }
  457. /**
  458. * Check if paragraph is a list
  459. *
  460. * @param $paragraph
  461. *
  462. * @return bool
  463. */
  464. private function paragraphIsList($paragraph)
  465. {
  466. if (substr(trim($paragraph[ 'content' ]), -1) == ':') {
  467. return true;
  468. }
  469. return false;
  470. }
  471. /**
  472. * Check if last child from the paragraph is a list
  473. *
  474. * @param $paragraph
  475. *
  476. * @return bool
  477. */
  478. private function lastChildIsList($paragraph)
  479. {
  480. if ($this->hasChild($paragraph)) {
  481. $lastParentChild = last($paragraph[ 'children' ]);
  482. if (substr(trim($lastParentChild[ 'content' ]), -1) == ':') {
  483. return true;
  484. }
  485. }
  486. return false;
  487. }
  488. private function getLastChildForParagraph($paragraph)
  489. {
  490. if ($this->hasChild($paragraph)) {
  491. $lastParentChild = last($paragraph[ 'children' ]);
  492. return $this->getLastChildFromParagraph($lastParentChild);
  493. }
  494. return $paragraph;
  495. }
  496. /**
  497. * Check if a paragraph has any child
  498. *
  499. * @param $paragraph
  500. *
  501. * @return bool
  502. */
  503. private function hasChild($paragraph)
  504. {
  505. if (isset($paragraph[ 'children' ])) {
  506. return true;
  507. }
  508. return false;
  509. }
  510. /**
  511. * Extract numbering from a given paragraph
  512. *
  513. * return false if has no numbering
  514. *
  515. * @param $paragraph
  516. *
  517. * @return false|mixed
  518. */
  519. private function getNumbering($paragraph)
  520. {
  521. if (isset($paragraph)) {
  522. $paragraphContent = trim($paragraph);
  523. if (in_array(substr($paragraphContent, 0, 1), ['(', '{'])) {
  524. return false;
  525. }
  526. if ($this->pdf && isset($paragraph) && strpos($paragraphContent,
  527. 'Page') !== false && strpos($paragraphContent, 'of') !== false) {
  528. return false;
  529. }
  530. preg_match('/^([-+]?\d*\.?\d+?\d*\.?\d+|\d+(\.?)*)(?:[eE]([-+]?\d+))?/', preg_replace('/[^0-9\.)]/', '',
  531. substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '',
  532. preg_replace('/\)/', '.', preg_replace("/\{.+/", "", trim($paragraphContent))))), 0, 6)),
  533. $paragraphNumbering);
  534. if (count($paragraphNumbering) && (in_array(substr($paragraphContent, strlen($paragraphNumbering[ 0 ]), 1),
  535. [' ', "\t", '.', ')']) || in_array(substr($paragraphNumbering[ 0 ], -1),
  536. [' ', "\t", '.', ')']) || is_numeric($paragraphNumbering[ 0 ]))) {
  537. $locationOfNumbering = strpos($paragraphContent,$paragraphNumbering[0]);
  538. if(substr($paragraphContent,$locationOfNumbering-1,1)=='(' &&substr($paragraphContent,$locationOfNumbering+1,1)==')'){
  539. return false;
  540. }
  541. return str_replace('..', '.', $paragraphNumbering[ 0 ]);
  542. }
  543. return false;
  544. }
  545. return false;
  546. }
  547. /**
  548. * Check if a paragraph is between clauses
  549. *
  550. * @param $first
  551. * @param $paragraph
  552. * @param $list
  553. *
  554. * @return bool
  555. */
  556. private function paragraphBetweenClauses($first, $paragraph, $list)
  557. {
  558. if ($this->hasNumbering($first) && ! isset($paragraph[ 'numbering' ])) {
  559. $firstNumberingString = $this->getLastChildFromParagraph($first);
  560. if (isset($firstNumberingString[ 'numbering' ])) {
  561. $firstNumbering = last(array_filter(explode('.', $firstNumberingString[ 'numbering' ])));
  562. foreach ($list as $lastParagraph) {
  563. if ($lastParagraphNumberingString = $this->getNumbering($lastParagraph)) {
  564. $lastParagraphNumbering = last(array_filter(explode('.', $lastParagraphNumberingString)));
  565. if ($lastParagraphNumbering - $firstNumbering == 1 && substr_count($firstNumberingString[ 'numbering' ],
  566. '.') == substr_count($lastParagraphNumberingString, '.')) {
  567. return true;
  568. } elseif (substr_count($firstNumberingString[ 'numbering' ],
  569. '.') > substr_count($lastParagraphNumberingString, '.')) {
  570. return true;
  571. }
  572. return false;
  573. }
  574. }
  575. }
  576. return false;
  577. }
  578. return false;
  579. }
  580. private function getLastChildFromParagraph($paragraph)
  581. {
  582. if (isset($paragraph[ 'children' ])) {
  583. return $this->getLastChildFromParagraph(last($paragraph[ 'children' ]));
  584. }
  585. return $paragraph;
  586. }
  587. private function appendToLastChildFromParagraph($paragraph, $append)
  588. {
  589. if (isset($paragraph[ 'children' ])) {
  590. return $this->getLastChildFromParagraph(last($paragraph[ 'children' ]));
  591. }
  592. $paragraph[ 'content' ] .= ' '.$append[ 'content' ];
  593. return $paragraph;
  594. }
  595. /**
  596. * Check if a paragraph has numbering
  597. *
  598. * @param $paragraph
  599. *
  600. * @return bool
  601. */
  602. private function hasNumbering($paragraph)
  603. {
  604. if (isset($paragraph[ 'numbering' ]) && $paragraph[ 'numbering' ]) {
  605. return true;
  606. }
  607. return false;
  608. }
  609. /**
  610. * Uppercase all values in the array
  611. *
  612. * @param $value
  613. *
  614. * @return array|string
  615. */
  616. private function nestedUppercase($value)
  617. {
  618. if (is_array($value)) {
  619. return array_map([$this, 'nestedUppercase'], $value);
  620. }
  621. //remove unwanted chars
  622. return strtoupper(str_replace(['.'], '', $value));
  623. }
  624. }