Repository for the Search and Displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) for use with search and displace operations.
You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

773 lines
29 KiB

3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
3 years ago
  1. <?php
  2. namespace App\Parser;
  3. use Illuminate\Support\Facades\Log;
  4. class ParseTextArray
  5. {
  /**
   * Clause headings that always start a new top-level clause.
   *
   * The constructor passes this list through nestedUppercase(), which upper-cases
   * every entry and strips '.' characters, so entries are compared case-insensitively
   * and without dots. Entries that would be identical after that normalisation are
   * listed only once.
   *
   * Headings containing a typographic apostrophe (’) are written with the literal
   * character: inside single quotes PHP keeps an unknown escape such as \’ or \U2019
   * as a literal backslash sequence, which would never match real text.
   *
   * @var array
   */
  private $breakPoints = [
      'TERMS OF THE {P1_Pros}',
      'TERMS AND CONDITIONS',
      'BACKGROUND',
      'OPERATIVE PROVISIONS',
      'Products and/or Services',
      'PAYMENT',
      'GRANT OF LICENCE',
      'TERM OF LICENCE AGREEMENT',
      'ROYALTY',
      'PERFORMANCE TARGETS',
      'STATIONERY',
      'QUALITY CONTROL',
      'THE DISTRIBUTOR\'S OBLIGATIONS',
      'NON SOLICITATION',
      'SALE OF BUSINESS',
      'TERMINATION OF AGREEMENT',
      'CONDITIONS FOLLOWING TERMINATION',
      'RESTRAINT',
      'TIME OF ESSENCE AND NOTICES',
      'INTERPRETATION',
      'ARBITRATION',
      'DOMICILIUM AND REGISTERED OFFICE',
      'USE OF TRADE MARKS, TRADE NAME, GOODWILL AND KNOW-HOW',
      'GENERAL',
      'DESCRIPTION OF {P2_NAME} INFORMATION',
      'PAYMENT OF FEES',
      'SUPPLIER\'S STATUS',
      'SUPPLIER’S OBLIGATIONS',
      'DEFINITIONS AND INTERPRETATION',
      'DEFINITIONS',
      'CONFIDENTIALITY',
      'TERMINATION',
      'RESTRICTIVE COVENANTS AND INTELLECTUAL PROPERTY',
      'DETAILS AND IDENTITY OF CONSULTANT',
      'ANTI-BRIBERY',
      'ASSIGNMENT SCHEDULE',
      'SCHEDULE 1',
      '{P1_NAME}\'S LIABILITY',
      'DURATION OF AGREEMENT AND SUPPLY',
      'SUPPLY OF HARDWARE',
      'SUPPLY OF SOFTWARE AND DOCUMENTATION',
      'SUPPLY OF SUPPORT SERVICES',
      'INTELLECTUAL PROPERTY RIGHTS',
      'THE CONTRACT',
      '{P1_NAME}’S LIABILITY',
      'UPDATES',
      'TERMS OF THE {P1_NAME} PRODUCTS.',
      'CUSTOMER RESPONSIBILITIES',
      'EXHIBIT A',
      'EXHIBIT A-1',
      'EXHIBIT A-2',
      'WARRANTIES',
      'EXIT, TERMINATION AND SUSPENSION',
      'EXHIBIT B',
      'EXHIBIT B-1',
      'EXHIBIT B-2',
      'COUNTERPARTS',
      'LICENSE GRANT',
      'INDEMNIFICATION BY CUSTOMER',
      'TERMS OF CLOUD SERVICE',
      'SUPPORT',
      'SUB CONTRACTING AND THIRD PARTY RECOMMENDATIONS',
      'LICENCE AND ACCESS TO SOFTWARE AND HARDWARE',
      'DECLARATION OF NON-LIAISON AND ANTI-CORRUPTION COMMITMENT',
      '{P1_NAME}\'S DUTIES',
      'ana are mere',
      'fees',
      'ENGAGEMENT',
      'DUTIES',
      'STATEMENTS',
      'Human rights',
      'Labour',
      'Environment',
      'Anti-corruption',
      'Services',
      'Scope of the Agreement',
      'Staffing Levels for Services',
      'Indemnification',
  ];
  96. /**
  97. * @var bool Set from the constructor argument; when true, buildTheStructure() and getNumbering() ignore lines containing both "Page" and "of", and handlePossibleChild() applies its PDF-only line-joining rules.
  98. */
  99. private $pdf;
  /**
   * Set up the parser.
   *
   * Stores the PDF flag and replaces the configured break points with their
   * normalised form (see nestedUppercase()).
   *
   * @param bool $pdf Value stored in $this->pdf; enables the PDF-specific rules.
   */
  public function __construct($pdf = false)
  {
      $this->pdf = $pdf;

      $normalisedBreakPoints = $this->nestedUppercase($this->breakPoints);
      $this->breakPoints = $normalisedBreakPoints;
  }
  /**
   * Read a text file and parse its lines into the nested clause structure.
   *
   * @param string $filePath Path of the text file to parse.
   *
   * @return array|string The structure built by buildTheStructure(), or an empty
   *                      string when the path is not a readable regular file.
   */
  public function fromFile($filePath)
  {
      $fileContent = is_file($filePath) && is_readable($filePath)
          ? file_get_contents($filePath)
          : false;

      if ($fileContent === false) {
          Log::error('The given file does not exists!');

          return '';
      }

      // Split on every newline convention instead of PHP_EOL, so a file written on
      // another platform does not leave stray "\r" characters or arrive as one line.
      $lines = preg_split('/\r\n|\r|\n/', $fileContent);

      // Drop only truly empty lines: a bare array_filter() would also discard a
      // line consisting of "0".
      $lines = array_filter($lines, static function ($line) {
          return $line !== '';
      });

      return $this->buildTheStructure($lines);
  }
  /**
   * Build the child structure and extract relevant data from the text content.
   *
   * Every line that has not been consumed yet becomes a clause candidate. The lines
   * that follow it are then attached to it through handlePossibleChild() until one
   * of them starts a new clause (startsNewClause()) or no attachment rule applies
   * (belongsToClause()). Clauses with empty content are dropped and the result is
   * passed through recheckClauses().
   *
   * @param array $textAsArray Raw text lines, leading whitespace preserved.
   *
   * @return array
   */
  private function buildTheStructure($textAsArray)
  {
      $textAsArray = array_values($textAsArray);
      $countData = count($textAsArray);
      $response = [];
      // Set of consumed line indexes, keyed by index so isset() really tests the
      // index (the previous value list only worked by coincidence).
      $alreadyHandled = [];

      for ($i = 0; $i < $countData; $i++) {
          if (isset($alreadyHandled[$i])) {
              continue;
          }
          $alreadyHandled[$i] = true;

          // A page marker is skipped, not treated as the end of the document:
          // stopping here would return nothing when the first line is a marker.
          if ($this->isPageMarker($textAsArray[$i])) {
              continue;
          }

          $current = $this->describeLine($textAsArray[$i]);

          for ($j = $i + 1; $j < $countData; $j++) {
              if (isset($alreadyHandled[$j])) {
                  continue;
              }
              if ($this->isPageMarker($textAsArray[$j])) {
                  $alreadyHandled[$j] = true;
                  continue;
              }

              $candidate = $this->describeLine($textAsArray[$j]);

              if (
                  $this->startsNewClause($current, $candidate) ||
                  ! $this->belongsToClause($current, $candidate, $textAsArray, $j)
              ) {
                  break;
              }

              $current = $this->handlePossibleChild($current, $candidate);
              $alreadyHandled[$j] = true;
          }

          if (strlen($current['content'])) {
              $response[] = $current;
          }
      }

      return $this->recheckClauses($response);
  }

  /**
   * Tell whether a line is skipped as a page marker: PDF mode only, and the line
   * contains both "Page" and "of".
   *
   * @param string $line
   *
   * @return bool
   */
  private function isPageMarker($line)
  {
      return $this->pdf &&
          strpos($line, 'Page') !== false &&
          strpos($line, 'of') !== false;
  }

  /**
   * Turn a raw line into a paragraph array: trimmed content, the number of leading
   * whitespace characters and, when getNumbering() finds one, the numbering.
   *
   * @param string $line
   *
   * @return array
   */
  private function describeLine($line)
  {
      $paragraph = [
          'content' => trim($line),
          'spaces'  => strlen($line) - strlen(ltrim($line)),
      ];

      if ($numbering = $this->getNumbering($line)) {
          $paragraph['numbering'] = $numbering;
          // Removes every occurrence of the numbering string, then a leading dot.
          $paragraph['content'] = trim(ltrim(str_replace($numbering, '', $paragraph['content']), '.'));
      }

      return $paragraph;
  }

  /**
   * Count the numeric characters in a numbering string.
   *
   * @param string $numbering
   *
   * @return int
   */
  private function countDigits($numbering)
  {
      return count(array_filter(str_split($numbering), 'is_numeric'));
  }

  /**
   * Decide whether $candidate opens a new clause, i.e. must not be attached to $current.
   *
   * @param array $current   Clause being built.
   * @param array $candidate Following line, as returned by describeLine().
   *
   * @return bool
   */
  private function startsNewClause($current, $candidate)
  {
      $currentNumbered = $this->hasNumbering($current);
      $candidateNumbered = $this->hasNumbering($candidate);

      // Both numbered, same indentation, same number of dots and digits.
      if (
          $candidate['spaces'] == $current['spaces'] &&
          $candidateNumbered &&
          $currentNumbered &&
          substr_count($current['numbering'], '.') == substr_count($candidate['numbering'], '.') &&
          $this->countDigits($current['numbering']) == $this->countDigits($candidate['numbering'])
      ) {
          return true;
      }

      // Indented numbered line after an unnumbered, unindented line that does not end with ':'.
      if (
          $candidateNumbered &&
          ! $currentNumbered &&
          ! $current['spaces'] &&
          $candidate['spaces'] > $current['spaces'] &&
          ! in_array(substr($current['content'], -1), [':'])
      ) {
          return true;
      }

      // Numbering advanced by at least one whole unit.
      if (
          $candidateNumbered &&
          $currentNumbered &&
          ((float) $candidate['numbering'] - (float) $current['numbering']) >= 1
      ) {
          return true;
      }

      // Hardcoded breakpoints
      if (
          $candidateNumbered &&
          in_array(strtoupper(str_replace(['.', "\t", ""], '', $candidate['content'])), $this->breakPoints)
      ) {
          return true;
      }

      // Hardcoded "Schedule break"
      if (
          ! $candidateNumbered &&
          strpos(substr(trim(strtolower(utf8_encode($candidate['content']))), 0, 10), 'schedule') !== false
      ) {
          return true;
      }

      // Unnumbered "Exhibit" heading that does not end with a full stop.
      if (
          ! $candidateNumbered &&
          strpos(substr(trim($candidate['content']), 0, 15), 'Exhibit') !== false &&
          ! in_array(substr(trim($candidate['content']), -1), ['.'])
      ) {
          return true;
      }

      if (strpos(substr(trim(strtolower($candidate['content'])), 0, 15), 'attachment') !== false) {
          return true;
      }

      // Undotted number following a clause whose last direct child has a dotted number.
      if ($candidateNumbered && $this->hasChild($current)) {
          $lastChild = last($current['children']);
          if (
              $this->hasNumbering($lastChild) &&
              is_numeric($lastChild['numbering']) &&
              strpos($lastChild['numbering'], ".") !== false &&
              is_numeric($candidate['numbering']) &&
              strpos($candidate['numbering'], ".") === false
          ) {
              return true;
          }
      }

      return false;
  }

  /**
   * Decide whether $candidate should be attached to $current. The rules are tried
   * in order and the first matching one wins.
   *
   * @param array $current   Clause being built.
   * @param array $candidate Following line, as returned by describeLine().
   * @param array $lines     All raw lines (re-indexed).
   * @param int   $index     Index of $candidate within $lines.
   *
   * @return bool
   */
  private function belongsToClause($current, $candidate, $lines, $index)
  {
      // Deeper indentation, both with text.
      if (
          $candidate['spaces'] > $current['spaces'] &&
          strlen($current['content']) &&
          strlen($candidate['content'])
      ) {
          return true;
      }

      if (
          isset($lines[$index + 1]) &&
          $this->paragraphBetweenClauses($current, $candidate, array_slice($lines, $index + 1))
      ) {
          return true;
      }

      if (
          $this->hasChild($current) &&
          $this->lastChildIsList($current) &&
          ($current['spaces'] == 0 || $current['spaces'] > $candidate['spaces'])
      ) {
          return true;
      }

      // Same indentation, candidate numbering has more dots or more digits.
      if (
          $candidate['spaces'] == $current['spaces'] &&
          isset($current['numbering']) &&
          isset($candidate['numbering']) &&
          (
              substr_count($current['numbering'], '.') < substr_count($candidate['numbering'], '.') ||
              $this->countDigits($current['numbering']) < $this->countDigits($candidate['numbering'])
          )
      ) {
          return true;
      }

      if (
          $this->paragraphIsList($current) &&
          (
              ctype_lower(substr($candidate['content'], 0, 1)) ||
              in_array(substr($candidate['content'], 0, 1), ['{', '•'])
          )
      ) {
          return true;
      }

      if (
          $this->hasNumbering($current) &&
          $this->hasNumbering($candidate) &&
          is_numeric($candidate['numbering']) &&
          strpos($candidate['numbering'], ".") !== false &&
          strpos($current['numbering'], ".") === false &&
          ! is_int($candidate['numbering'] - $current['numbering'])
      ) {
          return true;
      }

      if (
          $this->hasChild($current) &&
          ($candidate['spaces'] == $this->getLastChildForParagraph($current)['spaces'])
      ) {
          return true;
      }

      if (
          strpos(strtolower($current['content']), 'definitions and') !== false &&
          in_array(utf8_encode(substr($candidate['content'], 0, 1)), ['â', '"'])
      ) {
          return true;
      }

      if (
          $this->hasChild($current) &&
          $this->paragraphIsList($this->getLastChildFromParagraph($current))
      ) {
          return true;
      }

      return ($this->hasChild($current) || $current['spaces'] == $candidate['spaces']) &&
          ! $this->hasNumbering($this->getLastChildForParagraph($current)) &&
          ! $this->hasNumbering($candidate);
  }
  /**
   * Recheck missed clauses and assign them to a parent where applicable.
   *
   * A numbered clause adopts the clauses that follow it when the next clause has
   * content and is either unnumbered, or carries a multi-segment numeric numbering
   * while the clause itself has a numeric numbering of at most one segment.
   * Adoption stops at the next clause whose numeric numbering has exactly one
   * segment. Clauses with falsy content are dropped from the result.
   *
   * @param array $clauses Zero-indexed list of clauses.
   *
   * @return array
   */
  private function recheckClauses($clauses)
  {
      // Number of non-empty dot-separated segments in a numbering string.
      $segmentCount = static function ($numbering) {
          return count(array_filter(explode('.', $numbering)));
      };

      $checkedClauses = [];
      // Keyed by clause index so isset() tests the index itself rather than an
      // unrelated position in a value list.
      $alreadyManaged = [];
      $total = count($clauses);

      for ($i = 0; $i < $total; $i++) {
          if (isset($alreadyManaged[$i])) {
              continue;
          }
          $alreadyManaged[$i] = true;

          $clause = $clauses[$i];
          $j = $i + 1;

          if (
              isset($clauses[$j]) &&
              $clauses[$j]['content'] &&
              $this->hasNumbering($clause) &&
              (
                  ! $this->hasNumbering($clauses[$j]) ||
                  (
                      is_numeric($clauses[$j]['numbering']) &&
                      $segmentCount($clauses[$j]['numbering']) > 1 &&
                      is_numeric($clauses[$i]['numbering']) &&
                      $segmentCount($clauses[$i]['numbering']) <= 1
                  )
              )
          ) {
              for (; $j < $total; $j++) {
                  if (
                      isset($clauses[$j]['numbering']) &&
                      is_numeric($clauses[$j]['numbering']) &&
                      $segmentCount($clauses[$j]['numbering']) == 1
                  ) {
                      break;
                  }
                  $clause['children'][] = $clauses[$j];
                  $alreadyManaged[$j] = true;
              }
          }

          if ($clause['content']) {
              $checkedClauses[] = $clause;
          }
      }

      return $checkedClauses;
  }
  350. /**
  351. * Build the child structure based on the spaces before the text
  352. *
  353. * @param $parent
  354. * @param $child
  355. *
  356. *
  357. * @return mixed
  358. */
  // Attach $child to $parent and return the modified copy of $parent. Depending on
  // the rules below, the child's text is either concatenated onto the 'content' of
  // the parent (or of one of its last children), or the child is appended to a
  // 'children' list of the parent or of its last child.
  359. private function handlePossibleChild($parent, $child)
  360. {
  // Nothing to attach when the child has no content.
  361. if (empty($child[ 'content' ])) {
  362. return $parent;
  363. }
  // PDF mode, parent without children: when the child's first/last characters and the
  // parent's last character satisfy the tests below, the child's text is joined onto
  // the parent's own content.
  364. if ($this->pdf && ! isset($parent[ 'children' ]) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
  365. 1)) || in_array(substr(trim($child[ 'content' ]), 0, 1),
  366. ['}', ')']) || is_numeric(substr(trim($child[ 'content' ]), 0,
  367. 1)) || in_array(substr(trim($child[ 'content' ]), -1),
  368. ['.', ',', ':']) || (! in_array(substr(trim($child[ 'content' ]), -1), [
  369. '.',
  370. ',',
  371. ':'
  372. ])) && $child[ 'spaces' ] > $parent[ 'spaces' ]) && ((in_array(substr(trim($parent[ 'content' ]),
  373. -1), ['}', ')', ',', '"']) || ! in_array(substr(trim($parent[ 'content' ]), -1),
  374. ['.', ':', '!']) || ctype_lower(substr(trim($parent[ 'content' ]), -1))))) {
  375. //dd($parent,$child);
  376. $parent[ 'content' ] .= ' '.$child[ 'content' ];
  377. return $parent;
  // PDF mode, parent with children: the same kind of test, applied to the deepest last
  // descendant. Text is only joined (onto the last direct child) when the child contains
  // 'thirty' and the parent's numbering is not '1.'; otherwise execution falls through
  // to the generic rules after this if/elseif chain.
  // NOTE(review): $parent['numbering'] is read without an isset() check - confirm it is always set here.
  378. } elseif ($this->pdf && isset($parent[ 'children' ]) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
  379. 1)) || in_array(substr(trim($child[ 'content' ]), 0, 1),
  380. ['}', ')']) || is_numeric(substr(trim($child[ 'content' ]), 0,
  381. 1)) || in_array(substr(trim($child[ 'content' ]), -1),
  382. ['.', ',', ':'])) && ((in_array(substr(trim($this->getLastChildForParagraph($parent)[ 'content' ]),
  383. -1), [
  384. '}',
  385. ')',
  386. ',',
  387. '"'
  388. ]) || ! in_array(substr(trim($this->getLastChildForParagraph($parent)[ 'content' ]), -1),
  389. ['.', ':', '!']) || ctype_lower(substr(trim($this->getLastChildForParagraph($parent)[ 'content' ]),
  390. -1))))) {
  391. if (strpos($child[ 'content' ], 'thirty') !== false && $parent[ 'numbering' ] !== '1.') {
  392. $lastParentChild = last($parent[ 'children' ]);
  393. $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
  394. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  395. return $parent;
  396. }
  // PDF mode, parent without children, unnumbered child indented at least as far as the
  // parent: join the text.
  // NOTE(review): this branch requires 'children' to be unset, so hasChild($parent)
  // below cannot be true and only the else arm can run.
  397. } elseif ($this->pdf && ! isset($parent[ 'children' ]) && $child[ 'spaces' ] >= $parent[ 'spaces' ] && ! $this->hasNumbering($child)) {
  398. if ($this->hasChild($parent)) {
  399. $lastParentChild = $this->getLastChildForParagraph($parent);
  400. $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
  401. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  402. } else {
  403. $parent[ 'content' ] .= ' '.$child[ 'content' ];
  404. }
  405. return $parent;
  406. }
  // Generic rules. A parent without children simply receives its first child.
  407. if (! isset($parent[ 'children' ])) {
  408. $parent[ 'children' ][] = $child;
  409. return $parent;
  410. }
  411. $lastParentChild = last($parent[ 'children' ]);
  // The last direct child ends with ':' and the unnumbered child starts with a lowercase
  // letter, '{' or '(', ends with ';', or contains ':': the child goes under that last
  // child - as its first grandchild, as text joined onto its last grandchild, or as a
  // further grandchild.
  412. if ($this->lastChildIsList($parent) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
  413. 1)) || in_array(substr(trim($child[ 'content' ]), -1), [';']) || strpos($child[ 'content' ],
  414. ':') !== false || in_array(trim(substr(trim($child[ 'content' ]), 0, 1)),
  415. ['{', '('])) && ! $this->hasNumbering($child)) {
  416. if (! isset($lastParentChild[ 'children' ])) {
  417. $lastParentChild[ 'children' ][] = $child;
  418. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  419. return $parent;
  420. }
  421. if (isset($lastParentChild[ 'children' ]) && ! in_array(substr(last($lastParentChild[ 'children' ])[ 'content' ],
  422. -1), ['.', ';', ',']) && ! in_array(substr(trim($child[ 'content' ]), 0, 1),
  423. ['(', '{', ':']) && ! $this->hasNumbering($child)) {
  424. $lastParentChild[ 'children' ][ count($lastParentChild[ 'children' ]) - 1 ][ 'content' ] .= ' '.trim($child[ 'content' ]);
  425. } else {
  426. $lastParentChild[ 'children' ][] = $child;
  427. }
  428. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  429. return $parent;
  430. }
  // Both numbered, the last direct child ends with ':' and the child's numbering holds
  // more digits: nest the child under the last direct child.
  431. if ($this->hasNumbering($lastParentChild) && $this->hasNumbering($child) && substr(trim($lastParentChild[ 'content' ]),
  432. -1) == ':' && count(array_filter(str_split($lastParentChild[ 'numbering' ]),
  433. 'is_numeric')) < count(array_filter(str_split($child[ 'numbering' ]), 'is_numeric'))) {
  434. $lastParentChild[ 'children' ][] = $child;
  435. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  436. return $parent;
  437. }
  // Same indentation as the last direct child: numbered pairs become siblings when the
  // final numbering segment increases by exactly one (or both start with a quote/brace
  // style character), otherwise the child is nested; unnumbered text is joined onto the
  // last direct child.
  438. if ($lastParentChild[ 'spaces' ] == $child[ 'spaces' ]) {
  439. if ($this->hasNumbering($lastParentChild) && $this->hasNumbering($child) && (in_array(substr(trim($lastParentChild[ 'content' ]),
  440. -1), ['.', ';']) || $this->hasNumbering($child))) {
  441. if (($this->hasNumbering($lastParentChild) && $this->hasNumbering($child) && ((int) substr($child[ 'numbering' ],
  442. strrpos($child[ 'numbering' ], '.') + 1) - (int) substr($lastParentChild[ 'numbering' ],
  443. strrpos($lastParentChild[ 'numbering' ],
  444. '.') + 1) == 1)) || (in_array(utf8_encode(substr($lastParentChild[ 'content' ], 0,
  445. 1)), ['â', '"', '{']) && in_array(utf8_encode(substr($child[ 'content' ], 0, 1)),
  446. ['â', '"', '{', '•']))) {
  447. $parent[ 'children' ][] = $child;
  448. } else {
  449. $lastParentChild[ 'children' ][] = $child;
  450. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  451. }
  452. } else {
  453. $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
  454. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  455. }
  // NOTE(review): && binds tighter than ||, so the condition below evaluates as
  // (child unnumbered && last child does not start with . ; } && last child ends lowercase)
  // || (last child ends with ',' && child starts lowercase or with { ( )) - confirm this grouping is intended.
  456. } elseif (! $this->hasNumbering($child) && ! in_array(substr(trim($lastParentChild[ 'content' ]), 0, 1),
  457. ['.', ';', '}']) && (ctype_lower(substr(trim($lastParentChild[ 'content' ]),
  458. -1))) || in_array(substr(trim($lastParentChild[ 'content' ]), -1),
  459. [',']) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
  460. 1)) || in_array(substr(trim($child[ 'content' ]), 0, 1), ['{', '(', ')']))) {
  461. $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
  462. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  463. } else {
  // Fallback: when the deepest last descendant ends with '.', ';' or '}', add the child
  // under the last direct child; otherwise recurse into the last direct child.
  464. if ($this->hasChild($parent) && in_array(substr(trim($this->getLastChildForParagraph($parent)[ 'content' ]),
  465. -1), ['.', ';', '}'])) {
  466. $lastParentChild[ 'children' ][] = $child;
  467. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  468. return $parent;
  469. }
  470. $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);
  471. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  472. }
  473. return $parent;
  474. }
  /**
   * Tell whether a paragraph introduces a list, i.e. its trimmed content ends with ':'.
   *
   * @param array $paragraph Paragraph with a 'content' entry.
   *
   * @return bool
   */
  private function paragraphIsList($paragraph)
  {
      $text = trim($paragraph['content']);
      $finalCharacter = substr($text, -1);

      return $finalCharacter === ':';
  }
  /**
   * Tell whether the last direct child of a paragraph introduces a list
   * (its trimmed content ends with ':').
   *
   * @param array $paragraph
   *
   * @return bool False when the paragraph has no children.
   */
  private function lastChildIsList($paragraph)
  {
      if (! $this->hasChild($paragraph)) {
          return false;
      }

      $tail = last($paragraph['children']);

      return substr(trim($tail['content']), -1) == ':';
  }
  /**
   * Return the deepest last descendant of a paragraph, or the paragraph itself
   * when it has no children.
   *
   * @param array $paragraph
   *
   * @return mixed
   */
  private function getLastChildForParagraph($paragraph)
  {
      return $this->hasChild($paragraph)
          ? $this->getLastChildFromParagraph(last($paragraph['children']))
          : $paragraph;
  }
  /**
   * Tell whether a paragraph carries a 'children' entry.
   *
   * @param array $paragraph
   *
   * @return bool
   */
  private function hasChild($paragraph)
  {
      return isset($paragraph['children']);
  }
  /**
   * Extract the numbering from a given paragraph.
   *
   * The paragraph is trimmed, everything from the first '{' onwards is discarded,
   * ')' is treated as '.', and a number is searched for at the start of the first
   * six remaining alphanumeric/dot characters.
   *
   * Returns false when the paragraph starts with '(' or '{', when (in PDF mode)
   * it contains both "Page" and "of", when no number is found, or when the number
   * found sits between parentheses inside the text.
   *
   * @param string $paragraph Raw paragraph text.
   *
   * @return false|mixed The numbering string (with '..' collapsed to '.') or false.
   */
  private function getNumbering($paragraph)
  {
      if (! isset($paragraph)) {
          return false;
      }

      $paragraphContent = trim($paragraph);

      if (in_array(substr($paragraphContent, 0, 1), ['(', '{'])) {
          return false;
      }

      if (
          $this->pdf &&
          strpos($paragraphContent, 'Page') !== false &&
          strpos($paragraphContent, 'of') !== false
      ) {
          return false;
      }

      // Normalisation pipeline; the patterns are intentionally unchanged.
      $withoutPlaceholder = preg_replace("/\{.+/", "", trim($paragraphContent));
      $dotsForParentheses = preg_replace('/\)/', '.', $withoutPlaceholder);
      $alphanumeric = preg_replace('/[^A-Za-z0-9.)]/', '', $dotsForParentheses);
      $leading = substr(trim($alphanumeric), 0, 6);
      $numericOnly = preg_replace('/[^0-9\.)]/', '', $leading);

      preg_match('/^([-+]?\d*\.?\d+?\d*\.?\d+|\d+(\.?)*)(?:[eE]([-+]?\d+))?/', $numericOnly, $paragraphNumbering);

      if (! count($paragraphNumbering)) {
          return false;
      }

      $delimiters = [' ', "\t", '.', ')'];
      $looksLikeNumbering =
          in_array(substr($paragraphContent, strlen($paragraphNumbering[0]), 1), $delimiters) ||
          in_array(substr($paragraphNumbering[0], -1), $delimiters) ||
          is_numeric($paragraphNumbering[0]);

      if (! $looksLikeNumbering) {
          return false;
      }

      $locationOfNumbering = strpos($paragraphContent, $paragraphNumbering[0]);

      // Only look at the surrounding characters when the numbering was really found
      // after the first character. With a false or 0 position the "previous"
      // offset would be -1, which substr() reads as the LAST character, so a text
      // such as "1) foo (" was wrongly reported as having no numbering.
      if (
          $locationOfNumbering !== false &&
          $locationOfNumbering > 0 &&
          substr($paragraphContent, $locationOfNumbering - 1, 1) == '(' &&
          substr($paragraphContent, $locationOfNumbering + 1, 1) == ')'
      ) {
          return false;
      }

      return str_replace('..', '.', $paragraphNumbering[0]);
  }
  /**
   * Check whether an unnumbered paragraph sits between two clauses.
   *
   * Only applies when $first is numbered and $paragraph is not. The numbering of
   * the deepest last descendant of $first is compared with the first numbered line
   * found in $list: the result is true when that line continues the sequence
   * (last segment one higher, same number of dots) or has fewer dots.
   *
   * @param array $first     Clause currently being built.
   * @param array $paragraph Candidate paragraph.
   * @param array $list      Raw lines that follow the candidate.
   *
   * @return bool
   */
  private function paragraphBetweenClauses($first, $paragraph, $list)
  {
      if (! $this->hasNumbering($first) || isset($paragraph['numbering'])) {
          return false;
      }

      $anchor = $this->getLastChildFromParagraph($first);
      if (! isset($anchor['numbering'])) {
          return false;
      }

      $anchorTail = last(array_filter(explode('.', $anchor['numbering'])));
      $anchorDots = substr_count($anchor['numbering'], '.');

      foreach ($list as $line) {
          $found = $this->getNumbering($line);
          if (! $found) {
              continue;
          }

          // Only the first numbered line decides the outcome.
          $foundTail = last(array_filter(explode('.', $found)));
          $foundDots = substr_count($found, '.');

          if ($foundTail - $anchorTail == 1 && $anchorDots == $foundDots) {
              return true;
          }

          return $anchorDots > $foundDots;
      }

      return false;
  }
  /**
   * Walk down the last entry of each 'children' list and return the deepest
   * paragraph reached (the paragraph itself when it has no children).
   *
   * @param array $paragraph
   *
   * @return mixed
   */
  private function getLastChildFromParagraph($paragraph)
  {
      $node = $paragraph;

      while (isset($node['children'])) {
          $node = last($node['children']);
      }

      return $node;
  }
  /**
   * Append the content of $append to the deepest last descendant of $paragraph
   * and return the updated paragraph.
   *
   * Previously, a paragraph with children was answered with its deepest leaf,
   * unchanged: nothing was appended and the surrounding structure was lost.
   *
   * @param array $paragraph Paragraph, possibly with nested 'children'.
   * @param array $append    Paragraph whose 'content' is appended after a space.
   *
   * @return array
   */
  private function appendToLastChildFromParagraph($paragraph, $append)
  {
      if (! empty($paragraph['children'])) {
          $lastIndex = count($paragraph['children']) - 1;
          $paragraph['children'][$lastIndex] = $this->appendToLastChildFromParagraph(
              $paragraph['children'][$lastIndex],
              $append
          );

          return $paragraph;
      }

      $paragraph['content'] .= ' '.$append['content'];

      return $paragraph;
  }
  /**
   * Tell whether a paragraph has a non-empty 'numbering' entry.
   *
   * @param mixed $paragraph
   *
   * @return bool
   */
  private function hasNumbering($paragraph)
  {
      // empty() is true for both a missing key and a falsy value.
      return ! empty($paragraph['numbering']);
  }
  /**
   * Upper-case a string after removing its '.' characters; arrays are processed
   * recursively, keeping their keys.
   *
   * @param array|string $value
   *
   * @return array|string
   */
  private function nestedUppercase($value)
  {
      if (! is_array($value)) {
          // remove unwanted chars
          $withoutDots = str_replace(['.'], '', $value);

          return strtoupper($withoutDots);
      }

      $converted = [];
      foreach ($value as $key => $item) {
          $converted[$key] = $this->nestedUppercase($item);
      }

      return $converted;
  }
  639. }