Repository for the search-and-displace ingest module, which takes ODF, DOCX and PDF documents and transforms them into Markdown (.md) for use with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

670 lines
32 KiB

  1. <?php
  2. namespace App\Parser;
  3. use Illuminate\Support\Facades\Log;
  4. class ParseHtmlArray
  5. {
  6. public function fromFile($filePath)
  7. {
  8. if (file_exists($filePath)) {
  9. $fileContent = file_get_contents($filePath);
  10. $fileContent = str_replace('},
  11. ]', "}
  12. ]", $fileContent);
  13. return $this->handle(json_decode($fileContent,true));
  14. } else {
  15. Log::error('The given file dose not exists!');
  16. }
  17. }
  18. public function handle($docxAsHtmlArray)
  19. {
  20. $response=[];
  21. foreach ($docxAsHtmlArray as $i => $array) {
  22. $response = array_merge($response, $this->handleTestHtml($array));
  23. }
  24. return $this->buildTheStructure($response);
  25. }
  26. private function buildTheStructure($data)
  27. {
  28. $response = [];
  29. $alreadyHandled = [];
  30. $numbers = [];
  31. for ($i = 0; $i < count($data); $i++) {
  32. if (array_key_exists($i, $alreadyHandled)) {
  33. continue;
  34. }
  35. $parent = $data[ $i ];
  36. //get numbering from first 10 chars of the string
  37. preg_match('/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/', preg_replace('/[^0-9\.)]/', '',
  38. substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '', preg_replace('/\)/', '.',
  39. preg_replace("/\{.+/", "", html_entity_decode($data[ $i ][ 'content' ]))))), 0, 5)),
  40. $parentNumbering);
  41. if ($parentNumbering && count($numbers) == 0 && last($parentNumbering) < 5) {
  42. $numbers[] = $parentNumbering[ 0 ];
  43. $data[ $i ][ 'numbering' ] = rtrim($parentNumbering[ 0 ], '.');
  44. } elseif ($parentNumbering && count($numbers) > 0 && $parentNumbering[ 0 ] >= last($numbers)) {
  45. $numbers[] = $parentNumbering[ 0 ];
  46. $data[ $i ][ 'numbering' ] = rtrim($parentNumbering[ 0 ], '.');
  47. }
  48. //check if string starts with bold
  49. //check if number of bolds equals to 1
  50. //check if not empty html and contains words
  51. if ((strpos($parent[ 'content' ], "<b>") === 0 || (substr_count($parent[ 'content' ],
  52. "<b>") == 1 || $parentNumbering) && strlen(trim(strip_tags($parent[ 'content' ]))) > 0) || (str_word_count(preg_replace('/[A-Za-z]{4,}/',
  53. '', strip_tags($data[ $i ][ 'content' ]))) < 2)) {
  54. $childNumbers = [];
  55. $j = $i + 1;
  56. //check if data exists
  57. if (isset($data[ $j ]) && strlen($data[ $j ][ 'content' ])) {
  58. for ($j; $j < count($data); $j++) {
  59. if ($data[ $j ][ 'content' ] == '\u00a0') {
  60. $alreadyHandled[] = $j;
  61. }
  62. if (array_key_exists($j, $alreadyHandled)) {
  63. continue;
  64. }
  65. $child = $data[ $j ];
  66. preg_match('/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/',
  67. substr(trim(urldecode(str_replace(['<b>', '</b>'], '',
  68. strip_tags($data[ $j ][ 'content' ])))), 0, 5), $childNumbering);
  69. if ($childNumbering && ! preg_match("/[a-z]/i", rtrim(trim($childNumbering[ 0 ])))) {
  70. if ($childNumbering && count($childNumbers) == 0 && trim($childNumbering[ 0 ]) < 5) {
  71. $childNumbers[] = trim($childNumbering[ 0 ]);
  72. $data[ $j ][ 'numbering' ] = rtrim(trim($childNumbering[ 0 ]), '.');
  73. } elseif ($childNumbering && count($childNumbers) > 0 && trim($childNumbering[ 0 ]) >= last($childNumbers)) {
  74. $childNumbers[] = trim($childNumbering[ 0 ]);
  75. $data[ $j ][ 'numbering' ] = rtrim(trim($childNumbering[ 0 ]), '.');
  76. } elseif ($childNumbering && trim($childNumbering[ 0 ]) < 100) {
  77. $childNumbers[] = trim($childNumbering[ 0 ]);
  78. $data[ $j ][ 'numbering' ] = rtrim(trim($childNumbering[ 0 ]), '.');
  79. }
  80. }
  81. if (empty(trim($data[ $i ][ 'content' ])) && isset($data[ $j ][ 'numbering' ])) {
  82. break;
  83. }
  84. $breakPoints = array_change_key_case([
  85. 'TERMS OF THE {P1_Pros}',
  86. 'TERMS AND CONDITIONS',
  87. 'BACKGROUND',
  88. 'OPERATIVE PROVISIONS',
  89. 'Products and/or Services',
  90. 'PAYMENT',
  91. 'GRANT OF LICENCE',
  92. 'TERM OF LICENCE AGREEMENT',
  93. 'ROYALTY',
  94. 'PAYMENT',
  95. 'PERFORMANCE TARGETS',
  96. 'STATIONERY',
  97. 'QUALITY CONTROL',
  98. 'THE DISTRIBUTOR\'S OBLIGATIONS',
  99. 'NON SOLICITATION',
  100. 'SALE OF BUSINESS',
  101. 'TERMINATION OF AGREEMENT',
  102. 'CONDITIONS FOLLOWING TERMINATION',
  103. 'RESTRAINT',
  104. 'TIME OF ESSENCE AND NOTICES',
  105. 'INTERPRETATION',
  106. 'ARBITRATION',
  107. 'DOMICILIUM AND REGISTERED OFFICE',
  108. 'USE OF TRADE MARKS, TRADE NAME, GOODWILL AND KNOW-HOW',
  109. 'GENERAL',
  110. 'DESCRIPTION OF {P2_NAME} INFORMATION',
  111. 'PAYMENT OF FEES',
  112. 'SUPPLIER\'S STATUS',
  113. 'SUPPLIER\’S OBLIGATIONS',
  114. 'DEFINITIONS AND INTERPRETATION',
  115. 'DEFINITIONS',
  116. 'CONFIDENTIALITY',
  117. 'TERMINATION',
  118. 'RESTRICTIVE COVENANTS AND INTELLECTUAL PROPERTY',
  119. 'DETAILS AND IDENTITY OF CONSULTANT',
  120. 'ANTI-BRIBERY',
  121. 'ASSIGNMENT SCHEDULE',
  122. 'SCHEDULE 1',
  123. '{P1_NAME}\'S LIABILITY',
  124. 'DURATION OF AGREEMENT AND SUPPLY',
  125. 'SUPPLY OF HARDWARE',
  126. 'SUPPLY OF SOFTWARE AND DOCUMENTATION',
  127. 'SUPPLY OF SUPPORT SERVICES',
  128. 'INTELLECTUAL PROPERTY RIGHTS',
  129. 'THE CONTRACT',
  130. '{P1_NAME}\U2019S LIABILITY',
  131. 'UPDATES',
  132. 'TERMS OF THE {P1_NAME} PRODUCTS.',
  133. 'CUSTOMER RESPONSIBILITIES',
  134. 'EXHIBIT A',
  135. 'EXHIBIT A-1',
  136. 'EXHIBIT A-2',
  137. 'WARRANTIES',
  138. 'EXIT, TERMINATION AND SUSPENSION',
  139. 'EXHIBIT B',
  140. 'EXHIBIT B-1',
  141. 'EXHIBIT B-2',
  142. 'COUNTERPARTS',
  143. 'LICENSE GRANT',
  144. 'INDEMNIFICATION BY CUSTOMER',
  145. 'TERMS OF THE {P1_NAME} PRODUCTS',
  146. 'TERMS OF CLOUD SERVICE',
  147. 'INDEMNIFICATION BY CUSTOMER',
  148. 'TERMINATION',
  149. 'TERMS OF THE {P1_PROS}',
  150. 'SUPPORT',
  151. 'SUB CONTRACTING AND THIRD PARTY RECOMMENDATIONS',
  152. 'LICENCE AND ACCESS TO SOFTWARE AND HARDWARE',
  153. 'DECLARATION OF NON-LIAISON AND ANTI-CORRUPTION COMMITMENT',
  154. '{P1_NAME}\'S DUTIES'
  155. ], CASE_UPPER);
  156. //$breakPoints = [];
  157. if ($this->paragraphBrake($data[ $j ], $breakPoints)) {
  158. break;
  159. }
  160. if (substr(trim(str_replace(array_merge([')'], $childNumbering), '', $data[ $j ][ 'content' ])),
  161. 0, 3) == '<b>' && str_word_count(strip_tags(str_replace(array_merge([')'],
  162. $childNumbering), '',
  163. $data[ $j ][ 'content' ]))) == str_word_count($this->getTextBetweenTags(str_replace(array_merge([')',],
  164. $childNumbering), '', $data[ $j ][ 'content' ]),
  165. 'b')) && (isset($data[ $j + 1 ]) && ((ctype_upper(substr($data[ $j + 1 ][ 'content' ],
  166. 0,
  167. 1)) || (isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ]) && $data[ $j ][ 'numbering' ] - $data[ $i ][ 'numbering' ] == 1))))) {
  168. break;
  169. }
  170. if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ! isset($data[ $i ][ 'numbering' ]) && ctype_upper(str_replace(' ',
  171. '', $data[ $j ][ 'content' ])) && str_word_count($data[ $j ][ 'content' ]) >= 1) {
  172. break;
  173. }
  174. if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ! isset($data[ $i ][ 'numbering' ]) && ctype_upper(str_replace([
  175. '<b>',
  176. '</b>',
  177. last($childNumbering),
  178. last($childNumbering),
  179. ')',
  180. '.'
  181. ], '', trim(str_replace(' ', '',
  182. $data[ $j ][ 'content' ])))) && str_word_count($data[ $j ][ 'content' ]) >= 1) {
  183. break;
  184. }
  185. //if(isset($data[$j]['numbering']) && isset($data[$i]['numbering']) && )
  186. if (isset($data[ $i ][ 'children' ]) && isset($data[ $i ][ 'numbering' ]) && count($data[ $i ][ 'children' ]) && isset($data[ $j ][ 'numbering' ]) && isset(last($data[ $i ][ 'children' ])[ 'numbering' ]) && ($data[ $j ][ 'numbering' ] - last($data[ $i ][ 'children' ])[ 'numbering' ] !== 1 && $data[ $i ][ 'numbering' ] < $data[ $j ][ 'numbering' ]) && ! in_array(substr(strip_tags(last($data[ $i ][ 'children' ])[ 'content' ]),
  187. strlen(strip_tags(last($data[ $i ][ 'children' ])[ 'content' ])) - 1),
  188. [':', '-']) && ! strpos($data[ $j ][ 'numbering' ], '.')) {
  189. break;
  190. }
  191. if (in_array(strtoupper(trim(str_replace([
  192. '<b>',
  193. '</b>',
  194. last($parentNumbering),
  195. last($parentNumbering),
  196. ')',
  197. '.'
  198. ], '', strip_tags($data[ $i ][ 'content' ])))), $breakPoints)) {
  199. if ((! isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ]) && (substr($data[ $i ][ 'content' ],
  200. 0,
  201. 3) != '<b>') || (str_word_count(strip_tags($data[ $i ][ 'content' ])) != str_word_count($this->getTextBetweenTags($data[ $i ][ 'content' ],
  202. 'b'))))) {
  203. if (! in_array($data[ $i ][ 'content' ], $breakPoints)) {
  204. break;
  205. }
  206. }
  207. }
  208. if (in_array(strtoupper(trim(str_replace([
  209. '<b>',
  210. '</b>',
  211. last($childNumbering),
  212. last($childNumbering),
  213. ')',
  214. '.'
  215. ], '', strip_tags($data[ $j ][ 'content' ])))), $breakPoints)) {
  216. break;
  217. }
  218. if (in_array(substr(strip_tags($data[ $j ][ 'content' ]),
  219. strlen(strip_tags($data[ $j ][ 'content' ])) - 1), [':', '-'])) {
  220. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  221. $alreadyHandled[] = $j;
  222. } elseif (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ctype_lower(substr(last($data[ $i ][ 'children' ])[ 'content' ],
  223. strlen(last($data[ $i ][ 'children' ])[ 'content' ]) - 1)) && ctype_lower(substr(trim($data[ $j ][ 'content' ]),
  224. 0, 1))) {
  225. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  226. $alreadyHandled[] = $j;
  227. } elseif (str_word_count(preg_replace('/[A-Za-z]{4,}/', '',
  228. strip_tags($data[ $j ][ 'content' ]))) < 3 && strlen(strip_tags($data[ $j ][ 'content' ])) && ! isset($data[ $j ][ 'numbering' ]) && ctype_upper(substr($data[ $j ][ 'content' ],
  229. 0, 1)) && str_word_count($data[ $j ][ 'content' ]) < 10) {
  230. if (isset($data[ $i ][ 'children' ]) && ! in_array(substr(trim(last($data[ $i ][ 'children' ])[ 'content' ]),
  231. strlen(trim(last($data[ $i ][ 'children' ])[ 'content' ])) - 1),
  232. ['!', '.', '?', '_', '}'])) {
  233. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  234. $alreadyHandled[] = $j;
  235. } else {
  236. break;
  237. }
  238. //dd($data[$i]);
  239. } elseif (str_word_count(preg_replace('/[A-Za-z]{4,}/', '',
  240. strip_tags($data[ $i ][ 'content' ]))) < 2 && strlen(strip_tags($data[ $i ][ 'content' ]))) {
  241. if (isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ]) && is_numeric($data[ $j ][ 'numbering' ]) && abs($data[ $j ][ 'numbering' ] - $data[ $i ][ 'numbering' ]) == 1 && str_word_count($data[ $j ]
  242. [ 'content' ]) < 6) {
  243. break;
  244. }
  245. if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ((str_word_count($data[ $j ]
  246. [ 'content' ]) < 6) || (substr_count($data[ $j ][ 'content' ],
  247. '<b>') == 1 && substr_count(last($data[ $i ][ 'children' ])[ 'content' ],
  248. '<b>') == 0 && ! isset(last($data[ $i ][ 'children' ])[ 'numbering' ]))) && ctype_upper((substr($data[ $j ][ 'content' ],
  249. 0, 1)))) {
  250. break;
  251. }
  252. if (isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ]) && $data[ $j ][ 'numbering' ] + 1 == $data[ $i ][ 'numbering' ] && str_word_count($data[ $j ][ 'content' ]) < 6) {
  253. break;
  254. }
  255. if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && ! isset($data[ $i ][ 'numbering' ]) && ! isset(last($data[ $i ][ 'children' ])[ 'numbering' ]) && isset($data[ $j ][ 'numbering' ])) {
  256. break;
  257. }
  258. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  259. $alreadyHandled[] = $j;
  260. } elseif (! in_array(trim(strtolower(strip_tags($data[ $j ][ 'content' ]))),
  261. ['definitions']) && ! ctype_space($data[ $j ][ 'content' ]) && strlen(trim(strip_tags($data[ $j ][ 'content' ]))) && ! isset($data[ $i ][ 'numbering' ]) && ! isset($data[ $j ][ 'numbering' ])) {
  262. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  263. $alreadyHandled[] = $j;
  264. } elseif (isset($data[ $i ][ 'numbering' ]) && isset($data[ $j ][ 'numbering' ])) {
  265. if (is_numeric($data[ $j ][ 'numbering' ]) && is_numeric($data[ $i ][ 'numbering' ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) == 1 && str_word_count($data[ $j ][ 'content' ]) < str_word_count($data[ $i ][ 'content' ])) {
  266. break;
  267. }
  268. if (is_numeric($data[ $j ][ 'numbering' ]) && abs($data[ $j ][ 'numbering' ] - $data[ $i ][ 'numbering' ]) === 1 && (isset($data[ $i ][ 'children' ]) && (! (isset(last($data[ $i ][ 'children' ])[ 'numbering' ])) || (isset(last($data[ $i ][ 'children' ])[ 'numbering' ]) && abs(last($data[ $i ][ 'children' ])[ 'numbering' ] - $data[ $j ][ 'numbering' ]) !== 1))) && str_word_count($data[ $j ][ 'content' ]) < 8) {
  269. break;
  270. }
  271. if (substr_count($data[ $j ][ 'numbering' ], '.') > substr_count($data[ $i ][ 'numbering' ],
  272. '.') && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) < 1) {
  273. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  274. $alreadyHandled[] = $j;
  275. } elseif (((float) $data[ $j ][ 'numbering' ] > (float) $data[ $i ][ 'numbering' ] && substr_count($data[ $j ][ 'content' ],
  276. '<b>') == 0 && substr_count($data[ $i ][ 'content' ],
  277. '<b>') == 1) || (substr_count($data[ $i ][ 'content' ],
  278. "<b>") == 1 && (substr_count($data[ $j ][ 'content' ],
  279. '<b>') == 0 || substr_count($data[ $j ][ 'content' ], '<b>')) > 1)) {
  280. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  281. $alreadyHandled[] = $j;
  282. } elseif (substr_count($data[ $i ][ 'content' ],
  283. '<b>') == 1 && str_word_count($data[ $j ][ 'content' ]) > 6 && isset($data[ $j ][ 'numbering' ])) {
  284. if (strpos($data[ $j ][ 'content' ],
  285. 'Networking infrastructure (hardware, firmware, software an') !== false) {
  286. dd('aa');
  287. }
  288. if (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ])) {
  289. $lastParentChild = last($data[ $i ][ 'children' ]);
  290. if (isset($lastParentChild[ 'numbering' ]) && abs($lastParentChild[ 'numbering' ] - $data[ $j ][ 'numbering' ]) === 1 && (substr_count($data[ $j ][ 'content' ],
  291. '<b>') == 1)) {
  292. break;
  293. }
  294. }
  295. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  296. $alreadyHandled[] = $j;
  297. } elseif (isset($data[ $i ][ 'numbering' ]) && abs($data[ $i ][ 'numbering' ] - $data[ $j ][ 'numbering' ]) === 1 && str_word_count($data[ $j ][ 'content' ]) >= 6) {
  298. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  299. $alreadyHandled[] = $j;
  300. } elseif (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && isset($data[ $j ][ 'numbering' ]) && isset(last($data[ $i ][ 'children' ])[ 'numbering' ]) && abs((float) $data[ $j ][ 'numbering' ] - (float) last($data[ $i ][ 'children' ])[ 'numbering' ]) == (float) 1) {
  301. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  302. $alreadyHandled[] = $j;
  303. } elseif (isset($data[ $i ][ 'numbering' ]) && abs($data[ $i ][ 'numbering' ] - $data[ $j ][ 'numbering' ]) == 0 && str_word_count($data[ $j ][ 'content' ]) >= 6) {
  304. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  305. $alreadyHandled[] = $j;
  306. } else {
  307. break;
  308. }
  309. } elseif (isset($data[ $i ][ 'numbering' ]) && ! isset($data[ $j ][ 'numbering' ]) && str_word_count($data[ $j ][ 'content' ]) > 6) {
  310. if (substr_count($data[ $j ][ 'content' ],
  311. "<b>") == 1 && strpos(strtolower($data[ $i ][ 'content' ]),
  312. 'definition') === false) {
  313. break;
  314. }
  315. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  316. $alreadyHandled[] = $j;
  317. } elseif (empty($data[ $j ][ 'content' ]) && (isset($data[ $j + 1 ]) && isset($data[ $j - 1 ]) && isset($data[ $i ][ 'children' ]))) {
  318. if (isset(last($data[ $i ][ 'children' ])[ 'numbering' ]) && strlen(last($data[ $i ][ 'children' ])[ 'numbering' ]) == strlen(preg_replace('/[^0-9\.)]/',
  319. '', substr(trim(preg_replace('/ +/', ' ', preg_replace('/[^A-Za-z0-9 .]/', ' ',
  320. urldecode(strip_tags($data[ $j + 1 ][ 'content' ]))))), 0,
  321. 5))) && ! empty($data[ $j ][ 'content' ])) {
  322. dd('Here', $data[ $i ], $data[ $j ]);
  323. $alreadyHandled[] = $j;
  324. } else {
  325. break;
  326. }
  327. } elseif (isset($data[ $i ][ 'children' ]) && count($data[ $i ][ 'children' ]) && isset($data[ $j ][ 'numbering' ])) {
  328. $lastParentChild = last($data[ $i ][ 'children' ]);
  329. if (isset($lastParentChild[ 'numbering' ]) && isset($child[ 'numbering' ]) && substr_count($lastParentChild[ 'numbering' ],
  330. '.') > substr_count($data[ $j ][ 'numbering' ], '.')) {
  331. dd('111');
  332. } else {
  333. $data[ $i ] = $this->handlePossibleChild($data[ $i ], $data[ $j ]);
  334. $alreadyHandled[] = $j;
  335. }
  336. } else {
  337. break;
  338. }
  339. //if(strpos($data[$i]['content'],'<b>2. TERMS OF THE {P1_Pros}.</b>')!==false || strpos($data[$j]['content'],'<b>2. TERMS OF THE {P1_Pros}.</b>')!==false){
  340. // dd($data[$i],$data[$j]);
  341. //}
  342. }
  343. }
  344. }
  345. if (strlen(trim(strip_tags($data[ $i ][ 'content' ])))) {
  346. $response[] = $data[ $i ];
  347. //if ($data[ $i ][ 'content' ] == "Duration of Agreement and Supply") {
  348. // dd(121,$data[$i],$i);
  349. //}
  350. //if($i > 73){
  351. // dd($i,$data[$i],$response);
  352. //}
  353. }
  354. $alreadyHandled[] = $i;
  355. }
  356. return $response;
  357. }
  358. private function handlePossibleChild($parent, $child)
  359. {
  360. if (empty($parent[ 'content' ]) && ! empty($child[ 'content' ])) {
  361. return $child;
  362. }
  363. if (empty($child[ 'content' ])) {
  364. return $parent;
  365. }
  366. // Must iterate through parent children
  367. if (! isset($parent[ 'children' ]) || (isset($parent[ 'children' ]) && count($parent[ 'children' ]) == 0)) {
  368. $parent[ 'children' ] = [];
  369. if (str_word_count(strip_tags($child[ 'content' ])) >= 5 && strpos($child[ 'content' ], '<b>') === false) {
  370. $parent[ 'children' ][] = $child;
  371. } elseif (strpos($parent[ 'content' ], '<b>') !== false && strpos($child[ 'content' ], '<b>') !== false) {
  372. $parent[ 'children' ][] = $child;
  373. } elseif (isset($child[ 'content' ])) {
  374. $parent[ 'children' ][] = $child;
  375. }
  376. return $parent;
  377. }
  378. $lastParentChild = last($parent[ 'children' ]);
  379. if ($lastParentChild && substr($lastParentChild[ 'content' ],
  380. strlen($lastParentChild[ 'content' ]) - 1) === ':' && ((ctype_lower(substr($child[ 'content' ], 0,
  381. 1)) || (ctype_digit(substr($child[ 'content' ], 0,
  382. 1)) && str_word_count($child[ 'content' ]) > 5)))) {
  383. $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);
  384. if (isset($lastParentChild[ 'numbering' ]) && isset($child[ 'numbering' ]) && $child[ 'numbering' ] - 1 == $lastParentChild[ 'numbering' ]) {
  385. $parent[ 'children' ][] = $child;
  386. } else {
  387. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  388. }
  389. return $parent;
  390. }
  391. if (isset($lastParentChild[ 'numbering' ]) && isset($child[ 'numbering' ]) && strlen($child[ 'numbering' ]) > strlen($lastParentChild[ 'numbering' ])) {
  392. if (isset($parent[ 'children' ]) && isset(last($parent[ 'children' ])[ 'numbering' ]) && $child[ 'numbering' ]) {
  393. if (is_numeric($child[ 'numbering' ]) && abs($child[ 'numbering' ] - $lastParentChild[ 'numbering' ]) === 1) {
  394. $parent[ 'children' ][] = $child;
  395. return $parent;
  396. }
  397. }
  398. if (isset($child[ 'numbering' ]) && isset($lastParentChild[ 'numbering' ]) && substr_count($lastParentChild[ 'numbering' ],
  399. '.') == substr_count($child[ 'numbering' ], '.')) {
  400. $parent[ 'children' ][] = $child;
  401. return $parent;
  402. }
  403. $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);
  404. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  405. return $parent;
  406. }
  407. if (! in_array(substr(trim(str_replace(['and', 'or'], '', $lastParentChild[ 'content' ])),
  408. strlen(trim(str_replace(['and', 'or'], '', $lastParentChild[ 'content' ]))) - 1),
  409. ['!', '.', '?', ';', '_', ':']) && (ctype_lower(substr(trim($child[ 'content' ]), 0,
  410. 1)) || ((ctype_upper(substr(trim($child[ 'content' ]), 0,
  411. 1)) && ! isset($child[ 'numbering' ]))))) {
  412. //dd($lastParentChild,$child);
  413. if (strpos($lastParentChild[ 'content' ],
  414. 'e, this Agreement and the {P1_Name} Software Licence Agreement') !== false) {
  415. dd('aa', $lastParentChild, $child);
  416. }
  417. $lastParentChild[ 'content' ] .= ' '.$child[ 'content' ];
  418. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  419. return $parent;
  420. } elseif (! in_array(substr(trim($parent[ 'content' ]), strlen(trim($parent[ 'content' ])) - 1),
  421. ['!', '.', '?', ';']) && ctype_lower(substr(trim($lastParentChild[ 'content' ]),
  422. strlen(trim($lastParentChild[ 'content' ])) - 1)) && ctype_lower(substr(trim($child[ 'content' ]), 0,
  423. 1))) {
  424. $parent[ 'children' ][] = $child;
  425. } elseif (! in_array(substr(trim(str_replace(['and', 'or'], '', $lastParentChild[ 'content' ])),
  426. strlen(trim(str_replace(['and', 'or'], '', $lastParentChild[ 'content' ]))) - 1), [
  427. '!',
  428. '.',
  429. '?',
  430. ';',
  431. '_',
  432. ':'
  433. ]) && isset($lastParentChild[ 'numbering' ]) && isset($child[ 'numbering' ]) && $lastParentChild[ 'numbering' ] > $child[ 'numbering' ]) {
  434. $lastParentChild[ 'children' ][] = $child;
  435. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  436. } else {
  437. $parent[ 'children' ][] = $child;
  438. }
  439. return $parent;
  440. }
  441. public function handleTestHtml($array)
  442. {
  443. $data = [];
  444. foreach ($array as $item) {
  445. if (count($item) == 1 && is_array(last($item))) {
  446. return $this->handleTestHtml($item);
  447. } else {
  448. $html = $this->buildParagraphs($item);
  449. if (! isset($data[ 'content' ]) && count($html) > 1) {
  450. $data = array_merge($data, $html);
  451. } elseif ($html) {
  452. $data = $html;
  453. }
  454. }
  455. }
  456. return $data;
  457. }
  458. private function buildParagraphs($paragraphs)
  459. {
  460. $result = [];
  461. $alreadyHandled = [];
  462. for ($i = 0; $i < count($paragraphs); $i++) {
  463. if (array_key_exists($i, $alreadyHandled)) {
  464. continue;
  465. }
  466. $paragraph = $paragraphs[ $i ];
  467. if (is_array($paragraph)) {
  468. $result = array_merge($result, $this->buildParagraphs($paragraph));
  469. } elseif (strlen($paragraph) && ! ctype_space($paragraph)) {
  470. $cleanHtml = trim(str_replace('<b> </b>', '',
  471. preg_replace('/<([^>\s]+)[^>]*>(?:\s*(?:<br \/>|&nbsp;|&thinsp;|&ensp;|&emsp;|&#8201;|&#8194;|&#8195;)\s*)*<\/\1>/',
  472. '', preg_replace('/(<font[^>]*>)|(<\/font>)/', '', preg_replace('/\s+/S', " ", $paragraph)))));
  473. if (! empty($cleanHtml)) {
  474. $result[] = ['content' => html_entity_decode($cleanHtml, ENT_COMPAT | ENT_HTML401, 'UTF-8')];
  475. }
  476. }
  477. }
  478. return $result;
  479. }
  480. /*
  481. * Get text between html tag
  482. */
  483. private function getTextBetweenTags($string, $tagname)
  484. {
  485. $pattern = "/<$tagname ?.*>(.*)<\/$tagname>/";
  486. preg_match($pattern, str_replace(['<u>', '</u>'], '', $string), $matches);
  487. if ($matches) {
  488. return last($matches);
  489. }
  490. return '';
  491. }
  492. private function paragraphBrake($paragraph, array $breakPoints)
  493. {
  494. //$paragraph[ 'content' ] = '2) <b>TERMS OF THE {P1_Pros}.</b> Subject to the terms of the Agreement, {P1_Name} grants Customer and/or its Affiliates a non-exclusive, non-transferable (except to a successor in interest as permitted hereunder) license to use the {P1_Pros} listed on the <u>Order Form</u> during the Term. Customer\’s and/or its Affiliates\’ right to use the {P1_Pros} is limited to the volume and other restrictions contained herein and in the Order Form and the Documentation.';
  495. //$paragraph[ 'numbering' ] = '2';
  496. preg_replace('/<b ?.*>(\d+)<\/b>/', $paragraph[ 'content' ], $paragraph[ 'content' ]);
  497. preg_replace('/(\d+)\)/', $paragraph[ 'content' ], $paragraph[ 'content' ]);
  498. if (isset($paragraph[ 'numbering' ])) {
  499. $paragraph[ 'content' ] = str_replace(['.', ')', $paragraph[ 'numbering' ]], '', $paragraph[ 'content' ]);
  500. }
  501. if (substr_count($paragraph[ 'content' ], '</b>') === 1) {
  502. $breakString = explode('</b>', $paragraph[ 'content' ]);
  503. if ($breakString) {
  504. $breakString = trim(str_replace('<b>', '', trim($breakString[ 0 ])));
  505. if (in_array($breakString, $breakPoints)) {
  506. return true;
  507. }
  508. }
  509. }
  510. return false;
  511. }
  512. }