Repository for the Search and Displace ingest module, which takes ODF, DOCX and PDF documents and transforms them into Markdown (.md) files for use with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

406 lines
14 KiB

  1. <?php
  2. namespace App\Parser;
  3. use Illuminate\Support\Facades\Log;
  4. use Illuminate\Support\Facades\Storage;
  5. use SimpleXMLElement;
  6. class ParseXml
  7. {
  8. /**
  9. * @var int
  10. */
  11. private $titleFontThreshold;
  12. /**
  13. * @var int
  14. */
  15. private $headerFontFooterThreshold;
  16. /**
  17. * ParseXml constructor.
  18. */
  19. public function __construct()
  20. {
  21. $this->headerFontFooterThreshold = null;
  22. $this->titleFontThreshold = null;
  23. }
  24. /**
  25. * Handle xml files
  26. *
  27. * @param $xmlFile
  28. *
  29. * @return mixed
  30. */
  31. public function handle($xmlFile)
  32. {
  33. if (is_string($xmlFile)) {
  34. try {
  35. $storageDisk = Storage::disk('contracts');
  36. while (! $storageDisk->exists($xmlFile)) {
  37. //Sleep if file not yet written
  38. sleep(1);
  39. }
  40. $file = $storageDisk->get($xmlFile);
  41. } catch (\Exception $exception) {
  42. Log::error('Failed to load the xml file '.$exception->getMessage());
  43. }
  44. } else {
  45. $file = file_get_contents($xmlFile);
  46. }
  47. //foreach (simplexml_load_string($file) as $key =>$xmlElementPage){
  48. // dd($xmlElementPage);
  49. //}
  50. return $this->buildChildStructure($this->handleElements(simplexml_load_string($file)->xpath('//text')));
  51. }
  52. /**
  53. * @param $element
  54. *
  55. * @return mixed
  56. */
  57. private function handleElements($element)
  58. {
  59. if (is_array($element)) {
  60. $elements = $element;
  61. } else {
  62. $elements = (array) $element;
  63. }
  64. //dd(!in_array(trim(last(explode(' ', strip_tags('modify or make additions to the {P1_Name} Software, except to the extent permitted by law; or')))),['and','or']),trim(last(explode(' ', strip_tags('modify or make additions to the {P1_Name} Software, except to the extent permitted by law; or')))));
  65. $this->setTitleThreshold($elements);
  66. $numberOfNodes = count($elements);
  67. $rows = [];
  68. for ($i = 0; $i < $numberOfNodes; $i++) {
  69. $current = $elements[ $i ];
  70. $listContent = [];
  71. if ($current instanceof SimpleXMLElement) {
  72. $content = $this->getNodeContent($current);
  73. //if(strpos($content,'Provided that the Customer has continued to pay ')!==false){
  74. // dd(($i + 1 <= $numberOfNodes && isset($elements[ $i + 1 ]) && (((int) $elements[ $i + 1 ][ 'top' ] === (int) $current[ 'top' ]) || (int) $elements[ $i + 1 ][ 'top' ] <= ((int) $current[ 'top' ] + (int) $current[ 'height' ] + 3)) && (int) $current[ 'top' ] <= (int) $elements[ $i + 1 ][ 'top' ])
  75. // || (isset($elements[ $i + 1 ]) && ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0,1))), substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0,1))));
  76. //}
  77. $parentNumbering = [];
  78. while ($i + 1 <= $numberOfNodes && isset($elements[ $i + 1 ]) &&
  79. (((((((int) $elements[ $i + 1 ][ 'top' ] === (int) $current[ 'top' ]) || (int) $elements[ $i + 1 ][ 'top' ] <= ((int) $current[ 'top' ] + (int) $current[ 'height' ] + 3)) && (int) $current[ 'top' ] <= (int) $elements[ $i + 1 ][ 'top' ])
  80. || (ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0,1)))
  81. || (! in_array(substr(trim(strip_tags($this->getNodeContent($elements[ $i + 1 ]))),0, 1), [',']))
  82. || (ctype_lower(substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1))))
  83. && ! in_array(substr(trim(str_replace(['and','or'], '', $content)), strlen(trim(str_replace(['and', 'or'], '', $content))) - 1),['!', '.', '?', ';', '_', ':', ')'])
  84. && ! preg_match('/^.*?\-[^\d]*(\d+)[^\d]*\-.*$/',$content)
  85. && (substr(trim($this->getNodeContent($elements[ $i + 1 ])), 0,strlen('<b>')) !== '<b>'
  86. && ctype_lower((substr(trim(strip_tags($content)),strlen(trim(strip_tags($content))) - 1)))))
  87. || ((int) $elements[ $i ][ 'top' ] === (int) $elements[ $i + 1 ][ 'top' ]))
  88. || (isset($elements[ $i + 1 ]) && trim(strip_tags($this->getNodeContent($elements[ $i+1])))=='[')
  89. ) {
  90. //if($parentNumbering){
  91. // dd($parentNumbering,$content);
  92. //}
  93. preg_match('/^([-+]?\d*\.?\d+)(?:[-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/',
  94. preg_replace('/[^0-9\.)]/', '', substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '',
  95. preg_replace('/\)/', '.', preg_replace("/\{.+/", "", html_entity_decode($content))))),
  96. 0, 5)), $childNumbering);
  97. if (! $childNumbering) {
  98. preg_match('/^([-+]?\d*\.?\d+)(?:[eE]([-+]?\d+))?/', preg_replace('/[^0-9\.)]/', '',
  99. substr(trim(preg_replace('/[^A-Za-z0-9.)]/', '',
  100. preg_replace('/\)/', '.', preg_replace("/\{.+/", "", html_entity_decode($content))))),
  101. 0, 5)), $parentNumbering);
  102. }
  103. //if($childNumbering && strpos($childNumbering[0],"2.1.5")!==false){
  104. // dd(11,$content,$elements[$i],$i,$i+1);
  105. //}
  106. $nextElement = $elements[ $i + 1 ];
  107. $nextElementContent = $this->getNodeContent($nextElement);
  108. $content .= ' '.$nextElementContent;
  109. $current[ 'top' ] = $nextElement[ 'top' ];
  110. $current[ 'height' ] = $nextElement[ 'height' ];
  111. if (count($parentNumbering)) {
  112. $current[ 'row_numbering' ] = $parentNumbering[ 0 ];
  113. $content = str_replace($current[ 'row_numbering' ], '', $content);
  114. $i++;
  115. break;
  116. } elseif ($childNumbering) {
  117. $current[ 'row_numbering' ] = $childNumbering[ 0 ];
  118. $content = str_replace($current[ 'row_numbering' ], '', $content);
  119. if (strlen(trim(strip_tags($content))) && ! in_array(substr(trim(strip_tags($content)),
  120. strlen(trim(strip_tags($content))) - 1),
  121. ['.', ':', '!', '?','[',',']) && !ctype_lower(substr(trim(strip_tags($content)),
  122. strlen(trim(strip_tags($content)))-1)) && (!ctype_lower(substr(trim(strip_tags($this->getNodeContent($elements[$i+1]))),
  123. 0, 1)) || !in_array(substr(trim(strip_tags($this->getNodeContent($elements[$i+1]))), 0, 1),
  124. ['[', '{']))) {
  125. $i++;
  126. break;
  127. }
  128. }
  129. if( ! empty($current[ 'row_numbering' ]) && ctype_digit(trim(preg_replace("/[^0-9a-zA-Z]/",
  130. "", strip_tags($this->getNodeContent($elements[$i])))))){
  131. $i++;
  132. break;
  133. }
  134. //$current[ 'font' ] = $nextElement[ 'font' ];
  135. $i++;
  136. continue;
  137. }
  138. $data = $this->extractNumbering($content);
  139. $content = [
  140. 'type' => (int) $current[ 'font' ] === $this->titleFontThreshold ? 'title' : null,
  141. 'content' => $data[ 'content' ],
  142. 'numbering' => (! empty($current[ 'row_numbering' ])) ? (int)$current[ 'row_numbering' ] : $data[ 'numbering' ],
  143. 'top' => (int) $current[ 'top' ],
  144. 'height' => (int) $current[ 'height' ],
  145. 'left' => (int) $current[ 'left' ],
  146. 'font' => (int) $current[ 'font' ],
  147. 'children' => $listContent
  148. ];
  149. $rows[] = $content;
  150. }
  151. }
  152. return $rows;
  153. }
  154. /**
  155. * Returns the xml node content
  156. *
  157. * @param $node
  158. *
  159. * @return string|string[]|null
  160. */
  161. private function getNodeContent($node)
  162. {
  163. return preg_replace('!\s+!', ' ', preg_match_all("/<text.*?>(.*?)<\/text>/", $node->asXML(),
  164. $matches) ? $matches[ 1 ] ? $matches[ 1 ][ 0 ] : '' : '');
  165. }
  166. /**
  167. * Extract the numbering if exists from the string
  168. *
  169. * @param $content
  170. *
  171. * @return array
  172. */
  173. private function extractNumbering($content)
  174. {
  175. $regexOne = '/^(([a-zA-Z0-9]+[.\)])+)([ ]|[a-z]|[A-Z])/';
  176. $regexTwo = '/^(([\d\.]+)\d)/';
  177. if (preg_match($regexOne, $content, $n)) {
  178. $numbering = trim(last($n));
  179. } else {
  180. if (preg_match($regexTwo, $content, $n)) {
  181. $numbering = trim(last($n));
  182. } else {
  183. $numbering = '';
  184. }
  185. }
  186. if (strlen($numbering) > 1) {
  187. return [
  188. 'content' => '<p>'.trim(str_replace($numbering, '', $content)).'</p>',
  189. 'numbering' => $numbering
  190. ];
  191. }
  192. return [
  193. 'content' => '<p>'.trim($content).'</p>',
  194. 'numbering' => ''
  195. ];
  196. }
  197. /**
  198. * Build the structure as required by the editor and the gamification module
  199. *
  200. * @param $elements
  201. *
  202. * @return array
  203. */
  204. private function buildChildStructure($elements)
  205. {
  206. $alreadyHandledIndexes = [];
  207. $build = [];
  208. // 0 1 2 3 4 5 6
  209. // 1 1.1 1.1.1 1.2 1.2.1 1.3 1.3.1 2 3 4 4.1 4.2 5 6
  210. for ($i = 0; $i < count($elements) - 1; $i++) {
  211. if (! isset($elements[ $i ][ 'type' ])) {
  212. if ($elements[ $i ][ 'top' ] < 100) {
  213. $elements[ $i ][ 'type' ] = 'header';
  214. } elseif ($elements[ $i ][ 'top' ] > 1150) {
  215. $elements[ $i ][ 'type' ] = 'footer';
  216. }
  217. }
  218. if (in_array($i, $alreadyHandledIndexes)) {
  219. continue;
  220. }
  221. if (isset($elements[ $i ][ 'type' ]) && in_array($elements[ $i ][ 'type' ], ['footer', 'header'])) {
  222. continue;
  223. }
  224. for ($j = $i + 1; $j < count($elements); $j++) {
  225. if (! isset($elements[ $j ][ 'type' ])) {
  226. if ($elements[ $j ][ 'top' ] < 100) {
  227. $elements[ $j ][ 'type' ] = 'header';
  228. } elseif ($elements[ $j ][ 'top' ] > 1150) {
  229. $elements[ $j ][ 'type' ] = 'footer';
  230. }
  231. }
  232. if (in_array($j, $alreadyHandledIndexes)) {
  233. continue;
  234. }
  235. if (isset($elements[ $j ][ 'type' ]) && in_array($elements[ $j ][ 'type' ], ['footer', 'header'])) {
  236. continue;
  237. }
  238. if ($elements[ $j ][ 'type' ] === 'title' && $elements[ $i ][ 'top' ] !== $elements[ $j ][ 'top' ] && ! ctype_digit(trim(preg_replace("/[^0-9a-zA-Z]/",
  239. "", strip_tags($elements[ $i ][ 'content' ]))))) {
  240. break;
  241. }
  242. if ($elements[ $i ][ 'left' ] < $elements[ $j ][ 'left' ] || ($elements[ $i ][ 'type' ] == 'title' && is_null($elements[ $j ][ 'type' ]))) {
  243. $elements[ $i ] = $this->handlePossibleChild($elements[ $i ], $elements[ $j ]);
  244. $alreadyHandledIndexes[] = $j;
  245. } else {
  246. break;
  247. }
  248. }
  249. if (! in_array($elements[ $i ][ 'type' ], ['header', 'footer'])) {
  250. $build[] = $elements[ $i ];
  251. }
  252. $alreadyHandledIndexes[] = $i;
  253. }
  254. return $build;
  255. }
  256. /**
  257. * Handle each node child's
  258. *
  259. * @param $parent
  260. * @param $child
  261. *
  262. * @return mixed
  263. */
  264. protected function handlePossibleChild($parent, $child)
  265. {
  266. // 1
  267. // 1.1
  268. // 1.1.1
  269. // 2
  270. // Must iterate through parent children
  271. if (count($parent[ 'children' ]) === 0) {
  272. $parent[ 'children' ][] = $child;
  273. return $parent;
  274. }
  275. $lastParentChild = last($parent[ 'children' ]);
  276. // Possible to be either child or grandchild
  277. if ($child[ 'left' ] > $lastParentChild[ 'left' ]) {
  278. $lastParentChild = $this->handlePossibleChild($lastParentChild, $child);
  279. } elseif ($child[ 'left' ] === $parent[ 'left' ] && $parent[ 'type' ] == 'title' && is_null($child[ 'type' ])) {
  280. $parent[ 'children' ][] = $child;
  281. return $parent;
  282. } else {
  283. if ($child[ 'left' ] === $lastParentChild[ 'left' ]) {
  284. $parent[ 'children' ][] = $child;
  285. return $parent;
  286. }
  287. }
  288. $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild;
  289. return $parent;
  290. }
  291. /**
  292. * Set's the title threshold
  293. *
  294. * @param $elements
  295. */
  296. protected function setTitleThreshold($elements)
  297. {
  298. $nextElement = null;
  299. foreach ($elements as $index => $element) {
  300. if ($index + 1 < count($elements) && ! isset($this->titleFontThreshold)) {
  301. $nextElement = $elements[ $index + 1 ];
  302. if ((isset($current->b) || $index == 0 || (! is_null($nextElement) && (int) $element[ 'font' ] < (int) $nextElement[ 'font' ]))) {
  303. $this->titleFontThreshold = (int) $element[ 'font' ];
  304. }
  305. } else {
  306. continue;
  307. }
  308. }
  309. }
  310. /**
  311. * Set's the header and footer threshold
  312. *
  313. * @param $elements
  314. */
  315. protected function setHeaderFooterThreshold($elements)
  316. {
  317. foreach ($elements as $index => $element) {
  318. if (isset($elements[ $index + 1 ]) && ! isset($this->headerFontFooterThreshold)) {
  319. $nextElement = $elements[ $index + 1 ];
  320. if (! isset($nextElement[ 'type' ]) && $element[ 'top' ] > $nextElement[ 'top' ]) {
  321. $this->headerFontFooterThreshold = $nextElement[ 'font' ];
  322. }
  323. } else {
  324. continue;
  325. }
  326. }
  327. }
  328. }