Repository for the search-and-displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) to be used with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.

269 lines
10 KiB

  1. <?php
  2. namespace App\Parser\DocxParser;
  3. use App\Parser\DocxParser\Traits\Helper;
  4. use Illuminate\Support\Facades\Log;
  5. use PhpOffice\PhpWord\IOFactory;
  6. use function GuzzleHttp\Psr7\str;
  7. class ParseDocx
  8. {
  9. use Helper;
  10. protected $currentNumberingIndex = 1;
  /**
   * Load a document through PhpWord's IOFactory and convert it into the parsed paragraph tree.
   *
   * @param mixed $file Handed unchanged to IOFactory::load().
   *
   * @return mixed Whatever parseLoadedDocx() returns for the loaded document.
   *
   * @throws \Exception When loading or parsing fails; the original exception is attached as "previous".
   */
  public function fromUploadedFile($file)
  {
      try {
          $docxFileLoader = IOFactory::load($file);
          Log::info('Parse docx');

          return $this->parseLoadedDocx($docxFileLoader);
      } catch (\Exception $exception) {
          // A leftover dd() used to dump the exception and terminate the request here, which
          // made the rethrow unreachable. Rethrow the same exception type with the same message,
          // chaining the cause so its class and stack trace are not lost.
          throw new \Exception($exception->getMessage(), 0, $exception);
      }
  }
  /**
   * Collect the usable paragraphs of every section, then nest and number them.
   *
   * Each section is passed to the handler returned by getHandler(); the entries that
   * handler yields are kept when they are tables, or when neither the entry itself nor
   * its content is a "textBreak". When at least half of the kept entries carry a
   * "styleName", nesting is driven by the "styleDepth" key, otherwise by "depth".
   *
   * @param mixed $docx Loaded document; only getSections() is called on it.
   *
   * @return array Result of setTheNumbering() for the collected paragraphs.
   */
  private function parseLoadedDocx($docx)
  {
      // Initialised up front: a document without any usable paragraph must yield an
      // empty list instead of failing on count() of an undefined variable.
      $result = [];
      $styles = 0;

      foreach ($docx->getSections() as $page) {
          $handler = $this->getHandler($page);
          $paragraphs = $handler->handle($page);
          if (! $paragraphs) {
              continue;
          }

          foreach ($paragraphs as $paragraph) {
              if (! $paragraph) {
                  continue;
              }

              $type = isset($paragraph['type']) ? $paragraph['type'] : null;

              // Same grouping as the original "A && B && C || D" expression: tables are
              // always kept, everything else must have typed, non-break content.
              $isTable = $type === 'table';
              $isContent = $type !== 'textBreak'
                  && isset($paragraph['content']['type'])
                  && $paragraph['content']['type'] !== 'textBreak';

              if (! $isTable && ! $isContent) {
                  continue;
              }

              $result[] = $paragraph;
              if (isset($paragraph['styleName'])) {
                  $styles++;
              }
          }
      }

      $depthTypeType = count($result) / 2 <= $styles ? 'styleDepth' : 'depth';

      return $this->setTheNumbering($result, null, $depthTypeType);
  }
  /**
   * Nest a flat paragraph list and assign hierarchical numbering to it.
   *
   * The list is first folded into a tree by buildTheChildrens(). Then, for each entry:
   *  - a non-table entry whose style name does not contain "BodyText" and that either has
   *    $depthType 0 or is processed under a parent number receives content.numbering and
   *    content.numbering_row, and its children are numbered recursively beneath it;
   *  - otherwise, an entry that already carries content.numbering and has children gets
   *    those children numbered by setChildrenNumbering().
   * Top-level numbers are taken from $this->currentNumberingIndex (advanced per numbered
   * entry); nested numbers are the parent number followed by the 1-based list position.
   *
   * NOTE(review): the previous implementation computed the numbering on a local copy and
   * then appended the untouched original entry, so the first rule never reached the output.
   * The numbered entry is now the one returned — confirm downstream consumers expect it.
   *
   * @param array       $paragraphs      Flat list of paragraph arrays.
   * @param string|null $parentNumbering Number of the enclosing entry, e.g. "2.", or null at top level.
   * @param string      $depthType       Key holding the nesting level ("depth" or "styleDepth").
   *
   * @return array The nested and numbered paragraphs.
   */
  private function setTheNumbering($paragraphs, $parentNumbering = null, $depthType = 'depth')
  {
      $result = [];
      $paragraphs = $this->buildTheChildrens($paragraphs, $depthType);

      foreach ($paragraphs as $index => $paragraph) {
          $type = isset($paragraph['type']) ? $paragraph['type'] : null;
          $depth = isset($paragraph[$depthType]) ? $paragraph[$depthType] : null;
          $styleName = isset($paragraph['styleName']) ? (string) $paragraph['styleName'] : '';
          $hasChildren = ! empty($paragraph['children']);

          $isNumberable = $type !== 'table'
              && ($depth === 0 || $parentNumbering)
              && strpos($styleName, 'BodyText') === false;

          if ($isNumberable) {
              if ($parentNumbering) {
                  $position = (int) $index + 1;
                  $paragraph['content']['numbering'] = $parentNumbering.$position.'.';
                  $paragraph['content']['numbering_row'] = $position;
              } else {
                  $paragraph['content']['numbering'] = $this->currentNumberingIndex.'.';
                  $paragraph['content']['numbering_row'] = $this->currentNumberingIndex;
              }

              if ($hasChildren) {
                  $paragraph['children'] = $this->setTheNumbering(
                      $paragraph['children'],
                      $paragraph['content']['numbering'],
                      $depthType
                  );
              }

              // Only top-level entries consume the running counter; nested entries are
              // numbered by their position under the parent.
              if (! $parentNumbering) {
                  $this->currentNumberingIndex++;
              }
          } elseif (isset($paragraph['content']['numbering']) && $hasChildren) {
              $paragraph = $this->setChildrenNumbering($paragraph);
          }

          $result[] = $paragraph;
      }

      return $result;
  }
  /**
   * Number the children of an already numbered paragraph, recursing into grandchildren.
   *
   * A child is numbered when its type is "listItemRun" or it already has
   * content.numbering. Its new number is the parent's content.numbering, a "." when the
   * parent's number does not already end with one, and a counter that starts at 1 and
   * only advances for children that were numbered. Other children are left untouched.
   *
   * @param array $parent Paragraph array; reads content.numbering and children.
   *
   * @return array The parent with its children renumbered.
   */
  private function setChildrenNumbering($parent)
  {
      if (empty($parent['children'])) {
          return $parent;
      }

      // The prefix is the same for every child, so build it once.
      $parentNumber = isset($parent['content']['numbering']) ? $parent['content']['numbering'] : '';
      $prefix = substr(trim($parentNumber), -1) === '.' ? $parentNumber : $parentNumber.'.';

      $numbering = 1;
      foreach ($parent['children'] as $j => $child) {
          $isListItem = isset($child['type']) && $child['type'] === 'listItemRun';
          if (! $isListItem && ! isset($child['content']['numbering'])) {
              continue;
          }

          $child['content']['numbering'] = $prefix.$numbering;

          // A child without a "children" key is treated as a leaf; calling count() on the
          // missing key raised a warning on PHP 7.2+ and a TypeError on PHP 8.
          if (! empty($child['children'])) {
              $child = $this->setChildrenNumbering($child);
          }

          $parent['children'][$j] = $child;
          $numbering++;
      }

      return $parent;
  }
  /**
   * Fold a flat paragraph list into a tree by attaching following paragraphs as children.
   *
   * For every paragraph not consumed yet, the paragraphs after it are offered to
   * handlePossibleChild() for as long as one of these rules holds:
   *  1. both have a $depthType value and the later one is deeper;
   *  2. the later one is styled "ListParagraph", the current one has no $depthType value
   *     and its tag-stripped text ends with ":";
   *  3. the later one ends with ":" and the paragraph after it starts with a lower-case
   *     letter or ends with ";" — the run of paragraphs ending in ";" (ignoring the
   *     word "and") is first attached as children of the later one;
   *  4. the current one has a style containing "Heading2", the $depthType values differ,
   *     and the later one has a "depth" key or is an unstyled "textRun" with numbering.
   * The first paragraph matching none of the rules ends the scan for the current one.
   * A paragraph whose HTML is exactly "<p></p>" is consumed during the scan without
   * being attached to anything.
   *
   * Every offered paragraph is marked as consumed, whether or not
   * handlePossibleChild() actually attached it.
   *
   * @param array  $paragraphs Flat, zero-indexed list of paragraph arrays.
   * @param string $depthType  Key holding the nesting level ("depth" or "styleDepth").
   *
   * @return array The top-level paragraphs with their children attached.
   */
  private function buildTheChildrens($paragraphs, $depthType)
  {
      if (empty($paragraphs)) {
          return [];
      }

      // HTML string of a paragraph, or null when it has no string content.
      $htmlOf = function ($paragraph) {
          $hasHtml = isset($paragraph['content']['content']) && is_string($paragraph['content']['content']);

          return $hasHtml ? $paragraph['content']['content'] : null;
      };
      // Nesting level of a paragraph, or null when the key is missing.
      $depthOf = function ($paragraph) use ($depthType) {
          return isset($paragraph[$depthType]) ? $paragraph[$depthType] : null;
      };
      // Last byte of a string ('' for an empty string).
      $lastChar = function ($text) {
          return (string) substr($text, -1);
      };
      // True when the text ends with ";" once the word "and" and the markup are removed.
      $endsListItem = function ($html) use ($lastChar) {
          if ($html === null) {
              return false;
          }
          $text = str_replace('and', '', trim(strip_tags(str_replace('and', '', $html))));

          return $lastChar($text) === ';';
      };

      $total = count($paragraphs);
      $handled = []; // consumed indexes, kept as array keys for constant-time lookups
      $result = [];

      for ($i = 0; $i < $total; $i++) {
          if (isset($handled[$i])) {
              continue;
          }

          for ($j = $i + 1; $j < $total; $j++) {
              if (isset($handled[$j])) {
                  continue;
              }

              if ($htmlOf($paragraphs[$j]) === '<p></p>') {
                  $handled[$j] = true;
                  $j++;
                  // The skip may step past the last paragraph; there is nothing left to
                  // compare, and reading that index only produced undefined-offset errors.
                  if ($j >= $total) {
                      break;
                  }
              }

              $parentDepth = $depthOf($paragraphs[$i]);
              $childDepth = $depthOf($paragraphs[$j]);
              $parentHtml = $htmlOf($paragraphs[$i]);
              $childHtml = $htmlOf($paragraphs[$j]);
              $childStyle = isset($paragraphs[$j]['styleName']) ? $paragraphs[$j]['styleName'] : null;
              $childType = isset($paragraphs[$j]['type']) ? $paragraphs[$j]['type'] : null;

              $nextHtml = isset($paragraphs[$j + 1]) ? $htmlOf($paragraphs[$j + 1]) : null;
              $nextText = $nextHtml === null ? '' : trim(strip_tags($nextHtml));

              if ($parentDepth !== null && $childDepth !== null && $parentDepth < $childDepth) {
                  // Rule 1: deeper paragraph.
              } elseif (
                  $childStyle === 'ListParagraph'
                  && $parentDepth === null
                  && $parentHtml !== null
                  && $lastChar(strip_tags($parentHtml)) === ':'
              ) {
                  // Rule 2: list paragraph following an introduction ending with ":".
              } elseif (
                  $childHtml !== null
                  && $nextHtml !== null
                  && $lastChar(strip_tags($childHtml)) === ':'
                  && (ctype_lower((string) substr($nextText, 0, 1)) || $lastChar($nextText) === ';')
              ) {
                  // Rule 3: collect the ";"-terminated run as children of paragraph $j.
                  // An index is marked as consumed only once it has really been attached;
                  // marking $j + 1 up front made a lower-case paragraph that does not end
                  // with ";" disappear from the output.
                  $k = $j + 1;
                  while (isset($paragraphs[$k]) && $endsListItem($htmlOf($paragraphs[$k]))) {
                      $paragraphs[$j]['children'][] = $paragraphs[$k];
                      $handled[$k] = true;
                      $k++;
                  }
              } elseif (
                  isset($paragraphs[$i]['styleName'])
                  && $parentDepth !== $childDepth
                  && strpos($paragraphs[$i]['styleName'], 'Heading2') !== false
                  && (
                      // Grouping kept exactly as PHP evaluated the original expression:
                      // "has depth" OR ("numbered textRun" AND "no style name").
                      isset($paragraphs[$j]['depth'])
                      || (
                          $childType === 'textRun'
                          && isset($paragraphs[$j]['content']['numbering'])
                          && ! isset($paragraphs[$j]['styleName'])
                      )
                  )
              ) {
                  // Rule 4: content below a "Heading2" styled paragraph.
              } else {
                  break;
              }

              $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);
              $handled[$j] = true;
          }

          $result[] = $paragraphs[$i];
          $handled[$i] = true;
      }

      return $result;
  }
  /**
   * Attach $child below $parent: as a direct child, or deeper under the parent's last child.
   *
   * When the parent has no children yet, the child is attached if it is deeper than the
   * parent, if the parent has no $depthType value, or if the parent's style contains
   * "Heading" and the child's numbering contains exactly one ".".
   * Otherwise the parent's last child decides: a child deeper than that last child is
   * handed down recursively; a child on the same level, or one whose "depth" equals the
   * last child's "styleDepth" while their "index" values differ, becomes a sibling.
   *
   * NOTE(review): when none of these rules match, the child is not attached anywhere and
   * the parent is returned unchanged. buildTheChildrens() still treats the child as
   * consumed, so it is absent from the output — confirm this is intended.
   *
   * @param array  $parent    Paragraph that may receive the child.
   * @param array  $child     Paragraph to place.
   * @param int    $i         Index of the top-level parent; only passed on to the recursion.
   * @param string $depthType Key holding the nesting level ("depth" or "styleDepth").
   *
   * @return array The parent, with the child attached when a rule matched.
   */
  private function handlePossibleChild($parent, $child, $i, $depthType)
  {
      $parentDepth = isset($parent[$depthType]) ? $parent[$depthType] : null;
      $childDepth = isset($child[$depthType]) ? $child[$depthType] : null;

      // A missing "children" key is handled like an empty list. It used to fall through
      // to last() on a non-array and left a bogus [-1 => false] entry in the parent.
      if (empty($parent['children'])) {
          $parentStyle = isset($parent['styleName']) ? (string) $parent['styleName'] : '';

          $isDeeper = $parentDepth < $childDepth || $parentDepth === null;
          $isNumberedUnderHeading = strpos($parentStyle, 'Heading') !== false
              && isset($child['content']['numbering'])
              && substr_count($child['content']['numbering'], '.') === 1;

          if ($isDeeper || $isNumberedUnderHeading) {
              $parent['children'][] = $child;
          }

          return $parent;
      }

      $lastParentChild = last($parent['children']);
      $lastDepth = isset($lastParentChild[$depthType]) ? $lastParentChild[$depthType] : null;

      // Possible to be either child or grandchild.
      if ($lastDepth && $childDepth > $lastDepth) {
          $parent['children'][count($parent['children']) - 1] = $this->handlePossibleChild(
              $lastParentChild,
              $child,
              $i,
              $depthType
          );

          return $parent;
      }

      if ($childDepth === $lastDepth) {
          $parent['children'][] = $child;

          return $parent;
      }

      $lastIndex = isset($lastParentChild['index']) ? $lastParentChild['index'] : null;
      $childIndex = isset($child['index']) ? $child['index'] : null;
      $matchesStyleDepth = isset($lastParentChild['styleDepth'], $child['depth'])
          && $lastParentChild['styleDepth'] === $child['depth'];

      if ($matchesStyleDepth && $lastIndex !== $childIndex) {
          $parent['children'][] = $child;
      }

      return $parent;
  }
  195. }