|
|
<?php
namespace App\Parser\DocxParser;
use App\Parser\DocxParser\Traits\Helper; use Illuminate\Support\Facades\Log; use PhpOffice\PhpWord\IOFactory; use function GuzzleHttp\Psr7\str;
/**
 * Turns a .docx document into a nested list of paragraph arrays and assigns
 * hierarchical numbering ("1.", "1.1.", ...) to them.
 *
 * Paragraphs are plain arrays produced by the section handlers. The keys read
 * by this class are: type, styleName, depth, styleDepth, index, children and
 * content (itself an array with type, content, numbering, numbering_row).
 * getHandler() is not defined here; presumably it comes from the Helper trait.
 */
class ParseDocx
{
    use Helper;

    /**
     * Next top-level number handed out by setTheNumbering().
     *
     * It is never reset in this class, so parsing a second document with the
     * same instance continues the sequence.
     *
     * @var int
     */
    protected $currentNumberingIndex = 1;

    /**
     * Load a .docx file and return its numbered paragraph tree.
     *
     * @param mixed $file Passed unchanged to IOFactory::load().
     *
     * @return array List of paragraph arrays, children nested under 'children'.
     *
     * @throws \Exception On any failure; the original exception is chained as "previous".
     */
    public function fromUploadedFile($file)
    {
        try {
            $docxFileLoader = IOFactory::load($file);
            Log::info('Parse docx');

            return $this->parseLoadedDocx($docxFileLoader);
        } catch (\Exception $exception) {
            // Keep the plain \Exception type callers already catch, but retain the cause.
            throw new \Exception($exception->getMessage(), 0, $exception);
        }
    }

    /**
     * Collect the relevant paragraphs of every section and number them.
     *
     * The depth key used for nesting is 'styleDepth' when at least half of the
     * kept paragraphs carry a styleName, otherwise 'depth'.
     *
     * @param object $docx Loaded document exposing getSections().
     *
     * @return array
     */
    private function parseLoadedDocx($docx)
    {
        $result = [];
        $styles = 0;

        foreach ($docx->getSections() as $page) {
            $handler = $this->getHandler($page);
            $paragraphs = $handler->handle($page);
            if (! $paragraphs) {
                continue;
            }

            foreach ($paragraphs as $paragraph) {
                if (! $this->isKeptParagraph($paragraph)) {
                    continue;
                }

                $result[] = $paragraph;
                if (isset($paragraph['styleName'])) {
                    $styles++;
                }
            }
        }

        $depthType = count($result) / 2 <= $styles ? 'styleDepth' : 'depth';

        return $this->setTheNumbering($result, null, $depthType);
    }

    /**
     * Decide whether a handler result is kept: tables always, other paragraphs
     * only when neither they nor their content are text breaks.
     *
     * @param mixed $paragraph
     *
     * @return bool
     */
    private function isKeptParagraph($paragraph)
    {
        if (! $paragraph) {
            return false;
        }

        if ($paragraph['type'] === 'table') {
            return true;
        }

        return $paragraph['type'] !== 'textBreak'
            && isset($paragraph['content']['type'])
            && $paragraph['content']['type'] !== 'textBreak';
    }

    /**
     * Nest the given paragraphs and assign their numbering.
     *
     * Top-level paragraphs (depth 0) receive "<n>." from the running counter;
     * paragraphs processed under a parent receive "<parent><position>.". Tables
     * and paragraphs whose styleName contains "BodyText" are not numbered here.
     * A paragraph that already has a numbering and has children only gets its
     * children renumbered via setChildrenNumbering().
     *
     * @param array       $paragraphs
     * @param string|null $parentNumbering Numbering prefix of the parent, null at top level.
     * @param string      $depthType       'depth' or 'styleDepth'.
     *
     * @return array
     */
    private function setTheNumbering($paragraphs, $parentNumbering = null, $depthType = 'depth')
    {
        $result = [];
        $paragraphs = $this->buildTheChildrens($paragraphs, $depthType);

        foreach ($paragraphs as $index => $paragraph) {
            $styleName = isset($paragraph['styleName']) ? $paragraph['styleName'] : '';
            $isTopLevel = isset($paragraph[$depthType]) && $paragraph[$depthType] === 0;

            if (
                $paragraph['type'] !== 'table'
                && ($isTopLevel || $parentNumbering)
                && strpos($styleName, 'BodyText') === false
            ) {
                $position = (int) $index + 1;

                $paragraph['content']['numbering'] = $parentNumbering
                    ? $parentNumbering.$position.'.'
                    : $this->currentNumberingIndex.'.';
                $paragraph['content']['numbering_row'] = $parentNumbering
                    ? $position
                    : $this->currentNumberingIndex;

                if (! empty($paragraph['children'])) {
                    $paragraph['children'] = $this->setTheNumbering(
                        $paragraph['children'],
                        $paragraph['content']['numbering'],
                        $depthType
                    );
                }

                if (! $parentNumbering) {
                    $this->currentNumberingIndex++;
                }
            } elseif (isset($paragraph['content']['numbering']) && ! empty($paragraph['children'])) {
                $paragraph = $this->setChildrenNumbering($paragraph);
            }

            // $paragraph is a by-value copy, so the updated copy is what must be stored.
            $result[] = $paragraph;
        }

        return $result;
    }

    /**
     * Renumber the children of an already-numbered paragraph.
     *
     * Only children of type 'listItemRun' or with an existing numbering are
     * renumbered; they get "<parent>.<n>" (a single dot between the parts),
     * counting from 1. Qualifying grandchildren are handled recursively.
     *
     * @param array $parent Paragraph with 'children' and content.numbering.
     *
     * @return array The parent with renumbered children.
     */
    private function setChildrenNumbering($parent)
    {
        $numbering = 1;

        foreach ($parent['children'] as $j => $child) {
            if ($child['type'] != 'listItemRun' && ! isset($child['content']['numbering'])) {
                continue;
            }

            $parentNumber = $parent['content']['numbering'];
            $separator = substr(trim($parentNumber), -1) === '.' ? '' : '.';
            $parent['children'][$j]['content']['numbering'] = $parentNumber.$separator.$numbering;

            if (! empty($parent['children'][$j]['children'])) {
                $parent['children'][$j] = $this->setChildrenNumbering($parent['children'][$j]);
            }

            $numbering++;
        }

        return $parent;
    }

    /**
     * Fold a flat paragraph list into a tree.
     *
     * For every not-yet-consumed paragraph $i the following paragraphs $j are
     * scanned and attached (through handlePossibleChild()) while one of these
     * holds; the scan stops at the first paragraph matching none of them:
     *  1. $j is deeper than $i according to $depthType;
     *  2. $j is a 'ListParagraph' and $i has no depth and its text ends with ':';
     *  3. $j ends with ':' and the paragraph after it starts lower-case or ends
     *     with ';' - the following ';'-terminated paragraphs become children of $j;
     *  4. $i is a 'Heading2' style with a different depth value and $j has a
     *     'depth' or is an unstyled, numbered textRun.
     * Empty '<p></p>' paragraphs are skipped.
     *
     * NOTE(review): every scanned $j is marked consumed even when
     * handlePossibleChild() returns the parent unchanged, in which case that
     * paragraph appears nowhere in the output - confirm this is intended.
     *
     * @param array  $paragraphs Flat, zero-indexed list of paragraphs.
     * @param string $depthType  'depth' or 'styleDepth'.
     *
     * @return array
     */
    private function buildTheChildrens($paragraphs, $depthType)
    {
        $handled = []; // consumed indexes, stored as keys for constant-time lookup
        $result = [];
        $total = count($paragraphs);

        for ($i = 0; $i < $total; $i++) {
            if (isset($handled[$i])) {
                continue;
            }

            for ($j = $i + 1; $j < $total; $j++) {
                if (isset($handled[$j])) {
                    continue;
                }

                if (
                    isset($paragraphs[$j]['content']['content'])
                    && $paragraphs[$j]['content']['content'] === '<p></p>'
                ) {
                    $handled[$j] = true;
                    $j++;
                    if ($j >= $total) {
                        // The empty paragraph was the last one; nothing left to inspect.
                        break;
                    }
                }

                if (
                    isset($paragraphs[$i][$depthType], $paragraphs[$j][$depthType])
                    && $paragraphs[$i][$depthType] < $paragraphs[$j][$depthType]
                ) {
                    $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);
                } elseif (
                    isset($paragraphs[$j]['styleName'])
                    && $paragraphs[$j]['styleName'] === 'ListParagraph'
                    && ! isset($paragraphs[$i][$depthType])
                    && substr(strip_tags($paragraphs[$i]['content']['content']), -1) === ':'
                ) {
                    $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);
                } elseif (
                    isset($paragraphs[$j + 1]['content']['content'], $paragraphs[$j]['content']['content'])
                    && substr(strip_tags($paragraphs[$j]['content']['content']), -1) === ':'
                    && $this->startsLowerOrEndsWithSemicolon($paragraphs[$j + 1]['content']['content'])
                ) {
                    $k = $j + 1;
                    // NOTE(review): $k is marked consumed before it is known to end with ';'.
                    // A lower-case item without ';' is therefore dropped - confirm intent.
                    $handled[$k] = true;
                    while (
                        isset($paragraphs[$k]['content']['content'])
                        && $this->endsWithSemicolonIgnoringAnd($paragraphs[$k]['content']['content'])
                    ) {
                        $paragraphs[$j]['children'][] = $paragraphs[$k];
                        $handled[$k] = true;
                        $k++;
                    }

                    $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);
                } elseif (
                    isset($paragraphs[$i]['styleName'])
                    && $paragraphs[$i][$depthType] !== $paragraphs[$j][$depthType]
                    && strpos($paragraphs[$i]['styleName'], 'Heading2') !== false
                    && (
                        // NOTE(review): grouping mirrors PHP precedence of the original
                        // expression (&& binds tighter than ||) - confirm it is intended.
                        isset($paragraphs[$j]['depth'])
                        || (
                            $paragraphs[$j]['type'] === 'textRun'
                            && isset($paragraphs[$j]['content']['numbering'])
                            && is_null($paragraphs[$j]['styleName'])
                        )
                    )
                ) {
                    $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);
                } else {
                    break;
                }

                $handled[$j] = true;
            }

            $result[] = $paragraphs[$i];
            $handled[$i] = true;
        }

        return $result;
    }

    /**
     * True when the visible text starts with a lower-case letter or ends with ';'.
     *
     * @param string $html
     *
     * @return bool
     */
    private function startsLowerOrEndsWithSemicolon($html)
    {
        $text = trim(strip_tags($html));

        return ctype_lower((string) substr($text, 0, 1)) || substr($text, -1) === ';';
    }

    /**
     * True when the visible text ends with ';' once every "and" has been removed
     * (so "...; and" still counts as a list item).
     *
     * @param string $html
     *
     * @return bool
     */
    private function endsWithSemicolonIgnoringAnd($html)
    {
        $text = str_replace('and', '', trim(strip_tags(str_replace('and', '', $html))));

        return substr($text, -1) === ';';
    }

    /**
     * Try to attach $child below $parent, descending into the parent's last
     * child when $child is deeper than it.
     *
     * When the child fits none of the rules the parent is returned unchanged.
     *
     * @param array  $parent
     * @param array  $child
     * @param int    $i         Index of the top-level parent; not used in the body.
     * @param string $depthType 'depth' or 'styleDepth'.
     *
     * @return array The (possibly updated) parent.
     */
    private function handlePossibleChild($parent, $child, $i, $depthType)
    {
        // A missing 'children' key is treated like an empty list.
        if (! isset($parent['children']) || count($parent['children']) === 0) {
            $isDeeper = $parent[$depthType] < $child[$depthType] || $parent[$depthType] === null;
            $isFirstLevelUnderHeading = isset($parent['styleName'])
                && strpos($parent['styleName'], 'Heading') !== false
                && isset($child['content']['numbering'])
                && substr_count($child['content']['numbering'], '.') == 1;

            if ($isDeeper || $isFirstLevelUnderHeading) {
                $parent['children'][] = $child;
            }

            return $parent;
        }

        $lastParentChild = last($parent['children']);

        // The candidate is either a sibling of the last child or one of its descendants.
        if ($lastParentChild[$depthType] && $child[$depthType] > $lastParentChild[$depthType]) {
            $lastParentChild = $this->handlePossibleChild($lastParentChild, $child, $i, $depthType);
        } else {
            if ($child[$depthType] === $lastParentChild[$depthType]) {
                $parent['children'][] = $child;

                return $parent;
            }

            if (
                isset($lastParentChild['styleDepth'])
                && $lastParentChild['styleDepth'] === $child['depth']
                && $lastParentChild['index'] !== $child['index']
            ) {
                $parent['children'][] = $child;

                return $parent;
            }
        }

        $parent['children'][count($parent['children']) - 1] = $lastParentChild;

        return $parent;
    }
}
|