You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
269 lines
10 KiB
269 lines
10 KiB
<?php
|
|
|
|
namespace App\Parser\DocxParser;
|
|
|
|
use App\Parser\DocxParser\Traits\Helper;
|
|
use Illuminate\Support\Facades\Log;
|
|
use PhpOffice\PhpWord\IOFactory;
|
|
use function GuzzleHttp\Psr7\str;
|
|
|
|
/**
 * Converts a .docx document loaded through PhpWord into a nested list of
 * paragraph arrays and assigns hierarchical numbering to them.
 *
 * Paragraph entries are plain arrays produced by the section handlers
 * (resolved via getHandler(), which is not defined in this class). The keys
 * read here are 'type', 'content', 'styleName', 'children', 'depth',
 * 'styleDepth' and 'index'; their exact producers are outside this file.
 */
class ParseDocx
{
    use Helper;

    /**
     * Number given to the next entry that is numbered without a parent prefix.
     *
     * @var int
     */
    protected $currentNumberingIndex = 1;

    /**
     * Load a document with PhpWord and parse it into numbered paragraphs.
     *
     * @param mixed $file Passed unchanged to IOFactory::load().
     *
     * @return array The list returned by setTheNumbering().
     *
     * @throws \Exception When loading or parsing fails. The message of the
     *                    original exception is kept and the original is
     *                    attached as the previous exception.
     */
    public function fromUploadedFile($file)
    {
        try {
            $docxFileLoader = IOFactory::load($file);
            Log::info('Parse docx');

            return $this->parseLoadedDocx($docxFileLoader);
        } catch (\Exception $exception) {
            // Previously a dd() call here terminated the process, so the
            // exception below could never reach the caller.
            Log::error('Parse docx failed: '.$exception->getMessage(), ['exception' => $exception]);

            throw new \Exception($exception->getMessage(), (int) $exception->getCode(), $exception);
        }
    }

    /**
     * Collect the usable paragraphs of every section and number them.
     *
     * An entry is kept when it is a table, or when neither the entry itself
     * nor its content is a 'textBreak'. The depth key used for nesting is
     * 'styleDepth' when at least half of the kept entries carry a style name,
     * otherwise 'depth'.
     *
     * @param mixed $docx Object exposing getSections(), as returned by IOFactory::load().
     *
     * @return array
     */
    private function parseLoadedDocx($docx)
    {
        // Must start as an array: a document without usable paragraphs would
        // otherwise reach count() with an undefined variable.
        $result = [];
        $styles = 0;

        foreach ($docx->getSections() as $page) {
            $paragraphs = $this->getHandler($page)->handle($page);
            if (! $paragraphs) {
                continue;
            }

            foreach ($paragraphs as $paragraph) {
                // Empty entries are skipped before any key is read.
                if (! $paragraph) {
                    continue;
                }

                $type = isset($paragraph['type']) ? $paragraph['type'] : null;
                $isTable = $type == 'table';
                $isContent = $type !== 'textBreak'
                    && isset($paragraph['content']['type'])
                    && $paragraph['content']['type'] !== 'textBreak';

                if (! $isContent && ! $isTable) {
                    continue;
                }

                $result[] = $paragraph;
                if (isset($paragraph['styleName'])) {
                    $styles++;
                }
            }
        }

        $depthType = count($result) / 2 <= $styles ? 'styleDepth' : 'depth';

        return $this->setTheNumbering($result, null, $depthType);
    }

    /**
     * Nest the given paragraphs and write 'numbering' / 'numbering_row' into their content.
     *
     * An entry is numbered here when it is not a table, its style name does
     * not contain 'BodyText', and it either sits at depth 0 or a parent
     * numbering prefix was passed in. Entries that already carry a numbering
     * and have children get their children renumbered by setChildrenNumbering().
     *
     * @param array       $paragraphs
     * @param string|null $parentNumbering Prefix of the parent entry, e.g. "2."; null at the top level.
     * @param string      $depthType       Key holding the nesting depth ('depth' or 'styleDepth').
     *
     * @return array
     */
    private function setTheNumbering($paragraphs, $parentNumbering = null, $depthType = 'depth')
    {
        $result = [];
        $paragraphs = $this->buildTheChildrens($paragraphs, $depthType);

        foreach ($paragraphs as $index => $paragraph) {
            $type = isset($paragraph['type']) ? $paragraph['type'] : null;
            $styleName = isset($paragraph['styleName']) ? (string) $paragraph['styleName'] : '';
            $isTopLevel = isset($paragraph[$depthType]) && $paragraph[$depthType] === 0;

            if ($type !== 'table' && ($isTopLevel || $parentNumbering) && strpos($styleName, 'BodyText') === false) {
                $position = (int) $index + 1;

                $paragraph['content']['numbering'] = $parentNumbering
                    ? $parentNumbering.$position.'.'
                    : $this->currentNumberingIndex.'.';
                $paragraph['content']['numbering_row'] = $parentNumbering
                    ? $position
                    : $this->currentNumberingIndex;

                if (! empty($paragraph['children'])) {
                    $paragraph['children'] = $this->setTheNumbering(
                        $paragraph['children'],
                        $paragraph['content']['numbering'],
                        $depthType
                    );
                }

                if (! $parentNumbering) {
                    $this->currentNumberingIndex++;
                }
            } elseif (isset($paragraph['content']['numbering']) && ! empty($paragraph['children'])) {
                $paragraph = $this->setChildrenNumbering($paragraph);
            }

            // Append the updated entry. Appending the untouched source entry
            // (as was done before) threw away the numbering assigned above.
            $result[] = $paragraph;
        }

        return $result;
    }

    /**
     * Number the children of an entry that already has a numbering.
     *
     * Only children of type 'listItemRun' or children that already carry a
     * numbering are counted; each receives "<parent numbering>.<n>", with the
     * dot omitted when the parent numbering already ends in one.
     *
     * @param $parent
     *
     * @return mixed
     */
    private function setChildrenNumbering($parent)
    {
        // A missing or empty 'children' key means there is nothing to number.
        if (empty($parent['children'])) {
            return $parent;
        }

        $parentNumber = isset($parent['content']['numbering']) ? $parent['content']['numbering'] : '';
        $separator = substr(trim($parentNumber), -1) === '.' ? '' : '.';
        $numbering = 1;

        foreach ($parent['children'] as $position => $child) {
            $isListItem = isset($child['type']) && $child['type'] == 'listItemRun';
            if (! $isListItem && ! isset($child['content']['numbering'])) {
                continue;
            }

            $child['content']['numbering'] = $parentNumber.$separator.$numbering;

            if (! empty($child['children'])) {
                $child = $this->setChildrenNumbering($child);
            }

            $parent['children'][$position] = $child;
            $numbering++;
        }

        return $parent;
    }

    /**
     * Fold a flat paragraph list into parents with 'children'.
     *
     * For every not yet consumed entry the following entries are offered to
     * handlePossibleChild() for as long as one of these holds:
     *  - the following entry has a greater depth;
     *  - the following entry is styled 'ListParagraph' and the current entry
     *    has no depth and its content ends with ':';
     *  - the following entry ends with ':' and the one after it starts with a
     *    lower-case letter or ends with ';' (items ending with ';' are first
     *    collected as children of the entry ending with ':');
     *  - the current entry's style contains 'Heading2', the depths differ and
     *    the following entry has a 'depth' or is an unstyled numbered 'textRun'.
     *
     * @param $paragraphs
     * @param $depthType
     *
     * @return array
     */
    private function buildTheChildrens($paragraphs, $depthType)
    {
        // Keyed by paragraph index so membership is a single isset() lookup.
        $handled = [];
        $result = [];
        $total = count($paragraphs);

        for ($i = 0; $i < $total; $i++) {
            if (isset($handled[$i])) {
                continue;
            }

            for ($j = $i + 1; $j < $total; $j++) {
                if (isset($handled[$j])) {
                    continue;
                }

                // An empty paragraph is consumed and the entry after it is examined instead.
                if (isset($paragraphs[$j]['content']['content']) && $paragraphs[$j]['content']['content'] === '<p></p>') {
                    $handled[$j] = true;
                    $j++;

                    // The empty paragraph was the last entry: nothing left to attach.
                    if ($j >= $total) {
                        break;
                    }
                }

                $parentDepth = isset($paragraphs[$i][$depthType]) ? $paragraphs[$i][$depthType] : null;
                $candidateDepth = isset($paragraphs[$j][$depthType]) ? $paragraphs[$j][$depthType] : null;
                $parentHtml = isset($paragraphs[$i]['content']['content'])
                    ? $paragraphs[$i]['content']['content']
                    : null;
                $candidateHtml = isset($paragraphs[$j]['content']['content'])
                    ? $paragraphs[$j]['content']['content']
                    : null;
                $followingText = isset($paragraphs[$j + 1]['content']['content'])
                    ? trim(strip_tags($paragraphs[$j + 1]['content']['content']))
                    : null;

                $isDeeper = $parentDepth !== null
                    && $candidateDepth !== null
                    && $parentDepth < $candidateDepth;

                $isListAfterIntro = isset($paragraphs[$j]['styleName'])
                    && $paragraphs[$j]['styleName'] === 'ListParagraph'
                    && $parentDepth === null
                    && $parentHtml !== null
                    && substr(strip_tags($parentHtml), -1) === ':';

                $opensInlineList = $followingText !== null
                    && $candidateHtml !== null
                    && substr(strip_tags($candidateHtml), -1) === ':'
                    && (ctype_lower(substr($followingText, 0, 1)) || substr($followingText, -1) === ';');

                $isUnderHeading2 = isset($paragraphs[$i]['styleName'])
                    && $parentDepth !== $candidateDepth
                    && strpos($paragraphs[$i]['styleName'], 'Heading2') !== false
                    && (
                        isset($paragraphs[$j]['depth'])
                        || (
                            isset($paragraphs[$j]['type'])
                            && $paragraphs[$j]['type'] == 'textRun'
                            && isset($paragraphs[$j]['content']['numbering'])
                            && ! isset($paragraphs[$j]['styleName'])
                        )
                    );

                if (! $isDeeper && ! $isListAfterIntro) {
                    if ($opensInlineList) {
                        $k = $j + 1;
                        // NOTE(review): the first following entry is marked as handled even
                        // when the loop below does not attach it (it starts lower-case but
                        // does not end with ';'); in that case it disappears from the
                        // output. Kept as-is; confirm whether that is intended.
                        $handled[$k] = true;

                        while (isset($paragraphs[$k]['content']['content'])) {
                            // The word 'and' is removed so items ending in "; and" still count.
                            $itemText = str_replace(
                                'and',
                                '',
                                trim(strip_tags(str_replace('and', '', $paragraphs[$k]['content']['content'])))
                            );
                            if (substr($itemText, -1) !== ';') {
                                break;
                            }

                            $paragraphs[$j]['children'][] = $paragraphs[$k];
                            $handled[$k] = true;
                            $k++;
                        }
                    } elseif (! $isUnderHeading2) {
                        break;
                    }
                }

                $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);

                // NOTE(review): the entry is marked as handled whether or not
                // handlePossibleChild() actually attached it.
                $handled[$j] = true;
            }

            $result[] = $paragraphs[$i];
            $handled[$i] = true;
        }

        return $result;
    }

    /**
     * Attach $child to $parent, or to the last descendant of $parent it fits under.
     *
     * @param $parent
     * @param $child
     * @param $i         Index of the parent in the caller's list; not used here.
     * @param $depthType Key holding the nesting depth ('depth' or 'styleDepth').
     *
     * @return mixed The parent, with the child attached when one of the rules matched.
     */
    private function handlePossibleChild($parent, $child, $i, $depthType)
    {
        $parentDepth = isset($parent[$depthType]) ? $parent[$depthType] : null;
        $childDepth = isset($child[$depthType]) ? $child[$depthType] : null;

        // No children yet (a missing 'children' key is treated the same way).
        if (empty($parent['children'])) {
            $parentStyle = isset($parent['styleName']) ? (string) $parent['styleName'] : '';

            $isDeeper = $parentDepth < $childDepth || $parentDepth === null;
            $isFirstLevelUnderHeading = strpos($parentStyle, 'Heading') !== false
                && isset($child['content']['numbering'])
                && substr_count($child['content']['numbering'], '.') == 1;

            if ($isDeeper || $isFirstLevelUnderHeading) {
                $parent['children'][] = $child;
            }

            return $parent;
        }

        $lastParentChild = last($parent['children']);
        $lastDepth = isset($lastParentChild[$depthType]) ? $lastParentChild[$depthType] : null;
        $lastIndex = isset($lastParentChild['index']) ? $lastParentChild['index'] : null;
        $childIndex = isset($child['index']) ? $child['index'] : null;

        $matchesStyleDepth = isset($lastParentChild['styleDepth'])
            && isset($child['depth'])
            && $lastParentChild['styleDepth'] === $child['depth']
            && $lastIndex !== $childIndex;

        if ($lastDepth && $childDepth > $lastDepth) {
            // Deeper than the last child: it is a grandchild (or further down).
            $lastParentChild = $this->handlePossibleChild($lastParentChild, $child, $i, $depthType);
        } elseif ($childDepth === $lastDepth || $matchesStyleDepth) {
            // Same level as the last child: it becomes its sibling.
            $parent['children'][] = $child;

            return $parent;
        }

        // NOTE(review): when neither rule above matched, $lastParentChild is
        // unchanged and $child is not attached anywhere.
        $parent['children'][count($parent['children']) - 1] = $lastParentChild;

        return $parent;
    }
}
|