Repository for the search-and-displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) for use with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

269 lines
10 KiB

<?php
namespace App\Parser\DocxParser;
use App\Parser\DocxParser\Traits\Helper;
use Illuminate\Support\Facades\Log;
use PhpOffice\PhpWord\IOFactory;
use function GuzzleHttp\Psr7\str;
/**
 * Parses a DOCX document loaded through PhpWord into a nested list of
 * paragraph entries.
 *
 * Each section of the document is handed to a handler (obtained through
 * getHandler(), which is not defined in this class — presumably supplied by
 * the Helper trait). The resulting flat paragraph list is then grouped into
 * parent/children trees and numbered.
 */
class ParseDocx
{
    use Helper;

    /**
     * Number given to the next top-level paragraph by setTheNumbering().
     *
     * It is not reset anywhere in this class, so it keeps counting across
     * successive parses on the same instance.
     */
    protected $currentNumberingIndex = 1;

    /**
     * Load a DOCX file and return its parsed paragraph tree.
     *
     * @param mixed $file Value passed straight to IOFactory::load().
     *
     * @return array
     *
     * @throws \Exception When loading or parsing fails; the original exception
     *                    is attached as the previous one.
     */
    public function fromUploadedFile($file)
    {
        try {
            $docxFileLoader = IOFactory::load($file);
            Log::info('Parse docx');

            return $this->parseLoadedDocx($docxFileLoader);
        } catch (\Exception $exception) {
            // Report and rethrow instead of dumping and killing the process,
            // so that callers can actually handle the failure.
            Log::error('Parse docx failed: '.$exception->getMessage());

            throw new \Exception($exception->getMessage(), 0, $exception);
        }
    }

    /**
     * Collect the usable paragraphs of every section and number them.
     *
     * Text breaks are dropped; tables are always kept. When at least half of
     * the kept paragraphs carry a 'styleName', nesting is driven by the
     * 'styleDepth' key, otherwise by 'depth'.
     *
     * @param mixed $docx Loaded document exposing getSections().
     *
     * @return array
     */
    private function parseLoadedDocx($docx)
    {
        // Must start as an array: a document without usable paragraphs would
        // otherwise reach count() with an undefined variable.
        $result = [];
        $styles = 0;

        foreach ($docx->getSections() as $page) {
            $handler = $this->getHandler($page);
            $paragraphs = $handler->handle($page);
            if (! $paragraphs) {
                continue;
            }

            foreach ($paragraphs as $paragraph) {
                $type = $paragraph['type'] ?? null;

                $isTable = $type === 'table';
                $isContent = $paragraph
                    && $type !== 'textBreak'
                    && isset($paragraph['content']['type'])
                    && $paragraph['content']['type'] !== 'textBreak';

                if (! $isContent && ! $isTable) {
                    continue;
                }

                $result[] = $paragraph;
                if (isset($paragraph['styleName'])) {
                    $styles++;
                }
            }
        }

        $depthTypeType = count($result) / 2 <= $styles ? 'styleDepth' : 'depth';

        return $this->setTheNumbering($result, null, $depthTypeType);
    }

    /**
     * Group paragraphs into parent/children trees and assign numbering.
     *
     * A paragraph is numbered here when it is not a table, its style name does
     * not contain 'BodyText', and it is either at depth 0 or nested under an
     * already numbered parent. Top-level paragraphs take the running
     * $currentNumberingIndex ("3."); nested ones take the parent's numbering
     * followed by their 1-based position ("3.2."). Other paragraphs that
     * already carry a numbering and have children get their children numbered
     * through setChildrenNumbering().
     *
     * @param array       $paragraphs
     * @param string|null $parentNumbering Numbering of the enclosing paragraph, if any.
     * @param string      $depthType       Key holding the nesting depth ('depth' or 'styleDepth').
     *
     * @return array
     */
    private function setTheNumbering($paragraphs, $parentNumbering = null, $depthType = 'depth')
    {
        $result = [];
        $paragraphs = $this->buildTheChildrens($paragraphs, $depthType);

        foreach ($paragraphs as $index => $paragraph) {
            $isTable = ($paragraph['type'] ?? null) === 'table';
            $isNumberable = ($paragraph[$depthType] ?? null) === 0 || $parentNumbering;
            $isBodyText = strpos($paragraph['styleName'] ?? '', 'BodyText') !== false;

            if (! $isTable && $isNumberable && ! $isBodyText) {
                $position = (int) $index + 1;

                $paragraph['content']['numbering'] = $parentNumbering
                    ? $parentNumbering.$position.'.'
                    : $this->currentNumberingIndex.'.';
                $paragraph['content']['numbering_row'] = $parentNumbering
                    ? $position
                    : $this->currentNumberingIndex;

                if (! empty($paragraph['children'])) {
                    $paragraph['children'] = $this->setTheNumbering(
                        $paragraph['children'],
                        $paragraph['content']['numbering'],
                        $depthType
                    );
                }

                if (! $parentNumbering) {
                    $this->currentNumberingIndex++;
                }
            } elseif (isset($paragraph['content']['numbering']) && ! empty($paragraph['children'])) {
                $paragraph = $this->setChildrenNumbering($paragraph);
            }

            // Append the updated copy; appending the untouched list element
            // would throw away the numbering computed above.
            $result[] = $paragraph;
        }

        return $result;
    }

    /**
     * Number the children of an already numbered paragraph.
     *
     * Children that are list items, or that already carry a numbering, receive
     * "<parent numbering>.<n>" where n counts only the children that qualify.
     * The same is applied recursively to their own children.
     *
     * @param array $parent
     *
     * @return mixed
     */
    private function setChildrenNumbering($parent)
    {
        $parentNumber = $parent['content']['numbering'] ?? '';

        // Insert a separating dot unless the parent numbering already ends with one.
        $prefix = substr(trim($parentNumber), -1) === '.' ? $parentNumber : $parentNumber.'.';

        $numbering = 1;
        foreach ($parent['children'] ?? [] as $j => $child) {
            $qualifies = ($child['type'] ?? null) === 'listItemRun' || isset($child['content']['numbering']);
            if (! $qualifies) {
                continue;
            }

            $parent['children'][$j]['content']['numbering'] = $prefix.$numbering;

            // Children without a 'children' key are leaves; do not count() them.
            if (! empty($parent['children'][$j]['children'])) {
                $parent['children'][$j] = $this->setChildrenNumbering($parent['children'][$j]);
            }

            $numbering++;
        }

        return $parent;
    }

    /**
     * Fold a flat paragraph list into parents with nested children.
     *
     * For every paragraph $i not yet consumed, the following paragraphs $j are
     * offered to it as children for as long as one of these heuristics holds:
     *  1. both have a depth and $j is deeper than $i;
     *  2. $j is styled 'ListParagraph' while $i has no depth and its text ends with ':';
     *  3. $j ends with ':' and the paragraph after it starts with a lowercase
     *     letter or ends with ';' — the run of ';'-terminated paragraphs after
     *     $j becomes children of $j, which is then offered to $i;
     *  4. $i is styled 'Heading2', depths differ, and $j has a 'depth' or is an
     *     unstyled numbered text run.
     * The first $j matching none of them ends the scan for $i.
     *
     * @param array  $paragraphs
     * @param string $depthType  Key holding the nesting depth.
     *
     * @return array
     */
    private function buildTheChildrens($paragraphs, $depthType)
    {
        $alreadyHandledIndexes = [];
        $result = [];
        $total = count($paragraphs);

        for ($i = 0; $i < $total; $i++) {
            if (in_array($i, $alreadyHandledIndexes, true)) {
                continue;
            }

            for ($j = $i + 1; $j < $total; $j++) {
                if (in_array($j, $alreadyHandledIndexes, true)) {
                    continue;
                }

                // An empty paragraph is consumed and the one after it is examined instead.
                if (isset($paragraphs[$j]['content']['content']) && $paragraphs[$j]['content']['content'] === '<p></p>') {
                    $alreadyHandledIndexes[] = $j;
                    $j++;
                    // The skip may step past the end of the list; nothing is
                    // left to attach, so stop instead of reading a missing index.
                    if ($j >= $total) {
                        break;
                    }
                }

                if (
                    isset($paragraphs[$i][$depthType], $paragraphs[$j][$depthType])
                    && $paragraphs[$i][$depthType] < $paragraphs[$j][$depthType]
                ) {
                    $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);
                } elseif (
                    isset($paragraphs[$j]['styleName'])
                    && $paragraphs[$j]['styleName'] === 'ListParagraph'
                    && ! isset($paragraphs[$i][$depthType])
                    && substr(strip_tags($paragraphs[$i]['content']['content'] ?? ''), -1) === ':'
                ) {
                    $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);
                } elseif (
                    isset($paragraphs[$j + 1]['content']['content'], $paragraphs[$j]['content']['content'])
                    && substr(strip_tags($paragraphs[$j]['content']['content']), -1) === ':'
                    && (
                        ctype_lower(substr(trim(strip_tags($paragraphs[$j + 1]['content']['content'])), 0, 1))
                        || substr(trim(strip_tags($paragraphs[$j + 1]['content']['content'])), -1) === ';'
                    )
                ) {
                    $k = $j + 1;
                    // NOTE(review): $k is marked as consumed before it is known
                    // to end with ';'. If it only starts with a lowercase
                    // letter it is attached nowhere — confirm this is intended.
                    $alreadyHandledIndexes[] = $k;

                    while (isset($paragraphs[$k])) {
                        // Ignore the word 'and' so that "...; and" still counts as ending with ';'.
                        $text = str_replace(
                            'and',
                            '',
                            trim(strip_tags(str_replace('and', '', $paragraphs[$k]['content']['content'] ?? '')))
                        );
                        if (substr($text, -1) !== ';') {
                            break;
                        }

                        $paragraphs[$j]['children'][] = $paragraphs[$k];
                        $alreadyHandledIndexes[] = $k++;
                    }

                    $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);
                } elseif (
                    isset($paragraphs[$i]['styleName'])
                    && ($paragraphs[$i][$depthType] ?? null) !== ($paragraphs[$j][$depthType] ?? null)
                    && strpos($paragraphs[$i]['styleName'], 'Heading2') !== false
                    && (
                        isset($paragraphs[$j]['depth'])
                        || (
                            ($paragraphs[$j]['type'] ?? null) === 'textRun'
                            && isset($paragraphs[$j]['content']['numbering'])
                            && ! isset($paragraphs[$j]['styleName'])
                        )
                    )
                ) {
                    $paragraphs[$i] = $this->handlePossibleChild($paragraphs[$i], $paragraphs[$j], $i, $depthType);
                } else {
                    break;
                }

                $alreadyHandledIndexes[] = $j;
            }

            $result[] = $paragraphs[$i];
            $alreadyHandledIndexes[] = $i;
        }

        return $result;
    }

    /**
     * Attach $child to $parent, or to the deepest fitting descendant of
     * $parent's last child.
     *
     * @param array  $parent
     * @param array  $child
     * @param int    $i         Index of the parent in the flat list (not used here).
     * @param string $depthType Key holding the nesting depth.
     *
     * @return mixed The parent, with the child attached when a rule matched.
     */
    private function handlePossibleChild($parent, $child, $i, $depthType)
    {
        $parentDepth = $parent[$depthType] ?? null;
        $childDepth = $child[$depthType] ?? null;

        // A parent without a 'children' key is treated like one with an empty list.
        if (empty($parent['children'])) {
            $isDeeper = $parentDepth < $childDepth || $parentDepth === null;
            $isFirstLevelUnderHeading = strpos($parent['styleName'] ?? '', 'Heading') !== false
                && isset($child['content']['numbering'])
                && substr_count($child['content']['numbering'], '.') == 1;

            if ($isDeeper || $isFirstLevelUnderHeading) {
                $parent['children'][] = $child;
            }

            return $parent;
        }

        $lastParentChild = last($parent['children']);
        $lastDepth = $lastParentChild[$depthType] ?? null;

        // Possible to be either child or grandchild
        if ($lastDepth && $childDepth > $lastDepth) {
            $lastParentChild = $this->handlePossibleChild($lastParentChild, $child, $i, $depthType);
        } else {
            if ($childDepth === $lastDepth) {
                $parent['children'][] = $child;

                return $parent;
            }

            if (
                isset($lastParentChild['styleDepth'])
                && $lastParentChild['styleDepth'] === ($child['depth'] ?? null)
                && ($lastParentChild['index'] ?? null) !== ($child['index'] ?? null)
            ) {
                $parent['children'][] = $child;

                return $parent;
            }
            // NOTE(review): when neither rule above matches, the child is not
            // attached anywhere, yet the caller still marks it as consumed.
            // Confirm that dropping it is intended.
        }

        $parent['children'][count($parent['children']) - 1] = $lastParentChild;

        return $parent;
    }
}