From 59edf696ade695cd5800edaa627296a22fcd1b71 Mon Sep 17 00:00:00 2001 From: Orzu Ionut Date: Wed, 8 Sep 2021 16:31:43 +0300 Subject: [PATCH] S&D on original document recreates the original document format. Fix PDF issue. --- app/Ingest/DataJsonConvertor.php | 4 + app/Ingest/Office.php | 14 ++ app/Jobs/RecreateDocument.php | 211 +------------------------------ 3 files changed, 25 insertions(+), 204 deletions(-) diff --git a/app/Ingest/DataJsonConvertor.php b/app/Ingest/DataJsonConvertor.php index 32e2ef1..fcb66de 100644 --- a/app/Ingest/DataJsonConvertor.php +++ b/app/Ingest/DataJsonConvertor.php @@ -67,6 +67,10 @@ class DataJsonConvertor extends AbstractConvertor $this->setPath(str_replace($this->type, 'docx', $this->path)); + if ( ! $this->storage->exists($this->path)) { + throw new \Exception('Failed when converting from ' . $this->type . ' to DOCX for file: ' . $this->path . '. The DOCX file doesnt exist.'); + } + $this->type = 'docx'; } } diff --git a/app/Ingest/Office.php b/app/Ingest/Office.php index 4581dfd..45413e2 100644 --- a/app/Ingest/Office.php +++ b/app/Ingest/Office.php @@ -31,8 +31,15 @@ class Office protected function runConversion($convertTo, $filePath, $directoryPath) { + $pdfAdditionalOption = ''; + + if ($this->fileIsPDF($filePath)) { + $pdfAdditionalOption = "--infilter='writer_pdf_import'"; + } + $process = new Process([ 'soffice', + $pdfAdditionalOption, '--accept="pipe,name=soffice-pipe-' . $this->id . ';urp;StarOffice.ServiceMananger"', '-env:UserInstallation=file:///tmp/' . $this->directory, '--headless', @@ -50,6 +57,13 @@ class Office return $process->isSuccessful(); } + protected function fileIsPDF($filePath) + { + $s = '.pdf'; + + return substr($filePath, - strlen($s)) === $s; + } + protected function makeTemporaryDirectory() { (new Process(['mkdir /tmp/' . $this->directory]))->run(); diff --git a/app/Jobs/RecreateDocument.php b/app/Jobs/RecreateDocument.php index 998bb55..e7d9a0d 100644 --- a/app/Jobs/RecreateDocument.php +++ b/app/Jobs/RecreateDocument.php @@ -77,226 +77,29 @@ class RecreateDocument implements ShouldQueue ); } - $this->data['elements'] = $this->updateText($this->data['elements'], $textMapper)['elements']; + $this->data['elements'] = $this->updateText($this->data['elements'], $textMapper); } - protected function updateText($elements, $textMapper, $parentElement = null) + protected function updateText($elements, $textMapper) { - $stoppedAtIndex = null; - $collectionToAppend = []; - - for ($index = 0; $index < count($elements); $index++) { - $element = $elements[$index]; - + foreach ($elements as $index => $element) { if (array_key_exists('hash', $element)) { - $result = $this->processText($element, $textMapper); - - if ( ! $result['is_collection']) { - $elements[$index] = $result['element']; - } else { - if ($parentElement && $parentElement['element_type'] === 'TextRun') { - $stoppedAtIndex = $index; - $collectionToAppend = $result['elements']; - - break; - } else { - // We have a collection of elements instead of one, so we must push its siblings to make room. - $numberOfPlacesToMove = count($result['elements']) - 1; - - // Move siblings to make room for collection. - for ($i = count($elements) - 1; $i > $index; $i--) { - $elements[$i + $numberOfPlacesToMove] = $elements[$i]; - } - - foreach ($result['elements'] as $i => $collectionElement) { - $elements[$index + $i] = $collectionElement; - } - - $index += $numberOfPlacesToMove; - } - } + $elements[$index]['text'] = $textMapper[$element['hash']]; } if ( array_key_exists('text_object', $element) && array_key_exists('text', $element['text_object']) ) { -// $elements[$index]['text_object']['text'] = $textMapper[$element['text_object']['hash']]; - $result = $this->processText($element['text_object'], $textMapper); - - if ( ! $result['is_collection']) { - $elements[$index]['text_object'] = $result['element']; - } else { -// if ($parentElement && $parentElement['element_type'] === 'TextRun') { -// $stoppedAtIndex = $index; -// $collectionToAppend = $result['elements']; -// -// break; -// } else { -// // We have a collection of elements instead of one, so we must push its siblings to make room. -// $numberOfPlacesToMove = count($result['elements']) - 1; -// -// // Move siblings to make room for collection. -// for ($i = count($elements) - 1; $i > $index; $i--) { -// $elements[$i + $numberOfPlacesToMove] = $elements[$i]; -// } -// -// foreach ($result['elements'] as $i => $element) { -// $elements[$index + $i] = $element; -// } -// -// $index += $numberOfPlacesToMove; -// } - } + $elements[$index]['text_object']['text'] = $textMapper[$element['text_object']['hash']]; } if (isset($elements[$index]['elements'])) { - $result = $this->updateText($elements[$index]['elements'], $textMapper, $element); - - $elements[$index]['elements'] = $result['elements']; - - if ($result['has_stopped']) { - // One of the child has become a 'TextRun' and the current $element is also a 'TextRun' - // so the child will become a sibling of the $element. - - // Start from the end of the array and move elements. - $numberOfPlacesToMove = count($result['collection_to_append']) + count($result['unprocessed_elements']); - - for ($i = count($elements) - 1; $i > $index; $i--) { - $elements[$numberOfPlacesToMove + $i] = $elements[$i]; - } - - foreach ($result['collection_to_append'] as $i => $collectionElement) { - $elements[$index + 1 + $i] = $collectionElement; - } - - // 4 5 6 - - $elements[$index + 1] = $result['text_run_element']; - - foreach ($result['unprocessed_elements'] as $i => $unprocessedElement) { - $elements[$index + count($result['collection_to_append']) + $i] = $unprocessedElement; - } - - // Skip 'text_run_element' processing. - $index += 1; - } - } - } - - if ($stoppedAtIndex === null) { - return [ - 'has_stopped' => false, - 'elements' => $elements, - ]; - } - - $remainingElements = array_splice($elements, 0, $stoppedAtIndex); - $unprocessedElements = array_splice($elements, 1); - - return [ - 'has_stopped' => true, - 'elements' => $remainingElements, - 'collection_to_append' => $collectionToAppend, - 'unprocessed_elements' => $unprocessedElements, - ]; - } - - protected function processText($element, $textMapper) - { - $text = $textMapper[$element['hash']]; - - if ($element['text'] === $text) { - return [ - 'is_collection' => false, - 'element' => $element, - ]; - } - - $textWithDisplacement = $text; - - preg_match_all('/{[^\/][^{}]*}/', $textWithDisplacement, $accoladeMatches); - - // Nothing found. - if (count($accoladeMatches[0]) === 0) { - $element['text'] = $textWithDisplacement; - - return [ - 'is_collection' => false, - 'element' => $element, - ]; - } - - // In order to add bookmarks we can create a list of parent element and bookmark element which will be appended - // at the end of the elements list, so it will be created in the net steps. - - $elements = []; - - $textContents = $textWithDisplacement; - - $accoladeMatches = array_unique($accoladeMatches[0]); - - foreach ($accoladeMatches as $accoladeMatch) { - $textMatch = substr($accoladeMatch, 1, strlen($accoladeMatch) - 2); - - $pattern = '/(' . $accoladeMatch . ')([^{}\/]+)({\/' . $textMatch . '})/'; - - preg_match_all($pattern, $textWithDisplacement, $matches); - - if (($matchesCount = count($matches[0])) > 0) { - for ($i = 0; $i < $matchesCount; $i++) { - $foundText = $matches[0][$i]; - $displacement = $matches[1][$i]; - $textWithoutDisplacement = trim($matches[2][$i]); - - $index = strpos($textContents, $foundText); - $plainText = substr($textContents, 0, $index); - - if ($plainText) { - $elements[] = [ - 'element_type' => 'Text', - 'text' => $plainText, - 'font_style' => $element['font_style'], - 'paragraph_style' => $element['paragraph_style'], - ]; - } - - $elements[] = [ - 'element_type' => 'TextRun', - 'paragraph_style' => $element['paragraph_style'], - 'elements' => [ - [ - 'element_type' => 'Text', - 'text' => $textWithoutDisplacement, - 'font_style' => $element['font_style'], - 'paragraph_style' => $element['paragraph_style'], - ], - - [ - 'element_type' => 'Bookmark', - 'name' => substr($displacement, 1, strlen($displacement) - 2), - ] - ], - ]; - - $textContents = substr($textContents, $index + strlen($foundText)); - } + $elements[$index]['elements'] = $this->updateText($elements[$index]['elements'], $textMapper); } } - if ($textContents) { - $elements[] = [ - 'element_type' => 'Text', - 'text' => $textContents, - 'font_style' => $element['font_style'], - 'paragraph_style' => $element['paragraph_style'], - ]; - } - - return [ - 'is_collection' => true, - 'elements' => $elements, - ]; + return $elements; } protected function createDocx()