diff --git a/app/Http/Controllers/IngestController.php b/app/Http/Controllers/IngestController.php index 25e603d..ca2200b 100644 --- a/app/Http/Controllers/IngestController.php +++ b/app/Http/Controllers/IngestController.php @@ -11,6 +11,7 @@ class IngestController extends Controller request()->validate([ 'id' => 'required', 'file_result_type' => 'required|in:md,original', + 'mime_type' => 'required', 'document' => 'required|file', ]); diff --git a/app/Http/Controllers/RecreateDocumentController.php b/app/Http/Controllers/RecreateDocumentController.php index f5835e4..d9eb655 100644 --- a/app/Http/Controllers/RecreateDocumentController.php +++ b/app/Http/Controllers/RecreateDocumentController.php @@ -46,7 +46,7 @@ class RecreateDocumentController extends Controller return ''; } - return response()->download($storage->path($fullPath), 'document.docx', []) + return response()->download($storage->path($fullPath), $filePath, []) ->deleteFileAfterSend(true); } } diff --git a/app/Ingest/AbstractConvertor.php b/app/Ingest/AbstractConvertor.php index b2539e8..689dbf6 100644 --- a/app/Ingest/AbstractConvertor.php +++ b/app/Ingest/AbstractConvertor.php @@ -17,6 +17,11 @@ abstract class AbstractConvertor abstract public function execute(); + public function getPath() + { + return $this->path; + } + public function setPath($path) { $this->path = $path; diff --git a/app/Ingest/DocxConvertor.php b/app/Ingest/DocxConvertor.php index d5ca303..b1e179b 100644 --- a/app/Ingest/DocxConvertor.php +++ b/app/Ingest/DocxConvertor.php @@ -15,13 +15,11 @@ class DocxConvertor extends AbstractConvertor { $this->convertToPdfWithLibreOffice(); - $pdfFilePath = "$this->directoryPath/document.pdf"; - - if ( ! $this->storage->exists($pdfFilePath)) { - throw new \Exception('Failed to convert to PDF: ' . $pdfFilePath); + if ( ! $this->storage->exists($this->path)) { + throw new \Exception('Failed to convert to PDF: ' . 
$this->path); } - $convertor = new PDFConvertor($this->storage, $pdfFilePath); + $convertor = new PDFConvertor($this->storage, $this->path); $convertor->execute(); } @@ -53,20 +51,67 @@ class DocxConvertor extends AbstractConvertor * * @throws \Exception */ - protected function convertToPdfWithLibreOffice() + public function convertToPdfWithLibreOffice() + { + $this->convertToFormat('pdf'); + } + + /** + * + * @throws \Exception + */ + public function convertToODT() + { + $this->convertToFormat('odt'); + } + + /** + * + * @throws \Exception + */ + public function convertToRTF() + { + $this->convertToFormat('rtf'); + } + + /** + * + * @throws \Exception + */ + public function convertToDOC() + { + $this->convertToFormat('doc'); + } + + /** + * + * @throws \Exception + */ + public function convertToTXT() + { + $this->convertToFormat('txt'); + } + + /** + * + * @throws \Exception + */ + protected function convertToFormat($format) { $office = new Office(); $success = $office->run( - 'pdf', + $format, $this->storage->path($this->path), $this->storage->path($this->directoryPath) ); if (! $success) { - throw new \Exception('Failed when converting from DOCX to PDF for file: ' . $this->path); + throw new \Exception('Failed when converting from DOCX to ' . strtoupper($format) . ' for file: ' . $this->path); } $this->deleteOriginalDocument(); + + $this->path = "$this->directoryPath/document.$format"; } } diff --git a/app/Ingest/DocxReader.php b/app/Ingest/DocxReader.php index feb7753..c8be1cc 100644 --- a/app/Ingest/DocxReader.php +++ b/app/Ingest/DocxReader.php @@ -49,15 +49,21 @@ class DocxReader extends AbstractConvertor // Converting to HTML and then back to DOCX loses some content and styles (lost when converted to HTML). $data = []; -// $extension = pathinfo($this->path)['extension']; -// $readerName = array_key_exists($extension, $this->readersMapper) -// ? 
$this->readersMapper[$extension] -// : 'Word2007'; - -// $handler = IOFactory::load($this->storage->path($this->path), $readerName); - $handler = IOFactory::load($this->storage->path($this->path)); + /** + * @ISSUE + * At the time of this writing (08/sept/2021) phpword does not support reading bookmarks from the + * DOCX file. To add that support we can, for example, add the following lines of + * code to the AbstractPart.php file before the Text and TextRun handling case. + * + * $els = $xmlReader->getElements('w:bookmarkStart', $domNode); + * if ($els && $els->count() > 0) { + * $parent->addBookmark($els[0]->getAttribute('w:name')); + * } + * + */ + $data['default_font_name'] = $handler->getDefaultFontName(); $data['default_font_size'] = $handler->getDefaultFontSize(); $data['styles'] = $this->handleStyles(Style::getStyles()); diff --git a/app/Jobs/IngestDocuments.php b/app/Jobs/IngestDocuments.php index f9bbdc0..413e309 100644 --- a/app/Jobs/IngestDocuments.php +++ b/app/Jobs/IngestDocuments.php @@ -76,7 +76,7 @@ class IngestDocuments implements ShouldQueue $directoryPath = pathinfo($this->path, PATHINFO_DIRNAME); if ($this->fromRequest) { - SendToCore::dispatch($this->id, $this->fileResultType, $directoryPath); + SendToCore::dispatch($this->id, $this->fileResultType, $this->type, $directoryPath); return; } @@ -132,7 +132,7 @@ class IngestDocuments implements ShouldQueue $directoryPath = pathinfo($this->path, PATHINFO_DIRNAME); if ($this->fromRequest) { - SendToCore::dispatch($this->id, $this->fileResultType, $directoryPath, true); + SendToCore::dispatch($this->id, $this->fileResultType, $this->type, $directoryPath, true); return; } diff --git a/app/Jobs/RecreateDocument.php b/app/Jobs/RecreateDocument.php index 03f84fe..998bb55 100644 --- a/app/Jobs/RecreateDocument.php +++ b/app/Jobs/RecreateDocument.php @@ -2,6 +2,7 @@ namespace App\Jobs; +use App\Ingest\DocxConvertor; use App\Ingest\DocxWriter; use Illuminate\Bus\Queueable; use 
Illuminate\Contracts\Queue\ShouldQueue; @@ -19,6 +20,7 @@ class RecreateDocument implements ShouldQueue protected $id; protected $data; protected $storage; + protected $path; protected $url; protected $secret; @@ -33,6 +35,8 @@ class RecreateDocument implements ShouldQueue $this->data = $data; $this->storage = Storage::disk('local'); + $this->path = ''; + $this->url = env('WEBHOOK_CORE_URL') . '/webhooks'; $this->secret = env('WEBHOOK_CORE_SECRET'); } @@ -49,7 +53,7 @@ class RecreateDocument implements ShouldQueue $this->createDocx(); - // Convert to original format, either PDF, ODT, etc. + $this->convertToOriginalDocumentFormat(); $this->sendResponse('success'); } catch (\Exception $exception) { @@ -66,36 +70,233 @@ class RecreateDocument implements ShouldQueue $textMapper = []; foreach ($this->data['contents']['elements'] as $element) { - $textMapper[$element['hash']] = substr( + $textMapper[$element['hash']] =substr( $text, $element['range_start'], $element['range_end'] - $element['range_start'] + 1 ); } - $this->data['elements'] = $this->updateText($this->data['elements'], $textMapper); + $this->data['elements'] = $this->updateText($this->data['elements'], $textMapper)['elements']; } - protected function updateText($elements, $textMapper) + protected function updateText($elements, $textMapper, $parentElement = null) { - foreach ($elements as $index => $element) { + $stoppedAtIndex = null; + $collectionToAppend = []; + + for ($index = 0; $index < count($elements); $index++) { + $element = $elements[$index]; + if (array_key_exists('hash', $element)) { - $elements[$index]['text'] = $textMapper[$element['hash']]; + $result = $this->processText($element, $textMapper); + + if ( ! 
$result['is_collection']) { + $elements[$index] = $result['element']; + } else { + if ($parentElement && $parentElement['element_type'] === 'TextRun') { + $stoppedAtIndex = $index; + $collectionToAppend = $result['elements']; + + break; + } else { + // We have a collection of elements instead of one, so we must push its siblings to make room. + $numberOfPlacesToMove = count($result['elements']) - 1; + + // Move siblings to make room for collection. + for ($i = count($elements) - 1; $i > $index; $i--) { + $elements[$i + $numberOfPlacesToMove] = $elements[$i]; + } + + foreach ($result['elements'] as $i => $collectionElement) { + $elements[$index + $i] = $collectionElement; + } + + $index += $numberOfPlacesToMove; + } + } } if ( array_key_exists('text_object', $element) && array_key_exists('text', $element['text_object']) ) { - $elements[$index]['text_object']['text'] = $textMapper[$element['text_object']['hash']]; +// $elements[$index]['text_object']['text'] = $textMapper[$element['text_object']['hash']]; + $result = $this->processText($element['text_object'], $textMapper); + + if ( ! $result['is_collection']) { + $elements[$index]['text_object'] = $result['element']; + } else { +// if ($parentElement && $parentElement['element_type'] === 'TextRun') { +// $stoppedAtIndex = $index; +// $collectionToAppend = $result['elements']; +// +// break; +// } else { +// // We have a collection of elements instead of one, so we must push its siblings to make room. +// $numberOfPlacesToMove = count($result['elements']) - 1; +// +// // Move siblings to make room for collection. 
+// for ($i = count($elements) - 1; $i > $index; $i--) { +// $elements[$i + $numberOfPlacesToMove] = $elements[$i]; +// } +// +// foreach ($result['elements'] as $i => $element) { +// $elements[$index + $i] = $element; +// } +// +// $index += $numberOfPlacesToMove; +// } + } } if (isset($elements[$index]['elements'])) { - $elements[$index]['elements'] = $this->updateText($elements[$index]['elements'], $textMapper); + $result = $this->updateText($elements[$index]['elements'], $textMapper, $element); + + $elements[$index]['elements'] = $result['elements']; + + if ($result['has_stopped']) { + // One of the child has become a 'TextRun' and the current $element is also a 'TextRun' + // so the child will become a sibling of the $element. + + // Start from the end of the array and move elements. + $numberOfPlacesToMove = count($result['collection_to_append']) + count($result['unprocessed_elements']); + + for ($i = count($elements) - 1; $i > $index; $i--) { + $elements[$numberOfPlacesToMove + $i] = $elements[$i]; + } + + foreach ($result['collection_to_append'] as $i => $collectionElement) { + $elements[$index + 1 + $i] = $collectionElement; + } + + // 4 5 6 + + $elements[$index + 1] = $result['text_run_element']; + + foreach ($result['unprocessed_elements'] as $i => $unprocessedElement) { + $elements[$index + count($result['collection_to_append']) + $i] = $unprocessedElement; + } + + // Skip 'text_run_element' processing. 
+ $index += 1; + } + } + } + + if ($stoppedAtIndex === null) { + return [ + 'has_stopped' => false, + 'elements' => $elements, + ]; + } + + $remainingElements = array_splice($elements, 0, $stoppedAtIndex); + $unprocessedElements = array_splice($elements, 1); + + return [ + 'has_stopped' => true, + 'elements' => $remainingElements, + 'collection_to_append' => $collectionToAppend, + 'unprocessed_elements' => $unprocessedElements, + ]; + } + + protected function processText($element, $textMapper) + { + $text = $textMapper[$element['hash']]; + + if ($element['text'] === $text) { + return [ + 'is_collection' => false, + 'element' => $element, + ]; + } + + $textWithDisplacement = $text; + + preg_match_all('/{[^\/][^{}]*}/', $textWithDisplacement, $accoladeMatches); + + // Nothing found. + if (count($accoladeMatches[0]) === 0) { + $element['text'] = $textWithDisplacement; + + return [ + 'is_collection' => false, + 'element' => $element, + ]; + } + + // In order to add bookmarks we can create a list of parent elements and bookmark elements which will be appended + // at the end of the elements list, so it will be created in the next steps. + + $elements = []; + + $textContents = $textWithDisplacement; + + $accoladeMatches = array_unique($accoladeMatches[0]); + + foreach ($accoladeMatches as $accoladeMatch) { + $textMatch = substr($accoladeMatch, 1, strlen($accoladeMatch) - 2); + + $pattern = '/(' . $accoladeMatch . ')([^{}\/]+)({\/' . $textMatch . 
'})/'; + + preg_match_all($pattern, $textWithDisplacement, $matches); + + if (($matchesCount = count($matches[0])) > 0) { + for ($i = 0; $i < $matchesCount; $i++) { + $foundText = $matches[0][$i]; + $displacement = $matches[1][$i]; + $textWithoutDisplacement = trim($matches[2][$i]); + + $index = strpos($textContents, $foundText); + $plainText = substr($textContents, 0, $index); + + if ($plainText) { + $elements[] = [ + 'element_type' => 'Text', + 'text' => $plainText, + 'font_style' => $element['font_style'], + 'paragraph_style' => $element['paragraph_style'], + ]; + } + + $elements[] = [ + 'element_type' => 'TextRun', + 'paragraph_style' => $element['paragraph_style'], + 'elements' => [ + [ + 'element_type' => 'Text', + 'text' => $textWithoutDisplacement, + 'font_style' => $element['font_style'], + 'paragraph_style' => $element['paragraph_style'], + ], + + [ + 'element_type' => 'Bookmark', + 'name' => substr($displacement, 1, strlen($displacement) - 2), + ] + ], + ]; + + $textContents = substr($textContents, $index + strlen($foundText)); + } } } - return $elements; + if ($textContents) { + $elements[] = [ + 'element_type' => 'Text', + 'text' => $textContents, + 'font_style' => $element['font_style'], + 'paragraph_style' => $element['paragraph_style'], + ]; + } + + return [ + 'is_collection' => true, + 'elements' => $elements, + ]; } protected function createDocx() @@ -105,6 +306,42 @@ class RecreateDocument implements ShouldQueue $writer = new DocxWriter($this->storage, $path); $writer->execute($this->data); + + $this->path = $path; + } + + /** + * @throws \Exception + */ + protected function convertToOriginalDocumentFormat() + { + if ($this->data['document_format'] === 'docx') { + return; + } + + $convertor = new DocxConvertor($this->storage, $this->path); + + if ($this->data['document_format'] === 'pdf') { + $convertor->convertToPdfWithLibreOffice(); + } + + if ($this->data['document_format'] === 'odt') { + $convertor->convertToODT(); + } + + if 
($this->data['document_format'] === 'rtf') { + $convertor->convertToRTF(); + } + + if ($this->data['document_format'] === 'doc') { + $convertor->convertToDOC(); + } + + if ($this->data['document_format'] === 'txt') { + $convertor->convertToTXT(); + } + + $this->path = $convertor->getPath(); } protected function sendResponse($status) @@ -116,6 +353,7 @@ class RecreateDocument implements ShouldQueue 'id' => $this->id, 'content' => '', 'file_result_type' => 'document-recreated', + 'document_format' => $this->data['document_format'], 'status' => $status, ]]) ->useSecret($this->secret) diff --git a/app/Jobs/SendToCore.php b/app/Jobs/SendToCore.php index 7d7db65..434c77b 100644 --- a/app/Jobs/SendToCore.php +++ b/app/Jobs/SendToCore.php @@ -18,6 +18,7 @@ class SendToCore implements ShouldQueue protected $secret; protected $directoryPath; protected $fileResultType; + protected $documentFormat; protected $id; protected $hasFailed; @@ -31,17 +32,19 @@ class SendToCore implements ShouldQueue * * @param $id * @param string $fileResultType + * @param string $documentFormat * @param null $directoryPath * @param bool $hasFailed */ - public function __construct($id, $fileResultType, $directoryPath = null, $hasFailed = false) + public function __construct($id, string $fileResultType, string $documentFormat, $directoryPath = null, bool $hasFailed = false) { $this->url = env('WEBHOOK_CORE_URL') . '/webhooks'; $this->secret = env('WEBHOOK_CORE_SECRET'); $this->id = $id; - $this->directoryPath = $directoryPath; $this->fileResultType = $fileResultType; + $this->documentFormat = $documentFormat; + $this->directoryPath = $directoryPath; $this->hasFailed = $hasFailed; } @@ -105,6 +108,7 @@ class SendToCore implements ShouldQueue 'id' => $this->id, 'content' => $content, 'file_result_type' => $this->fileResultType, + 'document_format' => $this->documentFormat, 'status' => count($content) > 0 ? 
'success' : 'fail', ]]) ->useSecret($this->secret) diff --git a/tests/Feature/ProcessDocxDocumentTest.php b/tests/Feature/ProcessDocxDocumentTest.php index 1fee4f9..07b067c 100644 --- a/tests/Feature/ProcessDocxDocumentTest.php +++ b/tests/Feature/ProcessDocxDocumentTest.php @@ -17,7 +17,8 @@ class ProcessDocxDocumentTest extends TestCase // $reader = new DocxReader($storage, 'contracts/x.docx'); // $reader = new DocxReader($storage, 'contracts/y.docx'); - $reader = new DocxReader($storage, 'contracts/z.docx'); +// $reader = new DocxReader($storage, 'contracts/z.docx'); + $reader = new DocxReader($storage, 'contracts/with-bookmarks.docx'); $result = $reader->execute(); $writer = new DocxWriter($storage, 'contracts/test-write.docx'); @@ -27,7 +28,8 @@ class ProcessDocxDocumentTest extends TestCase /** @test */ public function it_recreates_original_document_from_json() { - $data = Storage::disk('local')->get('contracts/x.json'); +// $data = Storage::disk('local')->get('contracts/x.json'); + $data = Storage::disk('local')->get('contracts/a.json'); $data = json_decode($data, true); $recreateDocument = new RecreateDocument('test123', $data);