Browse Source

WIP: converting tags to Bookmark elements in RecreateDocument (needs verifying)

hidden_tags_with_bookmarks
Orzu Ionut 3 years ago
parent
commit
37f4209d2c
  1. 1
      app/Http/Controllers/IngestController.php
  2. 2
      app/Http/Controllers/RecreateDocumentController.php
  3. 5
      app/Ingest/AbstractConvertor.php
  4. 61
      app/Ingest/DocxConvertor.php
  5. 20
      app/Ingest/DocxReader.php
  6. 4
      app/Jobs/IngestDocuments.php
  7. 256
      app/Jobs/RecreateDocument.php
  8. 8
      app/Jobs/SendToCore.php
  9. 6
      tests/Feature/ProcessDocxDocumentTest.php

1
app/Http/Controllers/IngestController.php

@ -11,6 +11,7 @@ class IngestController extends Controller
request()->validate([
'id' => 'required',
'file_result_type' => 'required|in:md,original',
'mime_type' => 'required',
'document' => 'required|file',
]);

2
app/Http/Controllers/RecreateDocumentController.php

@ -46,7 +46,7 @@ class RecreateDocumentController extends Controller
return '';
}
return response()->download($storage->path($fullPath), 'document.docx', [])
return response()->download($storage->path($fullPath), $filePath, [])
->deleteFileAfterSend(true);
}
}

5
app/Ingest/AbstractConvertor.php

@ -17,6 +17,11 @@ abstract class AbstractConvertor
abstract public function execute();
/**
 * Return the path currently held in $this->path.
 *
 * @return mixed The stored path value, exactly as last assigned.
 */
public function getPath()
{
return $this->path;
}
public function setPath($path)
{
$this->path = $path;

61
app/Ingest/DocxConvertor.php

@ -15,13 +15,11 @@ class DocxConvertor extends AbstractConvertor
{
$this->convertToPdfWithLibreOffice();
$pdfFilePath = "$this->directoryPath/document.pdf";
if ( ! $this->storage->exists($pdfFilePath)) {
throw new \Exception('Failed to convert to PDF: ' . $pdfFilePath);
if ( ! $this->storage->exists($this->path)) {
throw new \Exception('Failed to convert to PDF: ' . $this->path);
}
$convertor = new PDFConvertor($this->storage, $pdfFilePath);
$convertor = new PDFConvertor($this->storage, $this->path);
$convertor->execute();
}
@ -53,20 +51,67 @@ class DocxConvertor extends AbstractConvertor
*
* @throws \Exception
*/
protected function convertToPdfWithLibreOffice()
public function convertToPdfWithLibreOffice()
{
// Thin wrapper: delegates to convertToFormat() with 'pdf' as the target format.
$this->convertToFormat('pdf');
}
/**
 * Convert the current document to ODT by delegating to convertToFormat('odt').
 *
 * @throws \Exception
 */
public function convertToODT()
{
$this->convertToFormat('odt');
}
/**
 * Convert the current document to RTF by delegating to convertToFormat('rtf').
 *
 * @throws \Exception
 */
public function convertToRTF()
{
$this->convertToFormat('rtf');
}
/**
 * Convert the current document to DOC by delegating to convertToFormat('doc').
 *
 * @throws \Exception
 */
public function convertToDOC()
{
$this->convertToFormat('doc');
}
/**
 * Convert the current document to TXT by delegating to convertToFormat('txt').
 *
 * @throws \Exception
 */
public function convertToTXT()
{
$this->convertToFormat('txt');
}
/**
 * Run the Office converter on the file at $this->path, writing into $this->directoryPath,
 * then call deleteOriginalDocument() and repoint $this->path at
 * "<directoryPath>/document.<format>".
 *
 * NOTE(review): the new path assumes the converter names its output "document.<format>";
 * confirm against the Office wrapper and the input file name.
 *
 * @param string $format Target format handed to the converter; callers in this class pass
 *                       'pdf', 'odt', 'rtf', 'doc' or 'txt'.
 *
 * @throws \Exception When the converter run returns a falsy result.
 */
protected function convertToFormat($format)
{
$office = new Office();
$success = $office->run(
'pdf',
$format,
$this->storage->path($this->path),
$this->storage->path($this->directoryPath)
);
if (! $success) {
throw new \Exception('Failed when converting from DOCX to PDF for file: ' . $this->path);
throw new \Exception('Failed when converting from DOCX to ' . strtoupper($format) . ' for file: ' . $this->path);
}
// The source file is removed before $this->path is switched to the converted output.
$this->deleteOriginalDocument();
$this->path = "$this->directoryPath/document.$format";
}
}

20
app/Ingest/DocxReader.php

@ -49,15 +49,21 @@ class DocxReader extends AbstractConvertor
// Converting to HTML and then back to DOCX loses some content and styles (lost when converted to HTML).
$data = [];
// $extension = pathinfo($this->path)['extension'];
// $readerName = array_key_exists($extension, $this->readersMapper)
// ? $this->readersMapper[$extension]
// : 'Word2007';
// $handler = IOFactory::load($this->storage->path($this->path), $readerName);
$handler = IOFactory::load($this->storage->path($this->path));
/**
* @ISSUE
* At the time of writing (8 Sep 2021), phpword does not support reading bookmarks from a
* DOCX file. To add that support we can, for example, add the following lines of
* code to the AbstractPart.php file, before the Text and TextRun handling case.
*
* $els = $xmlReader->getElements('w:bookmarkStart', $domNode);
* if ($els && $els->count() > 0) {
* $parent->addBookmark($els[0]->getAttribute('w:name'));
* }
*
*/
$data['default_font_name'] = $handler->getDefaultFontName();
$data['default_font_size'] = $handler->getDefaultFontSize();
$data['styles'] = $this->handleStyles(Style::getStyles());

4
app/Jobs/IngestDocuments.php

@ -76,7 +76,7 @@ class IngestDocuments implements ShouldQueue
$directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);
if ($this->fromRequest) {
SendToCore::dispatch($this->id, $this->fileResultType, $directoryPath);
SendToCore::dispatch($this->id, $this->fileResultType, $this->type, $directoryPath);
return;
}
@ -132,7 +132,7 @@ class IngestDocuments implements ShouldQueue
$directoryPath = pathinfo($this->path, PATHINFO_DIRNAME);
if ($this->fromRequest) {
SendToCore::dispatch($this->id, $this->fileResultType, $directoryPath, true);
SendToCore::dispatch($this->id, $this->fileResultType, $this->type, $directoryPath, true);
return;
}

256
app/Jobs/RecreateDocument.php

@ -2,6 +2,7 @@
namespace App\Jobs;
use App\Ingest\DocxConvertor;
use App\Ingest\DocxWriter;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
@ -19,6 +20,7 @@ class RecreateDocument implements ShouldQueue
protected $id;
protected $data;
protected $storage;
protected $path;
protected $url;
protected $secret;
@ -33,6 +35,8 @@ class RecreateDocument implements ShouldQueue
$this->data = $data;
$this->storage = Storage::disk('local');
$this->path = '';
$this->url = env('WEBHOOK_CORE_URL') . '/webhooks';
$this->secret = env('WEBHOOK_CORE_SECRET');
}
@ -49,7 +53,7 @@ class RecreateDocument implements ShouldQueue
$this->createDocx();
// Convert to original format, either PDF, ODT, etc.
$this->convertToOriginalDocumentFormat();
$this->sendResponse('success');
} catch (\Exception $exception) {
@ -66,36 +70,233 @@ class RecreateDocument implements ShouldQueue
$textMapper = [];
foreach ($this->data['contents']['elements'] as $element) {
$textMapper[$element['hash']] = substr(
$textMapper[$element['hash']] =substr(
$text,
$element['range_start'],
$element['range_end'] - $element['range_start'] + 1
);
}
$this->data['elements'] = $this->updateText($this->data['elements'], $textMapper);
$this->data['elements'] = $this->updateText($this->data['elements'], $textMapper)['elements'];
}
/**
 * Recursively walk a list of elements and refresh their text through processText().
 *
 * For every element that has a 'hash' key, processText() either returns a single element
 * (which replaces the original in place) or a collection of elements. A collection is
 * spliced into the list in place of the original element, unless $parentElement is a
 * 'TextRun': in that case the walk stops and the collection, together with the siblings
 * that were not visited yet, is handed back to the caller so it can place them after the
 * parent.
 *
 * @param array      $elements      Sibling elements to process.
 * @param array      $textMapper    Map of element hash => current text.
 * @param array|null $parentElement Element that owns $elements, or null at the top level.
 *
 * @return array Always contains 'has_stopped' (bool) and 'elements' (array). When
 *               'has_stopped' is true it also contains 'collection_to_append' and
 *               'unprocessed_elements'.
 */
protected function updateText($elements, $textMapper)
protected function updateText($elements, $textMapper, $parentElement = null)
{
foreach ($elements as $index => $element) {
$stoppedAtIndex = null;
$collectionToAppend = [];
for ($index = 0; $index < count($elements); $index++) {
$element = $elements[$index];
if (array_key_exists('hash', $element)) {
$elements[$index]['text'] = $textMapper[$element['hash']];
$result = $this->processText($element, $textMapper);
if ( ! $result['is_collection']) {
$elements[$index] = $result['element'];
} else {
if ($parentElement && $parentElement['element_type'] === 'TextRun') {
// The parent is a TextRun: stop here and let the caller place the collection.
$stoppedAtIndex = $index;
$collectionToAppend = $result['elements'];
break;
} else {
// We have a collection of elements instead of one, so we must push its siblings to make room.
$numberOfPlacesToMove = count($result['elements']) - 1;
// Move siblings to make room for collection.
// NOTE(review): keys are created highest-first here; PHP arrays iterate in insertion
// order, so a later foreach/array_splice may not see numeric order - confirm or ksort().
for ($i = count($elements) - 1; $i > $index; $i--) {
$elements[$i + $numberOfPlacesToMove] = $elements[$i];
}
foreach ($result['elements'] as $i => $collectionElement) {
$elements[$index + $i] = $collectionElement;
}
// Jump past the elements that were just inserted.
$index += $numberOfPlacesToMove;
}
}
}
if (
array_key_exists('text_object', $element) &&
array_key_exists('text', $element['text_object'])
) {
$elements[$index]['text_object']['text'] = $textMapper[$element['text_object']['hash']];
// $elements[$index]['text_object']['text'] = $textMapper[$element['text_object']['hash']];
$result = $this->processText($element['text_object'], $textMapper);
if ( ! $result['is_collection']) {
$elements[$index]['text_object'] = $result['element'];
} else {
// NOTE(review): a collection result for a text_object is currently ignored; the
// handling below is commented out.
// if ($parentElement && $parentElement['element_type'] === 'TextRun') {
// $stoppedAtIndex = $index;
// $collectionToAppend = $result['elements'];
//
// break;
// } else {
// // We have a collection of elements instead of one, so we must push its siblings to make room.
// $numberOfPlacesToMove = count($result['elements']) - 1;
//
// // Move siblings to make room for collection.
// for ($i = count($elements) - 1; $i > $index; $i--) {
// $elements[$i + $numberOfPlacesToMove] = $elements[$i];
// }
//
// foreach ($result['elements'] as $i => $element) {
// $elements[$index + $i] = $element;
// }
//
// $index += $numberOfPlacesToMove;
// }
}
}
if (isset($elements[$index]['elements'])) {
$elements[$index]['elements'] = $this->updateText($elements[$index]['elements'], $textMapper);
$result = $this->updateText($elements[$index]['elements'], $textMapper, $element);
$elements[$index]['elements'] = $result['elements'];
if ($result['has_stopped']) {
// One of the children has become a 'TextRun' and the current $element is also a 'TextRun',
// so the child will become a sibling of the $element.
// Start from the end of the array and move elements.
$numberOfPlacesToMove = count($result['collection_to_append']) + count($result['unprocessed_elements']);
for ($i = count($elements) - 1; $i > $index; $i--) {
$elements[$numberOfPlacesToMove + $i] = $elements[$i];
}
foreach ($result['collection_to_append'] as $i => $collectionElement) {
$elements[$index + 1 + $i] = $collectionElement;
}
// 4 5 6
// NOTE(review): neither return statement of updateText() sets a 'text_run_element' key, so this
// reads an undefined key and overwrites the first collection slot written just above - confirm.
$elements[$index + 1] = $result['text_run_element'];
// NOTE(review): for $i = 0 this targets $index + count(collection), which is the slot of the last
// collection element written above; an extra "+ 1" offset looks intended - confirm.
foreach ($result['unprocessed_elements'] as $i => $unprocessedElement) {
$elements[$index + count($result['collection_to_append']) + $i] = $unprocessedElement;
}
// Skip 'text_run_element' processing.
$index += 1;
}
}
}
if ($stoppedAtIndex === null) {
return [
'has_stopped' => false,
'elements' => $elements,
];
}
// Elements before the stop index stay with the parent; array_splice() removes them from $elements.
$remainingElements = array_splice($elements, 0, $stoppedAtIndex);
// $elements now starts with the element that triggered the stop; everything after it was never visited.
$unprocessedElements = array_splice($elements, 1);
return [
'has_stopped' => true,
'elements' => $remainingElements,
'collection_to_append' => $collectionToAppend,
'unprocessed_elements' => $unprocessedElements,
];
}
/**
 * Rebuild one text element from its current text, turning every `{name}body{/name}`
 * tag pair into a TextRun that holds the body text followed by a Bookmark called `name`.
 *
 * The text is scanned once, left to right, so the produced elements always follow the
 * order of the text, even when different tag names interleave. Tag names are matched
 * literally through a back-reference rather than being spliced into a pattern.
 *
 * @param array $element    Element with 'hash' and 'text' keys; 'font_style' and
 *                          'paragraph_style' are copied onto every element created here.
 * @param array $textMapper Map of element hash => current text.
 *
 * @return array ['is_collection' => false, 'element' => array] when the element stays a
 *               single element, or ['is_collection' => true, 'elements' => array[]] when
 *               the text contains at least one opening-style `{...}` tag.
 */
protected function processText($element, $textMapper)
{
    $text = $textMapper[$element['hash']];

    // Unchanged text: hand the element back untouched.
    if ($element['text'] === $text) {
        return [
            'is_collection' => false,
            'element' => $element,
        ];
    }

    // No opening-style tag at all: only the text changed.
    if (preg_match('/{[^\/][^{}]*}/', $text) !== 1) {
        $element['text'] = $text;

        return [
            'is_collection' => false,
            'element' => $element,
        ];
    }

    // Builds a plain Text element that inherits the styles of the source element.
    $makeText = static function ($value) use ($element) {
        return [
            'element_type' => 'Text',
            'text' => $value,
            'font_style' => $element['font_style'],
            'paragraph_style' => $element['paragraph_style'],
        ];
    };

    // {name}body{/name}: the name may not start with '/', the body holds no braces or
    // slashes, and \1 forces the closing tag to repeat the opening name verbatim.
    preg_match_all(
        '/\{([^\/{}][^{}]*)\}([^{}\/]+)\{\/\1\}/',
        $text,
        $pairs,
        PREG_SET_ORDER | PREG_OFFSET_CAPTURE
    );

    $elements = [];
    $cursor = 0;

    foreach ($pairs as $pair) {
        $matchStart = $pair[0][1];

        // Plain text between the previous tag pair and this one; compared by offset so
        // that a literal "0" segment is kept.
        if ($matchStart > $cursor) {
            $elements[] = $makeText(substr($text, $cursor, $matchStart - $cursor));
        }

        $elements[] = [
            'element_type' => 'TextRun',
            'paragraph_style' => $element['paragraph_style'],
            'elements' => [
                $makeText(trim($pair[2][0])),
                [
                    'element_type' => 'Bookmark',
                    'name' => $pair[1][0],
                ],
            ],
        ];

        $cursor = $matchStart + strlen($pair[0][0]);
    }

    // Whatever follows the last tag pair, or the whole text when no pair was complete.
    if ($cursor < strlen($text)) {
        $elements[] = $makeText(substr($text, $cursor));
    }

    return [
        'is_collection' => true,
        'elements' => $elements,
    ];
}
protected function createDocx()
@ -105,6 +306,42 @@ class RecreateDocument implements ShouldQueue
$writer = new DocxWriter($this->storage, $path);
$writer->execute($this->data);
$this->path = $path;
}
/**
 * Convert the generated document into the format named by $this->data['document_format'].
 *
 * Nothing happens when the requested format is 'docx'. For 'pdf', 'odt', 'rtf', 'doc'
 * and 'txt' the matching DocxConvertor method is invoked; any other value runs no
 * conversion. In every non-'docx' case $this->path is then refreshed from the convertor.
 *
 * @throws \Exception
 */
protected function convertToOriginalDocumentFormat()
{
    $requestedFormat = $this->data['document_format'];

    if ($requestedFormat === 'docx') {
        return;
    }

    // Requested format => DocxConvertor method that produces it.
    $conversionMethods = [
        'pdf' => 'convertToPdfWithLibreOffice',
        'odt' => 'convertToODT',
        'rtf' => 'convertToRTF',
        'doc' => 'convertToDOC',
        'txt' => 'convertToTXT',
    ];

    $docxConvertor = new DocxConvertor($this->storage, $this->path);

    // is_string() keeps the lookup as strict as a === comparison against each key.
    if (is_string($requestedFormat) && isset($conversionMethods[$requestedFormat])) {
        $docxConvertor->{$conversionMethods[$requestedFormat]}();
    }

    $this->path = $docxConvertor->getPath();
}
protected function sendResponse($status)
@ -116,6 +353,7 @@ class RecreateDocument implements ShouldQueue
'id' => $this->id,
'content' => '',
'file_result_type' => 'document-recreated',
'document_format' => $this->data['document_format'],
'status' => $status,
]])
->useSecret($this->secret)

8
app/Jobs/SendToCore.php

@ -18,6 +18,7 @@ class SendToCore implements ShouldQueue
protected $secret;
protected $directoryPath;
protected $fileResultType;
protected $documentFormat;
protected $id;
protected $hasFailed;
@ -31,17 +32,19 @@ class SendToCore implements ShouldQueue
*
* @param $id
* @param string $fileResultType
* @param string $documentFormat
* @param null $directoryPath
* @param bool $hasFailed
*/
public function __construct($id, $fileResultType, $directoryPath = null, $hasFailed = false)
public function __construct($id, string $fileResultType, string $documentFormat, $directoryPath = null, bool $hasFailed = false)
{
$this->url = env('WEBHOOK_CORE_URL') . '/webhooks';
$this->secret = env('WEBHOOK_CORE_SECRET');
$this->id = $id;
$this->directoryPath = $directoryPath;
$this->fileResultType = $fileResultType;
$this->documentFormat = $documentFormat;
$this->directoryPath = $directoryPath;
$this->hasFailed = $hasFailed;
}
@ -105,6 +108,7 @@ class SendToCore implements ShouldQueue
'id' => $this->id,
'content' => $content,
'file_result_type' => $this->fileResultType,
'document_format' => $this->documentFormat,
'status' => count($content) > 0 ? 'success' : 'fail',
]])
->useSecret($this->secret)

6
tests/Feature/ProcessDocxDocumentTest.php

@ -17,7 +17,8 @@ class ProcessDocxDocumentTest extends TestCase
// $reader = new DocxReader($storage, 'contracts/x.docx');
// $reader = new DocxReader($storage, 'contracts/y.docx');
$reader = new DocxReader($storage, 'contracts/z.docx');
// $reader = new DocxReader($storage, 'contracts/z.docx');
$reader = new DocxReader($storage, 'contracts/with-bookmarks.docx');
$result = $reader->execute();
$writer = new DocxWriter($storage, 'contracts/test-write.docx');
@ -27,7 +28,8 @@ class ProcessDocxDocumentTest extends TestCase
/** @test */
public function it_recreates_original_document_from_json()
{
$data = Storage::disk('local')->get('contracts/x.json');
// $data = Storage::disk('local')->get('contracts/x.json');
$data = Storage::disk('local')->get('contracts/a.json');
$data = json_decode($data, true);
$recreateDocument = new RecreateDocument('test123', $data);

Loading…
Cancel
Save