Repository for the Search and Displace ingest module, which takes ODF, DOCX and PDF documents and transforms them into Markdown (.md) for use in search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 

369 lines
12 KiB

<?php
namespace App\Jobs;
use App\Ingest\DocxConvertor;
use App\Ingest\DocxWriter;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use Spatie\WebhookServer\WebhookCall;
/**
 * Queued job that applies updated text to a document's element tree, writes the
 * result as a .docx through DocxWriter, converts it to the requested format
 * through DocxConvertor, and reports the outcome to the core service by webhook.
 */
class RecreateDocument implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /** @var mixed Document identifier; used in the output file name and echoed in the webhook payload. */
    protected $id;

    /** @var array Document payload; this job reads 'contents', 'elements' and 'document_format' from it. */
    protected $data;

    /** @var mixed Filesystem disk returned by Storage::disk('local'). */
    protected $storage;

    /** @var string Storage path of the most recently generated file; empty until createDocx() has run. */
    protected $path;

    /** @var string Webhook endpoint: WEBHOOK_CORE_URL with '/webhooks' appended. */
    protected $url;

    /** @var string|null Secret handed to the webhook call's useSecret(). */
    protected $secret;

    /**
     * Create a new job instance.
     *
     * @param mixed $id   Document identifier.
     * @param array $data Document payload (see $data).
     *
     * @return void
     */
    public function __construct($id, $data)
    {
        $this->id = $id;
        $this->data = $data;
        $this->storage = Storage::disk('local');
        $this->path = '';
        // NOTE(review): Laravel documents that env() returns null outside config files once the
        // configuration is cached; consider reading these through config() instead.
        $this->url = env('WEBHOOK_CORE_URL') . '/webhooks';
        $this->secret = env('WEBHOOK_CORE_SECRET');
    }

    /**
     * Execute the job.
     *
     * Runs the three build steps and reports 'success', or logs the failure and
     * reports 'fail'. \Throwable is caught (not only \Exception) so that engine
     * errors such as TypeError also produce a 'fail' notification.
     *
     * @return void
     */
    public function handle()
    {
        try {
            $this->setupData();
            $this->createDocx();
            $this->convertToOriginalDocumentFormat();
        } catch (\Throwable $exception) {
            Log::error('RecreateDocument@handle: ' . $exception->getMessage());
            Log::error($exception->getTraceAsString());
            $this->sendResponse('fail');

            return;
        }

        $this->sendResponse('success');
    }

    /**
     * Build a hash => text map from $data['contents'] and apply it to $data['elements'].
     *
     * Each entry of $data['contents']['elements'] selects the slice
     * [range_start, range_end] (inclusive) of $data['contents']['text'].
     *
     * @return void
     */
    protected function setupData()
    {
        $text = $this->data['contents']['text'];
        $textMapper = [];

        foreach ($this->data['contents']['elements'] as $element) {
            // range_end is inclusive, hence the + 1.
            // NOTE(review): substr() counts bytes; assumes the ranges are byte offsets - confirm.
            $length = $element['range_end'] - $element['range_start'] + 1;
            $textMapper[$element['hash']] = substr($text, $element['range_start'], $length);
        }

        $this->data['elements'] = $this->updateText($this->data['elements'], $textMapper)['elements'];
    }

    /**
     * Recursively replace the text of every element in a list with its mapped text.
     *
     * An element carrying a 'hash' key is passed to processText(). When that
     * yields several elements (the text contained tag pairs):
     *  - if the list belongs to a 'TextRun' parent, processing stops and the
     *    pieces are handed back to the caller ('has_stopped' => true);
     *  - otherwise the pieces replace the element in place.
     *
     * @param array      $elements      List of element arrays (expected to be keyed 0..n-1).
     * @param array      $textMapper    Map of element hash => current text.
     * @param array|null $parentElement Element that owns $elements, if any.
     *
     * @return array When 'has_stopped' is false: ['has_stopped', 'elements'].
     *               When true: 'elements' holds the children before the split one,
     *               'collection_to_append' the pieces of the split child and
     *               'unprocessed_elements' the children after it.
     */
    protected function updateText($elements, $textMapper, $parentElement = null)
    {
        $stoppedAtIndex = null;
        $collectionToAppend = [];

        // count() is re-evaluated on every pass on purpose: the list grows when a text is split.
        for ($index = 0; $index < count($elements); $index++) {
            $element = $elements[$index];

            if (array_key_exists('hash', $element)) {
                $result = $this->processText($element, $textMapper);

                if (!$result['is_collection']) {
                    $elements[$index] = $result['element'];
                } elseif ($parentElement && $parentElement['element_type'] === 'TextRun') {
                    // The pieces include a TextRun, so they are handed to the caller
                    // instead of being placed inside the parent TextRun.
                    $stoppedAtIndex = $index;
                    $collectionToAppend = $result['elements'];

                    break;
                } else {
                    // Swap the single element for its pieces. array_splice keeps the list
                    // contiguous and in order, unlike writing to out-of-range keys.
                    array_splice($elements, $index, 1, $result['elements']);
                    $index += count($result['elements']) - 1;
                }
            }

            if (
                array_key_exists('text_object', $element) &&
                array_key_exists('text', $element['text_object'])
            ) {
                $result = $this->processText($element['text_object'], $textMapper);

                // NOTE(review): when the text_object would split into several elements the
                // result is discarded and the text_object keeps its old text; splitting is
                // not implemented for text objects.
                if (!$result['is_collection']) {
                    $elements[$index]['text_object'] = $result['element'];
                }
            }

            if (isset($elements[$index]['elements'])) {
                $result = $this->updateText($elements[$index]['elements'], $textMapper, $element);
                $elements[$index]['elements'] = $result['elements'];

                if ($result['has_stopped']) {
                    // A child of this TextRun was split. The TextRun keeps the children before
                    // the split; the pieces and the not-yet-visited children are inserted right
                    // after it as siblings.
                    // NOTE(review): assumed intent - the earlier code read a 'text_run_element'
                    // key that is never returned and placed the leftover children one slot too
                    // early. Confirm this layout is what DocxWriter expects.
                    $newSiblings = array_merge(
                        $result['collection_to_append'],
                        $result['unprocessed_elements']
                    );
                    array_splice($elements, $index + 1, 0, $newSiblings);

                    // Skip the generated pieces; the leftover children are visited next.
                    $index += count($result['collection_to_append']);
                }
            }
        }

        if ($stoppedAtIndex === null) {
            return [
                'has_stopped' => false,
                'elements' => $elements,
            ];
        }

        return [
            'has_stopped' => true,
            'elements' => array_slice($elements, 0, $stoppedAtIndex),
            'collection_to_append' => $collectionToAppend,
            'unprocessed_elements' => array_slice($elements, $stoppedAtIndex + 1),
        ];
    }

    /**
     * Apply the mapped text to one element, splitting it around `{name}text{/name}` pairs.
     *
     * - Text unchanged: the element is returned as is.
     * - Text changed but without any `{name}` opening tag: only 'text' is replaced.
     * - Otherwise the text is cut into a list: plain stretches become 'Text'
     *   elements, and every `{name}text{/name}` pair becomes a 'TextRun' holding
     *   the trimmed inner text plus a 'Bookmark' named after the tag. Pairs are
     *   taken in the order they occur in the text; the inner text may not contain
     *   braces or slashes. An opening tag without a matching pair stays plain text.
     *
     * @param array $element    Element with 'hash' and 'text'; 'font_style' and
     *                          'paragraph_style' are read when the text is split.
     * @param array $textMapper Map of element hash => current text.
     *
     * @return array ['is_collection' => false, 'element' => array]
     *               or ['is_collection' => true, 'elements' => array[]].
     */
    protected function processText($element, $textMapper)
    {
        $text = $textMapper[$element['hash']];

        if ($element['text'] === $text) {
            return [
                'is_collection' => false,
                'element' => $element,
            ];
        }

        // No opening tag at all: plain text replacement.
        if (preg_match('/{[^\/][^{}]*}/', $text) !== 1) {
            $element['text'] = $text;

            return [
                'is_collection' => false,
                'element' => $element,
            ];
        }

        // One pass over the text with a back-reference for the closing tag. The tag name is
        // never interpolated into a pattern, so names containing regex metacharacters are safe.
        $pairPattern = '/\{([^\/][^{}]*)\}([^{}\/]+)\{\/\1\}/';
        $found = preg_match_all($pairPattern, $text, $pairs, PREG_SET_ORDER | PREG_OFFSET_CAPTURE);
        if ($found === false) {
            $pairs = [];
        }

        $fontStyle = $element['font_style'];
        $paragraphStyle = $element['paragraph_style'];
        $elements = [];
        $cursor = 0; // Byte offset of the first character not yet emitted.

        foreach ($pairs as $pair) {
            $foundText = $pair[0][0];
            $offset = $pair[0][1];
            $tagName = $pair[1][0];
            $innerText = trim($pair[2][0]);

            // Compared against '' rather than by truthiness so that a piece equal to "0" is kept.
            $plainText = (string) substr($text, $cursor, $offset - $cursor);
            if ($plainText !== '') {
                $elements[] = $this->buildTextElement($plainText, $fontStyle, $paragraphStyle);
            }

            $elements[] = [
                'element_type' => 'TextRun',
                'paragraph_style' => $paragraphStyle,
                'elements' => [
                    $this->buildTextElement($innerText, $fontStyle, $paragraphStyle),
                    [
                        'element_type' => 'Bookmark',
                        'name' => $tagName,
                    ],
                ],
            ];

            $cursor = $offset + strlen($foundText);
        }

        $trailingText = (string) substr($text, $cursor);
        if ($trailingText !== '') {
            $elements[] = $this->buildTextElement($trailingText, $fontStyle, $paragraphStyle);
        }

        return [
            'is_collection' => true,
            'elements' => $elements,
        ];
    }

    /**
     * Build a 'Text' element array with the given text and styles.
     *
     * @param string $text
     * @param mixed  $fontStyle
     * @param mixed  $paragraphStyle
     *
     * @return array
     */
    private function buildTextElement($text, $fontStyle, $paragraphStyle)
    {
        return [
            'element_type' => 'Text',
            'text' => $text,
            'font_style' => $fontStyle,
            'paragraph_style' => $paragraphStyle,
        ];
    }

    /**
     * Write $data to 'contracts/{id}-document.docx' on the storage disk and remember the path.
     *
     * @return void
     */
    protected function createDocx()
    {
        $path = 'contracts/' . $this->id . '-document.docx';

        $writer = new DocxWriter($this->storage, $path);
        $writer->execute($this->data);

        $this->path = $path;
    }

    /**
     * Convert the generated .docx to $data['document_format'] and store the new path.
     *
     * 'docx' needs no conversion. An unrecognised format is not converted either;
     * a warning is logged and the path reported by the convertor is used.
     *
     * @return void
     *
     * @throws \Exception
     */
    protected function convertToOriginalDocumentFormat()
    {
        $format = $this->data['document_format'];

        if ($format === 'docx') {
            return;
        }

        $convertor = new DocxConvertor($this->storage, $this->path);

        if ($format === 'pdf') {
            $convertor->convertToPdfWithLibreOffice();
        } elseif ($format === 'odt') {
            $convertor->convertToODT();
        } elseif ($format === 'rtf') {
            $convertor->convertToRTF();
        } elseif ($format === 'doc') {
            $convertor->convertToDOC();
        } elseif ($format === 'txt') {
            $convertor->convertToTXT();
        } else {
            Log::warning(
                'RecreateDocument@convertToOriginalDocumentFormat: unsupported document format, no conversion performed.'
            );
        }

        $this->path = $convertor->getPath();
    }

    /**
     * Dispatch the 'document-recreated' webhook with the given status.
     *
     * @param string $status Status value placed in the payload ('success' or 'fail' in this class).
     *
     * @return bool True when the call was dispatched, false when dispatching threw an exception.
     */
    protected function sendResponse($status)
    {
        try {
            WebhookCall::create()
                ->url($this->url)
                ->payload(['data' => [
                    'id' => $this->id,
                    'content' => '',
                    'file_result_type' => 'document-recreated',
                    'document_format' => $this->data['document_format'],
                    'status' => $status,
                ]])
                ->useSecret($this->secret)
                ->dispatch();

            return true;
        } catch (\Exception $exception) {
            Log::error('RecreateDocument@sendResponse: ' . $exception->getMessage());

            return false;
        }
    }
}