Browse Source

S&D on original document recreates the original document format. Fix PDF issue.

dev
Orzu Ionut 3 years ago
parent
commit
59edf696ad
  1. 4
      app/Ingest/DataJsonConvertor.php
  2. 14
      app/Ingest/Office.php
  3. 211
      app/Jobs/RecreateDocument.php

4
app/Ingest/DataJsonConvertor.php

@ -67,6 +67,10 @@ class DataJsonConvertor extends AbstractConvertor
$this->setPath(str_replace($this->type, 'docx', $this->path));
if ( ! $this->storage->exists($this->path)) {
throw new \Exception('Failed when converting from ' . $this->type . ' to DOCX for file: ' . $this->path . '. The DOCX file doesnt exist.');
}
$this->type = 'docx';
}
}

14
app/Ingest/Office.php

@ -31,8 +31,15 @@ class Office
protected function runConversion($convertTo, $filePath, $directoryPath)
{
$pdfAdditionalOption = '';
if ($this->fileIsPDF($filePath)) {
$pdfAdditionalOption = "--infilter='writer_pdf_import'";
}
$process = new Process([
'soffice',
$pdfAdditionalOption,
'--accept="pipe,name=soffice-pipe-' . $this->id . ';urp;StarOffice.ServiceMananger"',
'-env:UserInstallation=file:///tmp/' . $this->directory,
'--headless',
@ -50,6 +57,13 @@ class Office
return $process->isSuccessful();
}
/**
 * Tell whether a file path ends with the ".pdf" extension.
 *
 * The comparison ignores letter case, so "scan.PDF" and "scan.Pdf" are
 * recognised just like "scan.pdf".
 *
 * @param string $filePath Path (or bare file name) to inspect.
 *
 * @return bool True when the path ends in ".pdf" in any letter case, false otherwise.
 */
protected function fileIsPDF($filePath)
{
    $extension = '.pdf';

    // Take as many trailing characters as the extension is long. The cast guards
    // against substr() returning false for an empty path on PHP versions before 8.
    $suffix = (string) substr($filePath, -strlen($extension));

    return strtolower($suffix) === $extension;
}
protected function makeTemporaryDirectory()
{
(new Process(['mkdir /tmp/' . $this->directory]))->run();

211
app/Jobs/RecreateDocument.php

@ -77,226 +77,29 @@ class RecreateDocument implements ShouldQueue
);
}
$this->data['elements'] = $this->updateText($this->data['elements'], $textMapper)['elements'];
$this->data['elements'] = $this->updateText($this->data['elements'], $textMapper);
}
protected function updateText($elements, $textMapper, $parentElement = null)
protected function updateText($elements, $textMapper)
{
$stoppedAtIndex = null;
$collectionToAppend = [];
for ($index = 0; $index < count($elements); $index++) {
$element = $elements[$index];
foreach ($elements as $index => $element) {
if (array_key_exists('hash', $element)) {
$result = $this->processText($element, $textMapper);
if ( ! $result['is_collection']) {
$elements[$index] = $result['element'];
} else {
if ($parentElement && $parentElement['element_type'] === 'TextRun') {
$stoppedAtIndex = $index;
$collectionToAppend = $result['elements'];
break;
} else {
// We have a collection of elements instead of one, so we must push its siblings to make room.
$numberOfPlacesToMove = count($result['elements']) - 1;
// Move siblings to make room for collection.
for ($i = count($elements) - 1; $i > $index; $i--) {
$elements[$i + $numberOfPlacesToMove] = $elements[$i];
}
foreach ($result['elements'] as $i => $collectionElement) {
$elements[$index + $i] = $collectionElement;
}
$index += $numberOfPlacesToMove;
}
}
$elements[$index]['text'] = $textMapper[$element['hash']];
}
if (
array_key_exists('text_object', $element) &&
array_key_exists('text', $element['text_object'])
) {
// $elements[$index]['text_object']['text'] = $textMapper[$element['text_object']['hash']];
$result = $this->processText($element['text_object'], $textMapper);
if ( ! $result['is_collection']) {
$elements[$index]['text_object'] = $result['element'];
} else {
// if ($parentElement && $parentElement['element_type'] === 'TextRun') {
// $stoppedAtIndex = $index;
// $collectionToAppend = $result['elements'];
//
// break;
// } else {
// // We have a collection of elements instead of one, so we must push its siblings to make room.
// $numberOfPlacesToMove = count($result['elements']) - 1;
//
// // Move siblings to make room for collection.
// for ($i = count($elements) - 1; $i > $index; $i--) {
// $elements[$i + $numberOfPlacesToMove] = $elements[$i];
// }
//
// foreach ($result['elements'] as $i => $element) {
// $elements[$index + $i] = $element;
// }
//
// $index += $numberOfPlacesToMove;
// }
}
$elements[$index]['text_object']['text'] = $textMapper[$element['text_object']['hash']];
}
if (isset($elements[$index]['elements'])) {
$result = $this->updateText($elements[$index]['elements'], $textMapper, $element);
$elements[$index]['elements'] = $result['elements'];
if ($result['has_stopped']) {
// One of the children has become a 'TextRun' and the current $element is also a 'TextRun',
// so the child will become a sibling of the $element.
// Start from the end of the array and move elements.
$numberOfPlacesToMove = count($result['collection_to_append']) + count($result['unprocessed_elements']);
for ($i = count($elements) - 1; $i > $index; $i--) {
$elements[$numberOfPlacesToMove + $i] = $elements[$i];
}
foreach ($result['collection_to_append'] as $i => $collectionElement) {
$elements[$index + 1 + $i] = $collectionElement;
}
// 4 5 6
$elements[$index + 1] = $result['text_run_element'];
foreach ($result['unprocessed_elements'] as $i => $unprocessedElement) {
$elements[$index + count($result['collection_to_append']) + $i] = $unprocessedElement;
}
// Skip 'text_run_element' processing.
$index += 1;
}
}
}
if ($stoppedAtIndex === null) {
return [
'has_stopped' => false,
'elements' => $elements,
];
}
$remainingElements = array_splice($elements, 0, $stoppedAtIndex);
$unprocessedElements = array_splice($elements, 1);
return [
'has_stopped' => true,
'elements' => $remainingElements,
'collection_to_append' => $collectionToAppend,
'unprocessed_elements' => $unprocessedElements,
];
}
protected function processText($element, $textMapper)
{
$text = $textMapper[$element['hash']];
if ($element['text'] === $text) {
return [
'is_collection' => false,
'element' => $element,
];
}
$textWithDisplacement = $text;
preg_match_all('/{[^\/][^{}]*}/', $textWithDisplacement, $accoladeMatches);
// Nothing found.
if (count($accoladeMatches[0]) === 0) {
$element['text'] = $textWithDisplacement;
return [
'is_collection' => false,
'element' => $element,
];
}
// In order to add bookmarks we can create a list of parent elements and bookmark elements which will be appended
// at the end of the elements list, so they will be created in the next steps.
$elements = [];
$textContents = $textWithDisplacement;
$accoladeMatches = array_unique($accoladeMatches[0]);
foreach ($accoladeMatches as $accoladeMatch) {
$textMatch = substr($accoladeMatch, 1, strlen($accoladeMatch) - 2);
$pattern = '/(' . $accoladeMatch . ')([^{}\/]+)({\/' . $textMatch . '})/';
preg_match_all($pattern, $textWithDisplacement, $matches);
if (($matchesCount = count($matches[0])) > 0) {
for ($i = 0; $i < $matchesCount; $i++) {
$foundText = $matches[0][$i];
$displacement = $matches[1][$i];
$textWithoutDisplacement = trim($matches[2][$i]);
$index = strpos($textContents, $foundText);
$plainText = substr($textContents, 0, $index);
if ($plainText) {
$elements[] = [
'element_type' => 'Text',
'text' => $plainText,
'font_style' => $element['font_style'],
'paragraph_style' => $element['paragraph_style'],
];
}
$elements[] = [
'element_type' => 'TextRun',
'paragraph_style' => $element['paragraph_style'],
'elements' => [
[
'element_type' => 'Text',
'text' => $textWithoutDisplacement,
'font_style' => $element['font_style'],
'paragraph_style' => $element['paragraph_style'],
],
[
'element_type' => 'Bookmark',
'name' => substr($displacement, 1, strlen($displacement) - 2),
]
],
];
$textContents = substr($textContents, $index + strlen($foundText));
}
$elements[$index]['elements'] = $this->updateText($elements[$index]['elements'], $textMapper);
}
}
if ($textContents) {
$elements[] = [
'element_type' => 'Text',
'text' => $textContents,
'font_style' => $element['font_style'],
'paragraph_style' => $element['paragraph_style'],
];
}
return [
'is_collection' => true,
'elements' => $elements,
];
return $elements;
}
protected function createDocx()

Loading…
Cancel
Save