From d8ac0c8433bdeb36b177444dcc192d14ed7c0ad9 Mon Sep 17 00:00:00 2001 From: Radu Liviu Carjan Date: Mon, 31 Oct 2022 17:46:01 +0200 Subject: [PATCH] Fixes to the search & displace functionality --- .env.example | 3 + app/SearchDisplace/Convertor/Convertor.php | 22 ++---- app/SearchDisplace/SearchAndDisplaceXML.php | 79 ++++++++------------- 3 files changed, 36 insertions(+), 68 deletions(-) diff --git a/.env.example b/.env.example index 371aac6..460f815 100644 --- a/.env.example +++ b/.env.example @@ -23,3 +23,6 @@ SD_DUCKLING_URL=http://0.0.0.0:8000/parse SD_INGEST_URL=http://localhost/ingest WEBHOOK_CLIENT_SECRET=A5qayc2O53Vslw + +# The config path is relative to the storage/app directory +LIBREOFFICE_CONFIG_PATH=tmp/libreoffice \ No newline at end of file diff --git a/app/SearchDisplace/Convertor/Convertor.php b/app/SearchDisplace/Convertor/Convertor.php index f6501f6..863bd13 100644 --- a/app/SearchDisplace/Convertor/Convertor.php +++ b/app/SearchDisplace/Convertor/Convertor.php @@ -5,6 +5,7 @@ namespace App\SearchDisplace\Convertor; use Symfony\Component\Process\Process; use Symfony\Component\Process\Exception\ProcessFailedException; use Illuminate\Support\Facades\Log; +use Illuminate\Support\Facades\Storage; /** * Convert documents from formats supported by Libre Office @@ -31,7 +32,7 @@ class Convertor { } $env = [ - 'HOME' => storage_path('app/tmp/'), + 'HOME' => storage_path('app/' . 
env('LIBREOFFICE_CONFIG_PATH', 'tmp/libreoffice')), ]; if ($extension == 'odt') { @@ -50,21 +51,6 @@ class Convertor { $folder ); - - // $process = new Process( - // [ - // 'soffice', - // '--convert-to', - // $to, - // $original, - // '--outdir', - // $folder - // ], base_path(), - // [ - // 'HOME' => base_path(), - // 'FILTER' => 'OpenDocument Text Flat XML' - // ] - // ); # We will run the process from a shell command line, which allows us to add parameters # The "OpenDocument Text Flat XML" parameter contains whitespaces, so we will need to add that as # a env variable parameter, otherwise the Process class will escape it and it will not work properly. @@ -76,8 +62,6 @@ class Convertor { $process->run(function ($type, $buffer) { if (Process::ERR === $type) { Log::info("CONVERT ERROR: " . $buffer); - } else { - // Log::info("CONVERT OUTPUT: " . $buffer); } }, $env); @@ -85,6 +69,8 @@ class Convertor { throw new ProcessFailedException($process); } + Storage::deleteDirectory(env('LIBREOFFICE_CONFIG_PATH', 'tmp/libreoffice')); + return $path['filename'] . '.' . $to; } } \ No newline at end of file diff --git a/app/SearchDisplace/SearchAndDisplaceXML.php b/app/SearchDisplace/SearchAndDisplaceXML.php index 84165e7..0409a7a 100644 --- a/app/SearchDisplace/SearchAndDisplaceXML.php +++ b/app/SearchDisplace/SearchAndDisplaceXML.php @@ -60,27 +60,8 @@ class SearchAndDisplaceXML $dom->load($filePath . 
"/document.xml"); // foreach($dom->getElementsByTagName('p') as $p) { - foreach($dom->getElementsByTagName('body') as $p) { - // if( - // !$p instanceof DOMText && - // count($p->childNodes) > 0 && - // isset($p->parentNode->tagName) && - // $p->parentNode->tagName !== "table:table-cell" - // ) { - // $replacements = []; - // foreach($p->childNodes as $child) { - // if (in_array($child, $replacements)) { - // continue; - // } - - // if (!$child instanceof DOMText) { - // continue; - // } - - // $replacements = array_merge($replacements, $this->replace($child, $dom)); - // } - // } - $this->processElement($p, $dom); + foreach($dom->getElementsByTagName('body') as $body) { + $this->processElement($body, $dom); } $dom->save($filePath . "/document_sdapplied.xml"); @@ -126,7 +107,7 @@ class SearchAndDisplaceXML /** * Apply SD on document's paragraph * - * @param DOMNode $element DOM element + * @param DOMText $element DOM element * @param DOMDocument $dom The document * * @return array @@ -154,40 +135,38 @@ class SearchAndDisplaceXML $content = $element->textContent; $indexes = $changed; } else { - $content = $changed['content']; + $element->textContent = $content = $changed['content']; $indexes = $changed['indexes']; } - foreach($indexes as $searcher => $changes) { - if(empty($changes)) { - continue; - } - - foreach($changes as $change) { - $firstContent = substr($content, 0, $change['start']); - $changedContent = substr($content, $change['start'], $change['end'] - $change['start'] + 1); - $lastContent = substr($content, $change['end'] + 1); - - - // $firstNode = $dom->createElement("text:span", $firstContent); - $element->textContent = htmlspecialchars($firstContent); + # The changed indexes are filed into arrays based on which searcher they belong to. + # This doesn't concern us here. 
Merge all the changed indexes into a single array, so we can sort them + $indexes = array_merge( ...array_values($indexes)); # Unpack and merge the arrays into a single array - $changedNode = $dom->createElement("text:span", htmlspecialchars($changedContent)); - $changedNode->setAttribute('text:style-name', 'mark'); + # Sort all the indexes in descending order + usort($indexes, function($first, $second) { + return $second['start'] - $first['start']; + }); - $lastNode = $dom->createElement("text:span", htmlspecialchars($lastContent)); - - // Add the changed and last nodes after the current (element) node - // $element->parentNode->insertBefore($firstNode, $element->nextSibling); - - # element->parentNode->insertBefore(... $element->nextSibling) inserts a new node before the node AFTER this one - # So we need to add the `last` node first, and then the `changed` node BEFORE the last. - $element->parentNode->insertBefore($lastNode, $element->nextSibling); - $element->parentNode->insertBefore($changedNode, $element->nextSibling); + foreach($indexes as $index) { + $content = $element->textContent; - $replacementNodes[] = $changedNode; - $replacementNodes[] = $lastNode; - } + # Split the element at the specified indexes + # (end needs to be + 1, since end is where the changed text finishes), + # so we need to split the text one character after it ends + $lastNode = $element->splitText($index['end'] + 1); + $changedNode = $element->splitText($index['start']); + + # DOMText::splitText creates DOMText nodes. + # The changed node needs to be a DOMElement so we can add the mark style + # Create an element with the same content, then replace the node. 
+ $changedNodeElement = $dom->createElement("text:span", htmlspecialchars($changedNode->textContent)); + $changedNodeElement->setAttribute('text:style-name', 'mark'); + $changedNode->parentNode->replaceChild($changedNodeElement, $changedNode); + $changedNode = $changedNodeElement; + + $replacementNodes[] = $changedNode; + $replacementNodes[] = $lastNode; } if(!$this->markedStyleCreated) {