From b7ebd686df8a2b617632bc0080300af149aaf899 Mon Sep 17 00:00:00 2001 From: Radu Liviu Carjan Date: Fri, 28 Oct 2022 13:18:40 +0300 Subject: [PATCH] Minor fix to OCR files --- app/SearchDisplace/Convertor/Convertor.php | 12 ++++++++++ .../Ingest/HandleReceivedDocument.php | 14 +++++++++++- .../SearchAndDisplaceFromFiles.php | 22 +++++++++++-------- 3 files changed, 38 insertions(+), 10 deletions(-) diff --git a/app/SearchDisplace/Convertor/Convertor.php b/app/SearchDisplace/Convertor/Convertor.php index 7284861..4be7bc9 100644 --- a/app/SearchDisplace/Convertor/Convertor.php +++ b/app/SearchDisplace/Convertor/Convertor.php @@ -4,6 +4,7 @@ namespace App\SearchDisplace\Convertor; use Symfony\Component\Process\Process; use Symfony\Component\Process\Exception\ProcessFailedException; +use Illuminate\Support\Facades\Log; /** * Convert documents from formats supported by Libre Office @@ -28,6 +29,17 @@ class Convertor { $folder = storage_path('app/tmp/'); } + Log::info('Running `soffice` to convert "' . $original . '" to "' . $to . '". Output folder: "' . $folder . '"'); + Log::info( + 'COMMAND: ' . + 'soffice ' . + '--convert-to ' . + $to . ' ' . + $original . ' ' . + '--outdir ' . + $folder + ); + $process = new Process( [ 'soffice', diff --git a/app/SearchDisplace/Ingest/HandleReceivedDocument.php b/app/SearchDisplace/Ingest/HandleReceivedDocument.php index fe92cea..3ce8630 100644 --- a/app/SearchDisplace/Ingest/HandleReceivedDocument.php +++ b/app/SearchDisplace/Ingest/HandleReceivedDocument.php @@ -9,6 +9,8 @@ use GuzzleHttp\Client; use GuzzleHttp\Exception\ClientException; use Illuminate\Support\Facades\Log; use Illuminate\Support\Facades\Storage; +use \Illuminate\Contracts\Filesystem\Filesystem; +use App\SearchDisplace\Convertor\Convertor; class HandleReceivedDocument { @@ -17,6 +19,7 @@ class HandleReceivedDocument protected $fileResultType; protected $documentFormat; protected $status; + protected Filesystem $storage; public function __construct($payload) { @@ -28,6 +31,8 @@ class HandleReceivedDocument if (isset($payload['data']['document_format'])) { $this->documentFormat = $payload['data']['document_format']; } + + $this->storage = Storage::disk('local'); } /** @@ -58,6 +63,7 @@ class HandleReceivedDocument // The .md extension signals the success status, the lack of signals the fail status. if ($this->status === 'success') { + $intermediateFileName = $fileName . '.odt'; $fileName = $fileName . '.html'; } @@ -68,6 +74,11 @@ class HandleReceivedDocument $storage->put("$dir/$fileName", $this->content['document']); + // HTML document cannot be converted directly to xml. So we convert to ODT first, then to XML + Convertor::convert('odt', $this->storage->path("$dir/$fileName")); + Convertor::convert('xml', $this->storage->path("$dir/$intermediateFileName")); + $this->storage->delete("$dir/$intermediateFileName"); + foreach ($this->content['images'] as $image) { $name = $image['name']; $type = $image['type']; @@ -81,7 +92,8 @@ class HandleReceivedDocument // Emit event so other sections of the app can work on it. IngestDocumentReceived::dispatch($this->id); } catch (\Exception $exception) { - \Illuminate\Support\Facades\Log::info('Exception: ' . $exception->getTraceAsString()); + Log::error('Exception: ' . $exception->getMessage()); + Log::error($exception->getTraceAsString()); } } diff --git a/app/SearchDisplace/SearchAndDisplaceFromFiles.php b/app/SearchDisplace/SearchAndDisplaceFromFiles.php index ceb006d..34b8ee9 100644 --- a/app/SearchDisplace/SearchAndDisplaceFromFiles.php +++ b/app/SearchDisplace/SearchAndDisplaceFromFiles.php @@ -30,7 +30,7 @@ class SearchAndDisplaceFromFiles } try { - $documentContent = $this->storage->get("$this->directoryPath/document.md"); + $documentContent = $this->storage->get("$this->directoryPath/document.html"); $searchersContent = json_decode($this->storage->get($this->infoFilePath), true); $documentPath = $searchersContent['document_path']; @@ -38,15 +38,19 @@ class SearchAndDisplaceFromFiles $this->storage->put($this->infoFilePath, json_encode($searchers[0]['content'])); - $searchAndDisplace = new SearchAndDisplace($documentContent, [ - 'searchers' => [ - [ - 'key' => $this->id, - 'type' => $searchers[0]['type'], - 'value' => $searchers[0]['value'], - ] + $searchAndDisplace = new SearchAndDisplace( + $documentContent, + [ + 'searchers' => [ + [ + 'key' => $this->id, + 'type' => $searchers[0]['type'], + 'value' => $searchers[0]['value'], + ] + ], ], - ]); + false + ); $result = $searchAndDisplace->execute();