From fc833a9ab763540b27722e22ef8d5874040f6ba0 Mon Sep 17 00:00:00 2001 From: Orzu Ionut Date: Wed, 16 Jun 2021 12:54:03 +0300 Subject: [PATCH] PDFs with images --- README.md | 3 + app/Ingest/AbstractConvertor.php | 24 ++ app/Ingest/Convertor.php | 434 +------------------------------ app/Ingest/DocumentHandler.php | 6 +- app/Ingest/DocxConvertor.php | 46 ++++ app/Ingest/OtherConvertor.php | 50 ++++ app/Ingest/PDFConvertor.php | 271 +++++++++++++++++++ app/Ingest/TextConvertor.php | 52 ++++ app/Jobs/IngestDocuments.php | 103 +------- app/Jobs/SendToCore.php | 74 ++++-- 10 files changed, 516 insertions(+), 547 deletions(-) create mode 100644 app/Ingest/AbstractConvertor.php create mode 100644 app/Ingest/DocxConvertor.php create mode 100644 app/Ingest/OtherConvertor.php create mode 100644 app/Ingest/PDFConvertor.php create mode 100644 app/Ingest/TextConvertor.php diff --git a/README.md b/README.md index 5d370b7..55d694f 100644 --- a/README.md +++ b/README.md @@ -70,6 +70,9 @@ cd Bin # Dewarp pip3 install opencv-python +cd DEWARP_INSTALLATION_DIRECTORY +pip3 install -r requirements.txt + # MAT2 (Metadata remover) - Not used at the moment pip3 install mat2 apt-get install gir1.2-poppler-0.18 diff --git a/app/Ingest/AbstractConvertor.php b/app/Ingest/AbstractConvertor.php new file mode 100644 index 0000000..bd06f20 --- /dev/null +++ b/app/Ingest/AbstractConvertor.php @@ -0,0 +1,24 @@ +storage = $storage; + $this->path = $path; + $this->directoryPath = pathinfo($path, PATHINFO_DIRNAME); + } + + abstract public function execute(); + + protected function deleteOriginalDocument() + { + $this->storage->delete($this->path); + } +} diff --git a/app/Ingest/Convertor.php b/app/Ingest/Convertor.php index d6bbce7..8a204af 100644 --- a/app/Ingest/Convertor.php +++ b/app/Ingest/Convertor.php @@ -5,7 +5,6 @@ namespace App\Ingest; use Illuminate\Support\Facades\Storage; use Symfony\Component\Process\Exception\ProcessFailedException; use Symfony\Component\Process\Process; -use League\HTMLToMarkdown\HtmlConverter; class Convertor { @@ -30,191 +29,18 @@ class Convertor public function execute() { if ($this->type === 'txt') { - return $this->path; + $convertor = new TextConvertor($this->storage, $this->path); + } else if ($this->type === 'pdf') { + $convertor = new PDFConvertor($this->storage, $this->path); + } else if ($this->type === 'docx') { + $convertor = new DocxConvertor($this->storage, $this->path); + } else { + $convertor = new OtherConvertor($this->storage, $this->path); } - if ($this->type === 'pdf') { -// $this->convertPdfToText(); - $this->convertPdfToMD(); -// $this->getHtmlContentsFromPdfWithImages(); + $convertor->execute(); - return $this->path; - } - - if ($this->type !== 'docx') { - $this->convertToDocx(); - } - - $this->convertDocumentToText(); //$this->convertToHtml(); - - return $this->path; - } - - /** - * Convert doc,dot,rtf,odt,pdf,docx to docx - * - * - * @return string|void - */ - private function convertToDocx() - { - (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); - - /** - * Convert doc,dot,rtf,odt to docx - */ - $process = new Process([ - 'soffice', - '--headless', - '--convert-to', - 'docx', - $this->storage->path($this->path), - '--outdir', - $this->storage->path('contracts') - ]); - - $process->run(); - - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); - } - - $this->storage->delete($this->path); - - $this->path = str_replace(".$this->type", '.docx', $this->path); - } - - /** - * Convert docx file to text - * - * @return void - */ - private function convertDocumentToText() - { - (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); - - $process = new Process([ - 'soffice', - '--headless', - '--convert-to', - 'txt', - $this->storage->path($this->path), - '--outdir', - $this->storage->path('contracts') - ]); - - $process->run(); - - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); - } - - $this->storage->delete($this->path); - - $this->path = str_replace(['.docx', '.bin'], '.txt', $this->path); - } - - protected function convertPdfToText() - { - $this->prepareForConvertPDF(); - - $images = $this->getImagesFromPDF(); - - $contents = $this->getTextContentsFromPDF(); - - if (!$contents && count($images) === 0) { - throw new \Exception('Could not read from file.'); - } - - // Handle images and image contents. - if (count($images) > 0) { - foreach ($images as $image) { - try { - $ocr = new OCR($this->storage->path($image)); - - $imageContents = $ocr->execute(); - - $contents = $contents . "\n" . $imageContents; - } catch (\Exception $exception) { - \Illuminate\Support\Facades\Log::info('something wrong: ' . $exception->getMessage()); - } - } - - $dir = str_replace('.pdf', '', $this->path); - - $this->storage->deleteDirectory($dir); - } - - $this->storage->delete($this->path); - - $this->path = str_replace('.pdf', '.txt', $this->path); - - $this->storage->put($this->path, $contents); - } - - protected function convertPdfToMD() - { -// $this->prepareForConvertPDF(); - - $result = $this->getContentsFromPdf(); - - if ( ! $result['has_images'] && ! $result['has_text']) { - throw new \Exception('Cannot get pdf file contents.'); - } - - if ($result['has_text']) { - if ($result['has_images']) { - // Both text and images. - throw new \Exception('Not supported for now.'); - } - - // Delete directory because the contents are in the '$result' variable. - $this->storage->deleteDirectory($this->path); - - $mdContents = ''; - - foreach ($result['htmls'] as $html) { - $converter = new HtmlConverter(); - $converter->getConfig()->setOption('strip_tags', true); - - $contents = $converter->convert($html); - - $mdContents = $mdContents . $contents; - } - - $this->path = "$this->path.md"; - - $this->storage->put($this->path, $mdContents); - - return; - } - - // Only contains images. - $imagesContent = ''; - $files = $this->storage->allFiles($this->path); - - foreach ($files as $file) { - // Only get the image files from the directory, it may contain some empty html files too. - if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) { - $ocr = new OCR($this->storage->path($file)); - - $imagesContent = $imagesContent . $ocr->execute(); - } - } - - \Illuminate\Support\Facades\Log::info('============================'); - \Illuminate\Support\Facades\Log::info($this->path); - - // We are done with the images processing, delete directory. - $this->storage->deleteDirectory($this->path); - - $this->path = "$this->path.md"; - - \Illuminate\Support\Facades\Log::info($this->path); - \Illuminate\Support\Facades\Log::info('++++++++++++++++++++++++++'); - - $this->storage->put($this->path, $imagesContent); } private function convertToHtml() @@ -262,248 +88,4 @@ class Convertor $this->path = str_replace(".$this->type", '.xml', $this->path); } - - protected function prepareForConvertPDF() - { - (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); - - $process = new Process([ - 'pip3', - 'install', - 'pdftotext', - ]); - - $process->run(); - - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); - } - } - - protected function getImagesFromPDF() - { - $dir = str_replace('.pdf', '', $this->path); - - $this->storage->makeDirectory($dir); - - $process = new Process([ - 'pdfimages', - '-p', - $this->storage->path($this->path), - '-tiff', - $this->storage->path("$dir/ocr") - ]); - - $process->run(); - - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); - } - - return $this->storage->allFiles($dir); - } - - protected function getTextContentsFromPDF() - { - $outputPath = $this->storage->path(str_replace('.pdf', '.txt', $this->path)); - - $process = new Process([ - 'python3', - storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'), - '-i', - $this->storage->path($this->path), - '-o', - $outputPath - ]); - - $process->run(); - - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); - } - - return file_get_contents($outputPath); - } - - protected function getHtmlContentsFromPdfWithImages() - { - $dirName = str_replace('.pdf', '', $this->path); - $this->storage->makeDirectory($dirName); - - $outputPath = $this->storage->path("$dirName/html"); - - $process = new Process([ - 'pdftohtml', - '-noframes', - $this->storage->path($this->path), - $outputPath - ]); - - $process->run(); - - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); - } - - $this->storage->delete($this->path); - - $this->path = $dirName; - - $converter = new HtmlConverter(); - $converter->getConfig()->setOption('strip_tags', true); - - $files = $this->storage->allFiles($this->path); - - $htmlFileIndex = null; - - foreach ($files as $index => $file) { - // if (pathinfo($file, PATHINFO_BASENAME) === 'html-html.html') { - // if (pathinfo($file, PATHINFO_EXTENSION) === 'html') { - if (pathinfo($file, PATHINFO_BASENAME) === 'html.html') { - $htmlFileIndex = $index; - - break; - } - } - - $htmlContents = $this->storage->get($files[$htmlFileIndex]); - $contents = $converter->convert($htmlContents); - -// $this->storage->deleteDirectory($this->path); - - $this->path = "$this->path.md"; - - $this->storage->put($this->path, $contents); - - dd(3); - } - - protected function getContentsFromPdf() - { - $dirName = str_replace('.pdf', '', $this->path); - $this->storage->makeDirectory($dirName); - - $outputPath = $this->storage->path("$dirName/html"); - - $process = new Process([ - 'pdftohtml', - '-xml', - $this->storage->path($this->path), - $outputPath - ]); - - $process->run(); - - if (!$process->isSuccessful()) { - throw new ProcessFailedException($process); - } - - $this->storage->delete($this->path); - - $this->path = $dirName; - - $contents = $this->storage->get("$this->path/html.xml"); - - $xml = simplexml_load_string($contents); - - $fonts = []; - - foreach ($xml->page as $page) { - foreach ($page as $p) { - if ($p->getName() === 'fontspec') { - $fonts[(int) $p['id']]['family'] = (string) $p['family']; - $fonts[(int) $p['id']]['size'] = (string) $p['size']; - $fonts[(int) $p['id']]['color'] = (string) $p['color']; - } - } - } - - $htmls = []; - $hasImages = false; - $hasText = false; - - try { - foreach ($xml->page as $page) { - $html = ''; - - $previousP = null; - - foreach ($page as $p) { - if ($p->getName() == 'image') { - $html = $html . ''; - - $hasImages = true; - } - - if ($p->getName() == 'text') { - $id = (int) $p['font']; - $font_size = $fonts[$id]['size']; - $font_color = $fonts[$id]['color']; - $font_family = $fonts[$id]['family']; - - $style = ''; - $style = $style . 'position: absolute;'; - $style = $style . "color: $font_color;"; - $style = $style . "font-family: $font_family;"; - $style = $style . "font-weight: 900;"; - $style = $style . "width: " . $p['width'] . "px;"; - $style = $style . "height: " . $p['height'] . "px;"; - $style = $style . "top: " . $p['top'] . "px;"; - $style = $style . "left: " . $p['left'] . "px;"; - -// $style = $style . "font-size: $font_size" . "px;"; - - if ($p->i) { - $content = '' . $p->i . ''; - } else if ($p->b) { - $content = '' . $p->b . ''; - } else { - $content = $p; - } - - // @TODO Must chain paragraphs if top are almost same. - - $tag = $this->getTag($p, $previousP, $font_size); - - $html = $html . '<' . $tag . ' style="' . $style . '">' . $content . ''; - - $hasText = true; - } - - $previousP = $p; - } - - $htmls[] = '' . $html . ''; - } - } catch (\Exception $exception) { - \Illuminate\Support\Facades\Log::info($exception->getTraceAsString()); - } - - return [ - 'has_images' => $hasImages, - 'has_text' => $hasText, - 'htmls' => $htmls, - ]; - } - - protected function getTag($p, $previousP, $size) - { - if ($size > 24) { - return 'h1'; - } - - if ($size > 18) { - return 'h2'; - } - - if ($size > 16) { - return 'h3'; - } - - if ($previousP && $p['top'] - $previousP['top'] <= 5) { - return 'span'; - } - - return 'p'; - } } diff --git a/app/Ingest/DocumentHandler.php b/app/Ingest/DocumentHandler.php index 25b4963..38e4dfe 100644 --- a/app/Ingest/DocumentHandler.php +++ b/app/Ingest/DocumentHandler.php @@ -53,8 +53,10 @@ class DocumentHandler $type = $this->supportedFiles[$mimeType]; - $path = $storage->putFileAs("contracts", $file, "$this->id.$type"); + $id = str_replace(' ', '_', $this->id); - IngestDocuments::dispatch($path, $type); + $path = $storage->putFileAs("contracts/$id", $file, "document.$type"); + + IngestDocuments::dispatch($this->id, $path, $type); } } diff --git a/app/Ingest/DocxConvertor.php b/app/Ingest/DocxConvertor.php new file mode 100644 index 0000000..421a408 --- /dev/null +++ b/app/Ingest/DocxConvertor.php @@ -0,0 +1,46 @@ +convertToText(); + + $convertor = new TextConvertor($this->storage, "$this->directoryPath/document.txt"); + + $convertor->execute(); + } + + /** + * Convert docx file to text + * + * @return void + */ + protected function convertToText() + { + (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); + + $process = new Process([ + 'soffice', + '--headless', + '--convert-to', + 'txt', + $this->storage->path($this->path), + '--outdir', + $this->storage->path($this->directoryPath) + ]); + + $process->run(); + + if (!$process->isSuccessful()) { + throw new ProcessFailedException($process); + } + + $this->deleteOriginalDocument(); + } +} diff --git a/app/Ingest/OtherConvertor.php b/app/Ingest/OtherConvertor.php new file mode 100644 index 0000000..53f6839 --- /dev/null +++ b/app/Ingest/OtherConvertor.php @@ -0,0 +1,50 @@ +convertToDocx(); + + $convertor = new DocxConvertor($this->storage, "$this->directoryPath/document.docx"); + + $convertor->execute(); + } + + /** + * Convert doc,dot,rtf,odt,pdf,docx to docx + * + * + * @return string|void + */ + private function convertToDocx() + { + (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); + + /** + * Convert doc,dot,rtf,odt to docx + */ + $process = new Process([ + 'soffice', + '--headless', + '--convert-to', + 'docx', + $this->storage->path($this->path), + '--outdir', + $this->storage->path($this->directoryPath) + ]); + + $process->run(); + + if (!$process->isSuccessful()) { + throw new ProcessFailedException($process); + } + + $this->deleteOriginalDocument(); + } +} diff --git a/app/Ingest/PDFConvertor.php b/app/Ingest/PDFConvertor.php new file mode 100644 index 0000000..2b86d7e --- /dev/null +++ b/app/Ingest/PDFConvertor.php @@ -0,0 +1,271 @@ +prepareForConvertPDF(); + + $result = $this->getFileContents(); + + if ( ! $result['has_images'] && ! $result['has_text']) { + throw new \Exception('Cannot get pdf file contents.'); + } + + if ($result['has_text']) { + $mdContents = ''; + + foreach ($result['htmls'] as $html) { + $converter = new HtmlConverter(); + $converter->getConfig()->setOption('strip_tags', true); + + $contents = $converter->convert($html); + + $mdContents = $mdContents . "\n\n" . $contents; + } + + $this->storage->put("$this->directoryPath/document.md", $mdContents); + + return; + } + + // Only contains images. + $imagesContent = ''; + $files = $this->storage->allFiles($this->path); + + foreach ($files as $file) { + // Only get the image files from the directory, it may contain some empty html files too. + + // @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep. + if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) { + $ocr = new OCR($this->storage->path($file)); + + $imagesContent = $imagesContent . $ocr->execute(); + + $this->storage->delete($file); + } + } + + $this->storage->put("$this->directoryPath/document.md", $imagesContent); + } + + protected function getFileContents() + { + $outputPath = $this->storage->path("$this->directoryPath/html"); + + $process = new Process([ + 'pdftohtml', + '-xml', + $this->storage->path($this->path), + $outputPath + ]); + + $process->run(); + + if (!$process->isSuccessful()) { + throw new ProcessFailedException($process); + } + + // Remove original document. + $this->storage->delete($this->path); + + return $this->getDataFromXML(); + } + + protected function getDataFromXML() + { + $xmlFilePath = "$this->directoryPath/html.xml"; + + $contents = $this->storage->get($xmlFilePath); + + $xml = simplexml_load_string($contents); + + $orderedList = []; + $fonts = []; + + foreach ($xml->page as $page) { + $pageNumber = (int) $page['number'][0]; + + $orderedList[$pageNumber] = []; + + foreach ($page as $p) { + if ($p->getName() === 'fontspec') { + $fonts[(int) $p['id']]['family'] = (string) $p['family']; + $fonts[(int) $p['id']]['size'] = (string) $p['size']; + $fonts[(int) $p['id']]['color'] = (string) $p['color']; + } + + if (isset($p['top'])) { + $top = (int) $p['top']; + + if ( ! array_key_exists($top, $orderedList[$pageNumber])) { + $orderedList[$pageNumber][$top] = []; + } + + $orderedList[$pageNumber][$top][] = $p; + } + } + + ksort($orderedList[$pageNumber]); + } + + $htmls = []; + $hasImages = false; + $hasText = false; + + $imagesCount = 0; + $imagesInFooter = true; + + try { + foreach ($orderedList as $page) { + $html = ''; + $footerImages = []; + + foreach ($page as $items) { + $continuousP = ''; + + foreach ($items as $p) { + if ($p->getName() == 'image') { + $hasImages = true; + + $imagesCount += 1; + $caption = "Fig. $imagesCount"; + + $imageHTML = $this->handleImage($p, $caption); + + if ( ! $imagesInFooter) { + $html = $html . $imageHTML; + } else { + $html = $html . "

$caption

"; + + $footerImages[] = $imageHTML; + } + } + + if ($p->getName() == 'text') { + $continuousP = $continuousP . $this->handleText($p, $fonts); + + $hasText = true; + } + } + + $html = $html . '

' . $continuousP . '

'; + } + + if ($imagesInFooter) { + foreach ($footerImages as $index => $footerImage) { + $html = $html . '

' . $footerImage . '

'; +// $html = $html . '

Fig. ' . ($index + 1) . '

'; + } + } + + $htmls[] = '' . $html . ''; + } + } catch (\Exception $exception) { + $this->storage->deleteDirectory($this->directoryPath); + + \Illuminate\Support\Facades\Log::info($exception->getTraceAsString()); + + throw new \Exception('Something went wrong.'); + } + + if ( ! $hasText && ! $hasImages) { + // Remove directory because we do not have any use for it anymore. + $this->storage->deleteDirectory($this->directoryPath); + } else { + // Remove the unnecessary 'xml' file. + $this->storage->delete($xmlFilePath); + } + + return [ + 'has_images' => $hasImages, + 'has_text' => $hasText, + 'htmls' => $htmls, + ]; + } + + protected function handleImage($p, $caption) + { + $html = ''; + + $src = './' . pathinfo($p['src'], PATHINFO_BASENAME); + + $html = $html . '
'; + $html = $html . '' . $caption . ''; + $html = $html . '
'; + $html = $html . '
'; + + return $html; + } + + protected function handleText($p, $fonts) + { + $id = (int) $p['font']; + $font_size = $fonts[$id]['size']; + $font_color = $fonts[$id]['color']; + $font_family = $fonts[$id]['family']; + + $style = ''; + $style = $style . 'position: absolute;'; + $style = $style . "color: $font_color;"; + $style = $style . "font-family: $font_family;"; + $style = $style . "font-weight: 900;"; + $style = $style . "width: " . $p['width'] . "px;"; + $style = $style . "height: " . $p['height'] . "px;"; + $style = $style . "top: " . $p['top'] . "px;"; + $style = $style . "left: " . $p['left'] . "px;"; + $style = $style . "font-size: $font_size" . "px;"; + + if ($p->i) { + $content = '' . $p->i . ''; + } else if ($p->b) { + $content = '' . $p->b . ''; + } else { + $content = $p; + } + + $tag = $this->getTag($font_size); + + return '<' . $tag . ' style="' . $style . '">' . $content . ''; + } + + protected function getTag($size) + { + if ($size > 24) { + return 'h1'; + } + + if ($size > 18) { + return 'h2'; + } + + if ($size > 16) { + return 'h3'; + } + + return 'span'; + } + + protected function prepareForConvertPDF() + { + (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); + + $process = new Process([ + 'pip3', + 'install', + 'pdftotext', + ]); + + $process->run(); + + if (!$process->isSuccessful()) { + throw new ProcessFailedException($process); + } + } +} diff --git a/app/Ingest/TextConvertor.php b/app/Ingest/TextConvertor.php new file mode 100644 index 0000000..5630d6a --- /dev/null +++ b/app/Ingest/TextConvertor.php @@ -0,0 +1,52 @@ +fromFile($this->storage->path($this->path)); + + if ( ! $content) { + throw new \Exception('Could not read content.'); + } + + $content = $this->convertToUTF8($content); + + $this->storeContent($content); + } + + protected function convertToUTF8($content) + { + array_walk_recursive( + $content, + function (&$entry) { + $entry = mb_convert_encoding( + $entry, + 'UTF-8' + ); + } + ); + + return $content; + } + + protected function storeContent($content) + { + $this->storeMD($content); + + $this->deleteOriginalDocument(); + } + + protected function storeMD($content) + { + $convertor = new MDConvertor($content); + + $this->storage->put("$this->directoryPath/document.md", $convertor->execute()); + } +} diff --git a/app/Jobs/IngestDocuments.php b/app/Jobs/IngestDocuments.php index 7720ccb..1c08f87 100644 --- a/app/Jobs/IngestDocuments.php +++ b/app/Jobs/IngestDocuments.php @@ -3,12 +3,10 @@ namespace App\Jobs; use App\Ingest\Convertor; -use App\Ingest\MDConvertor; use App\Parser\ParseXml; use App\Parser\DocxParser\ParseDocx; use App\Parser\HtmlParser\ParseHtml; use App\Parser\ParseHtmlArray; -use App\Parser\ParseTextArray; use Illuminate\Bus\Queueable; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Bus\Dispatchable; @@ -20,6 +18,7 @@ class IngestDocuments implements ShouldQueue { use Dispatchable, InteractsWithQueue, Queueable; + protected $id; private $path; protected $type; @@ -51,11 +50,13 @@ class IngestDocuments implements ShouldQueue /** * Create a new job instance. * + * @param $id * @param string $path * @param $type */ - public function __construct(string $path, $type) + public function __construct($id, string $path, $type) { + $this->id = $id; $this->path = $path; $this->type = $type; @@ -76,7 +77,7 @@ class IngestDocuments implements ShouldQueue $convertor = new Convertor($this->path, $this->type); try { - $this->path = $convertor->execute(); + $convertor->execute(); } catch (\Exception $exception) { \Illuminate\Support\Facades\Log::info($exception->getMessage()); @@ -85,30 +86,7 @@ class IngestDocuments implements ShouldQueue return; } - // @TODO Replace later, the convertor will create the .md file. - if ($this->type !== 'pdf') { - $content = $this->getContent(); - - if ( ! $content) { - $this->failed(); - - return; - } - - $content = $this->convertToUTF8($content); - - try { - $filePath = $this->storeContent($content); - } catch (\Exception $e) { - Log::error('Error writing in to the file: ' . $e->getMessage()); - -// report($e); - } - } else { - $filePath = $this->path; - } - - SendToCore::dispatch($filePath); + SendToCore::dispatch($this->id, pathinfo($this->path, PATHINFO_DIRNAME)); } public function failed() @@ -124,73 +102,6 @@ class IngestDocuments implements ShouldQueue // $this->storage->delete($this->path); // } - SendToCore::dispatch($this->path, true); - } - - protected function getContent() - { - if ($this->type === 'pdf') { - // Wait while it finishes. - while (!$this->storage->exists($this->path)) { - sleep(1); - } - - $textParser = new ParseTextArray(true); - - return $textParser->fromFile($this->storage->path($this->path)); - } - - $textParser = new ParseTextArray(); - - return $textParser->fromFile($this->storage->path($this->path)); - } - - protected function convertToUTF8($content) - { - array_walk_recursive( - $content, - function (&$entry) { - $entry = mb_convert_encoding( - $entry, - 'UTF-8' - ); - } - ); - - return $content; - } - - protected function storeContent($content) - { - $result = explode('.', $this->path); - $name = $result[0]; - - // Or json? - $filePath = $this->storeMD($name, $content); - - // Delete converted file. We now have the .md file. - $this->storage->delete($this->path); - - return $filePath; - } - - protected function storeMD($name, $content) - { - $fileName = "$name.md"; - - $convertor = new MDConvertor($content); - - $this->storage->put($fileName, $convertor->execute()); - - return $fileName; - } - - protected function storeJson($name, $content) - { - $fileName = "$name.json"; - - $this->storage->put($fileName, $content); - - return $fileName; + SendToCore::dispatch($this->id, pathinfo($this->path, PATHINFO_DIRNAME), true); } } diff --git a/app/Jobs/SendToCore.php b/app/Jobs/SendToCore.php index d1acd3e..b6d7779 100644 --- a/app/Jobs/SendToCore.php +++ b/app/Jobs/SendToCore.php @@ -18,7 +18,7 @@ class SendToCore implements ShouldQueue private $secret; - private $filePath; + private $directoryPath; private $id; @@ -32,19 +32,18 @@ class SendToCore implements ShouldQueue /** * Create a new job instance. * - * @param null $filePath + * @param $id + * @param null $directoryPath * @param bool $hasFailed */ - public function __construct($filePath = null, $hasFailed = false) + public function __construct($id, $directoryPath = null, $hasFailed = false) { $this->url = env('WEBHOOK_CORE_URL') . '/webhooks'; $this->secret = env('WEBHOOK_CORE_SECRET'); - $this->filePath = $filePath; - $this->hasFailed = $hasFailed; - $string = str_replace('contracts/', '', $this->filePath); - $result = explode('.', $string); - $this->id = $result[0]; + $this->id = $id; + $this->directoryPath = $directoryPath; + $this->hasFailed = $hasFailed; } /** @@ -55,70 +54,99 @@ class SendToCore implements ShouldQueue */ public function handle() { - $content = ''; + $content = []; - // File exists, send content. - if ($this->filePath && ! $this->hasFailed) { + // Directory exists, send content. + if ($this->directoryPath && ! $this->hasFailed) { $this->storage = Storage::disk('local'); // @TODO Check if the file exists multiple times? - if ( ! $this->storage->exists($this->filePath)) { + if ( ! $this->storage->exists($this->directoryPath)) { throw new \Exception('File does not exist yet.'); } - $content = $this->storage->get($this->filePath); + $content = $this->getContent(); } $sent = $this->sendTheData($content); -// if ($this->filePath && $sent) { - if ($this->filePath) { +// if ($this->directoryPath && $sent) { + if ($this->directoryPath) { if ( ! $this->storage) { $this->storage = Storage::disk('local'); } - $this->storage->delete($this->filePath); + $this->storage->deleteDirectory($this->directoryPath); } } public function failed() { - if ($this->filePath) { + if ($this->directoryPath) { if ( ! $this->storage) { $this->storage = Storage::disk('local'); } - $this->storage->delete($this->filePath); + $this->storage->delete($this->directoryPath); } } /** - * Send the data to the core trough webhooks + * Send the data to the core through webhooks * * @param $content * @return bool */ - protected function sendTheData($content) + protected function sendTheData(array $content) { try { WebhookCall::create() ->url($this->url) ->payload(['data' => [ 'id' => $this->id, - 'content' => $this->encodeContent($content), - 'status' => $content ? 'success' : 'fail', + 'content' => $content, + 'status' => count($content) > 0 ? 'success' : 'fail', ]]) ->useSecret($this->secret) ->dispatch(); return true; } catch (\Exception $exception) { - Log::error('SendToCore@sendTheData' . $exception->getMessage()); + Log::error('SendToCore@sendTheData: ' . $exception->getMessage()); return false; } } + protected function getContent() + { + $document = $this->storage->get("$this->directoryPath/document.md"); + $document = $this->encodeContent($document); + + $images = []; + + $allFiles = $this->storage->allFiles($this->directoryPath); + + foreach ($allFiles as $file) { + // @TODO We are using this check in the 'PDFConvertor' file, refactor and improve. + if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) { + $name = pathinfo($file, PATHINFO_FILENAME); + $type = pathinfo($file, PATHINFO_EXTENSION); + + $images[] = [ + 'name' => $name, + 'type' => $type, + 'contents' => 'data:image/' . $type . ';base64,' . base64_encode($this->storage->get($file)), + ]; + } + } + + return [ + 'document' => $document, + 'images' => $images, + ]; + } + protected function encodeContent($content) { $encoding = mb_detect_encoding($content, 'UTF-8, ISO-8859-1, WINDOWS-1252, WINDOWS-1251', true);