From fc45e0d69579c6f1b6096ab05d86e2a6ce7e010f Mon Sep 17 00:00:00 2001 From: Alex Puiu Date: Wed, 2 Mar 2022 18:01:25 +0200 Subject: [PATCH] Send document content as JSON. --- app/Http/Controllers/IngestController.php | 2 +- app/Ingest/PDFConvertor.php | 64 ++-- app/Jobs/IngestDocuments.php | 23 +- app/Jobs/SendToCore.php | 28 +- composer.lock | 343 +--------------------- 5 files changed, 54 insertions(+), 406 deletions(-) diff --git a/app/Http/Controllers/IngestController.php b/app/Http/Controllers/IngestController.php index ca2200b..08fa795 100644 --- a/app/Http/Controllers/IngestController.php +++ b/app/Http/Controllers/IngestController.php @@ -10,7 +10,7 @@ class IngestController extends Controller { request()->validate([ 'id' => 'required', - 'file_result_type' => 'required|in:md,original', + 'file_result_type' => 'required|in:md,original,json', 'mime_type' => 'required', 'document' => 'required|file', ]); diff --git a/app/Ingest/PDFConvertor.php b/app/Ingest/PDFConvertor.php index 2f1865c..29a266f 100644 --- a/app/Ingest/PDFConvertor.php +++ b/app/Ingest/PDFConvertor.php @@ -17,7 +17,7 @@ class PDFConvertor extends AbstractConvertor throw new \Exception('Cannot get pdf file contents.'); } - $this->storage->put("$this->directoryPath/document.md", $contents); + $this->storage->put("$this->directoryPath/document.json", json_encode($contents)); } protected function getFileContents() @@ -87,6 +87,8 @@ class PDFConvertor extends AbstractConvertor $mdContents = ''; $htmlContents = ''; + $json = []; + $i = 0; try { foreach ($orderedList as $page) { @@ -103,9 +105,10 @@ class PDFConvertor extends AbstractConvertor $basePath = $this->storage->path(''); $imageFilePath = str_replace($basePath, '', $p['src']); - $textContents = $this->applyOCR($imageFilePath); + // $textContents = $this->applyOCR($imageFilePath); @@ uncomment + $textContents = null; // remove this - if ($textContents) { + if (false) { $imageInFooter = true; if ($html) { $mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n"; @@ -126,14 +129,17 @@ class PDFConvertor extends AbstractConvertor $imagesCount += 1; $caption = "Fig. $imagesCount"; - $imageHTML = $this->handleImage($p, $caption); + $imageJSON = $this->handleImage($p, $caption); if (!$imageInFooter) { - $html = $html . $imageHTML; + $json[$i]['tag'] = 'img'; + $json[$i]['src'] = $imageJSON['src']; + $json[$i]['style'] = $imageJSON['style']; + $json[$i]['details'] = $caption; } else { $html = $html . "

$caption

"; - $footerImages[] = $imageHTML; + $footerImages[] = $imageJSON; } } } @@ -148,24 +154,20 @@ class PDFConvertor extends AbstractConvertor $addition = 'ยท '; } - $continuousP = $continuousP . $this->handleText($p, $fonts, $addition, $firstOfText); - + $continuousP = $this->handleText($p, $fonts, $addition, $firstOfText, true); + if($firstOfText) { + $json[$i]['tag'] = $continuousP['tag']; + $json[$i]['style'] = $continuousP['style']; + } + + (isset($json[$i]['content'])) ? $json[$i]['content'] = $json[$i]['content'] . $continuousP['content'] : $json[$i]['content'] = $continuousP['content']; + $firstOfText = false; $hasText = true; } } - - $html = $html . '

' . $continuousP . '

'; - } - - if (!empty($footerImages)) { - foreach ($footerImages as $footerImage) { - $html = $html . '

' . $footerImage . '

'; - } + $i++; } - - $mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n"; - $htmlContents = $htmlContents . "$html"; } } catch (\Exception $exception) { $this->storage->deleteDirectory($this->directoryPath); @@ -183,21 +185,15 @@ class PDFConvertor extends AbstractConvertor $this->storage->delete($xmlFilePath); } - return $mdContents; + return $json; } - protected function handleImage($p, $caption) + protected function handleImage($p) { - $html = ''; - - $src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME); - - $html = $html . '
'; - $html = $html . '' . $caption . ''; - $html = $html . '
'; - $html = $html . '
'; - - return $html; + return [ + 'src' => pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME), + 'style' => 'width="' . $p['width'] . 'px" ' . 'height="' . $p['height'] . 'px"' + ]; } protected function handleText($p, $fonts, $addition = null, $firstOfText = false) @@ -232,7 +228,11 @@ class PDFConvertor extends AbstractConvertor $tag = $this->getTag($font_size); - return '<' . $tag . ' style="' . $style . '">' . $content . ''; + return [ + 'tag' => $tag, + 'content' => (string) $content, + 'style' => $style + ]; } protected function getTag($size) diff --git a/app/Jobs/IngestDocuments.php b/app/Jobs/IngestDocuments.php index 0e80904..f0a9ac0 100644 --- a/app/Jobs/IngestDocuments.php +++ b/app/Jobs/IngestDocuments.php @@ -83,13 +83,10 @@ class IngestDocuments implements ShouldQueue protected function execute() { - if ($this->fileResultType === 'md') { - $this->convertToMD(); - - return; + if ($this->fileResultType === 'json') { + $this->convertToJSON(); } - - $this->convertToJsonData(); + return; } /** @@ -97,25 +94,13 @@ class IngestDocuments implements ShouldQueue * * @throws \Exception */ - protected function convertToMD() + protected function convertToJSON() { $convertor = new Convertor($this->path, $this->type); $convertor->execute(); } - /** - * Convert document to JSON data file. - * - * @throws \Exception - */ - protected function convertToJsonData() - { - $convertor = new DataJsonConvertor($this->path, $this->type); - - $convertor->execute(); - } - public function failed() { if ( ! $this->storage) { diff --git a/app/Jobs/SendToCore.php b/app/Jobs/SendToCore.php index 25a83c7..05d350d 100644 --- a/app/Jobs/SendToCore.php +++ b/app/Jobs/SendToCore.php @@ -132,21 +132,19 @@ class SendToCore implements ShouldQueue $images = []; - if ($extension === 'md') { - $allFiles = $this->storage->allFiles($this->directoryPath); - - foreach ($allFiles as $file) { - // @TODO We are using this check in the 'PDFConvertor' file, refactor and improve. - if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) { - $name = pathinfo($file, PATHINFO_FILENAME); - $type = pathinfo($file, PATHINFO_EXTENSION); - - $images[] = [ - 'name' => $name, - 'type' => $type, - 'contents' => 'data:image/' . $type . ';base64,' . base64_encode($this->storage->get($file)), - ]; - } + $allFiles = $this->storage->allFiles($this->directoryPath); + + foreach ($allFiles as $file) { + // @TODO We are using this check in the 'PDFConvertor' file, refactor and improve. + if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) { + $name = pathinfo($file, PATHINFO_FILENAME); + $type = pathinfo($file, PATHINFO_EXTENSION); + + $images[] = [ + 'name' => $name, + 'type' => $type, + 'contents' => 'data:image/' . $type . ';base64,' . base64_encode($this->storage->get($file)), + ]; } } diff --git a/composer.lock b/composer.lock index 6f0e730..a57d33e 100644 --- a/composer.lock +++ b/composer.lock @@ -4,7 +4,7 @@ "Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies", "This file is @generated automatically" ], - "content-hash": "c8ed2965a1b6b6e180ee0bf935ffbb26", + "content-hash": "1ab6fcba1c6e8e05f5f853dfcb39019b", "packages": [ { "name": "cebe/markdown", @@ -958,95 +958,6 @@ ], "time": "2020-05-18T15:13:39+00:00" }, - { - "name": "league/html-to-markdown", - "version": "5.0.0", - "source": { - "type": "git", - "url": "https://github.com/thephpleague/html-to-markdown.git", - "reference": "c4dbebbebe0fe454b6b38e6c683a977615bd7dc2" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/thephpleague/html-to-markdown/zipball/c4dbebbebe0fe454b6b38e6c683a977615bd7dc2", - "reference": "c4dbebbebe0fe454b6b38e6c683a977615bd7dc2", - "shasum": "" - }, - "require": { - "ext-dom": "*", - "ext-xml": "*", - "php": "^7.2.5 || ^8.0" - }, - "require-dev": { - "mikehaertl/php-shellcommand": "^1.1.0", - "phpstan/phpstan": "^0.12.82", - "phpunit/phpunit": "^8.5 || ^9.2", - "scrutinizer/ocular": "^1.6", - "unleashedtech/php-coding-standard": "^2.7", - "vimeo/psalm": "^4.6" - }, - "bin": [ - "bin/html-to-markdown" - ], - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "5.1-dev" - } - }, - "autoload": { - "psr-4": { - "League\\HTMLToMarkdown\\": "src/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "MIT" - ], - "authors": [ - { - "name": "Colin O'Dell", - "email": "colinodell@gmail.com", - "homepage": "https://www.colinodell.com", - "role": "Lead Developer" - }, - { - "name": "Nick Cernis", - "email": "nick@cern.is", - "homepage": "http://modernnerd.net", - "role": "Original Author" - } - ], - "description": "An HTML-to-markdown conversion helper for PHP", - "homepage": "https://github.com/thephpleague/html-to-markdown", - "keywords": [ - "html", - "markdown" - ], - "support": { - "issues": "https://github.com/thephpleague/html-to-markdown/issues", - "source": "https://github.com/thephpleague/html-to-markdown/tree/5.0.0" - }, - "funding": [ - { - "url": "https://www.colinodell.com/sponsor", - "type": "custom" - }, - { - "url": "https://www.paypal.me/colinpodell/10.00", - "type": "custom" - }, - { - "url": "https://github.com/colinodell", - "type": "github" - }, - { - "url": "https://www.patreon.com/colinodell", - "type": "patreon" - } - ], - "time": "2021-03-29T01:29:08+00:00" - }, { "name": "monolog/monolog", "version": "2.1.0", @@ -1359,207 +1270,6 @@ ], "time": "2018-07-02T15:55:56+00:00" }, - { - "name": "pclzip/pclzip", - "version": "2.8.2", - "source": { - "type": "git", - "url": "https://github.com/ivanlanin/pclzip.git", - "reference": "19dd1de9d3f5fc4d7d70175b4c344dee329f45fd" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/ivanlanin/pclzip/zipball/19dd1de9d3f5fc4d7d70175b4c344dee329f45fd", - "reference": "19dd1de9d3f5fc4d7d70175b4c344dee329f45fd", - "shasum": "" - }, - "type": "library", - "autoload": { - "classmap": [ - "pclzip.lib.php" - ] - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "LGPL-2.1" - ], - "authors": [ - { - "name": "Vincent Blavet" - } - ], - "description": "A PHP library that offers compression and extraction functions for Zip formatted archives", - "homepage": "http://www.phpconcept.net/pclzip", - "keywords": [ - "php", - "zip" - ], - "time": "2014-06-05T11:42:24+00:00" - }, - { - "name": "phpoffice/common", - "version": "0.2.9", - "source": { - "type": "git", - "url": "https://github.com/PHPOffice/Common.git", - "reference": "edb5d32b1e3400a35a5c91e2539ed6f6ce925e4d" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/PHPOffice/Common/zipball/edb5d32b1e3400a35a5c91e2539ed6f6ce925e4d", - "reference": "edb5d32b1e3400a35a5c91e2539ed6f6ce925e4d", - "shasum": "" - }, - "require": { - "pclzip/pclzip": "^2.8", - "php": ">=5.3.0" - }, - "require-dev": { - "phpdocumentor/phpdocumentor": "2.*", - "phploc/phploc": "2.*", - "phpmd/phpmd": "2.*", - "phpunit/phpunit": "^4.8.36 || ^7.0", - "sebastian/phpcpd": "2.*", - "squizlabs/php_codesniffer": "2.*" - }, - "type": "library", - "autoload": { - "psr-4": { - "PhpOffice\\Common\\": "src/Common/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "LGPL" - ], - "authors": [ - { - "name": "Mark Baker" - }, - { - "name": "Franck Lefevre", - "homepage": "http://rootslabs.net" - } - ], - "description": "PHPOffice Common", - "homepage": "http://phpoffice.github.io", - "keywords": [ - "common", - "component", - "office", - "php" - ], - "time": "2018-07-13T14:12:34+00:00" - }, - { - "name": "phpoffice/phpword", - "version": "0.17.0", - "source": { - "type": "git", - "url": "https://github.com/PHPOffice/PHPWord.git", - "reference": "b8346af548d399acd9e30fc76ab0c55c2fec03a5" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/PHPOffice/PHPWord/zipball/b8346af548d399acd9e30fc76ab0c55c2fec03a5", - "reference": "b8346af548d399acd9e30fc76ab0c55c2fec03a5", - "shasum": "" - }, - "require": { - "ext-xml": "*", - "php": "^5.3.3 || ^7.0", - "phpoffice/common": "^0.2.9", - "zendframework/zend-escaper": "^2.2" - }, - "require-dev": { - "dompdf/dompdf": "0.8.*", - "ext-gd": "*", - "ext-zip": "*", - "friendsofphp/php-cs-fixer": "^2.2", - "mpdf/mpdf": "5.7.4 || 6.* || 7.*", - "php-coveralls/php-coveralls": "1.1.0 || ^2.0", - "phploc/phploc": "2.* || 3.* || 4.*", - "phpmd/phpmd": "2.*", - "phpunit/phpunit": "^4.8.36 || ^7.0", - "squizlabs/php_codesniffer": "^2.9", - "tecnickcom/tcpdf": "6.*" - }, - "suggest": { - "dompdf/dompdf": "Allows writing PDF", - "ext-gd2": "Allows adding images", - "ext-xmlwriter": "Allows writing OOXML and ODF", - "ext-xsl": "Allows applying XSL style sheet to headers, to main document part, and to footers of an OOXML template", - "ext-zip": "Allows writing OOXML and ODF" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-develop": "0.18-dev" - } - }, - "autoload": { - "psr-4": { - "PhpOffice\\PhpWord\\": "src/PhpWord" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "LGPL-3.0" - ], - "authors": [ - { - "name": "Mark Baker" - }, - { - "name": "Gabriel Bull", - "email": "me@gabrielbull.com", - "homepage": "http://gabrielbull.com/" - }, - { - "name": "Franck Lefevre", - "homepage": "https://rootslabs.net/blog/" - }, - { - "name": "Ivan Lanin", - "homepage": "http://ivan.lanin.org" - }, - { - "name": "Roman Syroeshko", - "homepage": "http://ru.linkedin.com/pub/roman-syroeshko/34/a53/994/" - }, - { - "name": "Antoine de Troostembergh" - } - ], - "description": "PHPWord - A pure PHP library for reading and writing word processing documents (OOXML, ODF, RTF, HTML, PDF)", - "homepage": "http://phpoffice.github.io", - "keywords": [ - "ISO IEC 29500", - "OOXML", - "Office Open XML", - "OpenDocument", - "OpenXML", - "PhpOffice", - "PhpWord", - "Rich Text Format", - "WordprocessingML", - "doc", - "docx", - "html", - "odf", - "odt", - "office", - "pdf", - "php", - "reader", - "rtf", - "template", - "template processor", - "word", - "writer" - ], - "time": "2019-10-01T20:43:33+00:00" - }, { "name": "phpoption/phpoption", "version": "1.7.4", @@ -3876,52 +3586,6 @@ "environment" ], "time": "2020-06-02T14:08:54+00:00" - }, - { - "name": "zendframework/zend-escaper", - "version": "2.6.1", - "source": { - "type": "git", - "url": "https://github.com/zendframework/zend-escaper.git", - "reference": "3801caa21b0ca6aca57fa1c42b08d35c395ebd5f" - }, - "dist": { - "type": "zip", - "url": "https://api.github.com/repos/zendframework/zend-escaper/zipball/3801caa21b0ca6aca57fa1c42b08d35c395ebd5f", - "reference": "3801caa21b0ca6aca57fa1c42b08d35c395ebd5f", - "shasum": "" - }, - "require": { - "php": "^5.6 || ^7.0" - }, - "require-dev": { - "phpunit/phpunit": "^5.7.27 || ^6.5.8 || ^7.1.2", - "zendframework/zend-coding-standard": "~1.0.0" - }, - "type": "library", - "extra": { - "branch-alias": { - "dev-master": "2.6.x-dev", - "dev-develop": "2.7.x-dev" - } - }, - "autoload": { - "psr-4": { - "Zend\\Escaper\\": "src/" - } - }, - "notification-url": "https://packagist.org/downloads/", - "license": [ - "BSD-3-Clause" - ], - "description": "Securely and safely escape HTML, HTML attributes, JavaScript, CSS, and URLs", - "keywords": [ - "ZendFramework", - "escaper", - "zf" - ], - "abandoned": "laminas/laminas-escaper", - "time": "2019-09-05T20:03:20+00:00" } ], "packages-dev": [ @@ -6007,8 +5671,9 @@ "prefer-stable": true, "prefer-lowest": false, "platform": { - "php": "^7.2" + "php": "^7.2", + "ext-json": "*" }, "platform-dev": [], - "plugin-api-version": "2.0.0" + "plugin-api-version": "2.2.0" }