diff --git a/app/Http/Controllers/IngestController.php b/app/Http/Controllers/IngestController.php
index ca2200b..08fa795 100644
--- a/app/Http/Controllers/IngestController.php
+++ b/app/Http/Controllers/IngestController.php
@@ -10,7 +10,7 @@ class IngestController extends Controller
{
request()->validate([
'id' => 'required',
- 'file_result_type' => 'required|in:md,original',
+ 'file_result_type' => 'required|in:md,original,json',
'mime_type' => 'required',
'document' => 'required|file',
]);
diff --git a/app/Ingest/PDFConvertor.php b/app/Ingest/PDFConvertor.php
index 2f1865c..29a266f 100644
--- a/app/Ingest/PDFConvertor.php
+++ b/app/Ingest/PDFConvertor.php
@@ -17,7 +17,7 @@ class PDFConvertor extends AbstractConvertor
throw new \Exception('Cannot get pdf file contents.');
}
- $this->storage->put("$this->directoryPath/document.md", $contents);
+ $this->storage->put("$this->directoryPath/document.json", json_encode($contents));
}
protected function getFileContents()
@@ -87,6 +87,8 @@ class PDFConvertor extends AbstractConvertor
$mdContents = '';
$htmlContents = '';
+ $json = [];
+ $i = 0;
try {
foreach ($orderedList as $page) {
@@ -103,9 +105,10 @@ class PDFConvertor extends AbstractConvertor
$basePath = $this->storage->path('');
$imageFilePath = str_replace($basePath, '', $p['src']);
- $textContents = $this->applyOCR($imageFilePath);
+ // $textContents = $this->applyOCR($imageFilePath); @@ uncomment
+ $textContents = null; // remove this
- if ($textContents) {
+ if (false) {
$imageInFooter = true;
if ($html) {
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
@@ -126,14 +129,17 @@ class PDFConvertor extends AbstractConvertor
$imagesCount += 1;
$caption = "Fig. $imagesCount";
- $imageHTML = $this->handleImage($p, $caption);
+ $imageJSON = $this->handleImage($p, $caption);
if (!$imageInFooter) {
- $html = $html . $imageHTML;
+ $json[$i]['tag'] = 'img';
+ $json[$i]['src'] = $imageJSON['src'];
+ $json[$i]['style'] = $imageJSON['style'];
+ $json[$i]['details'] = $caption;
} else {
$html = $html . "
$caption
";
- $footerImages[] = $imageHTML;
+ $footerImages[] = $imageJSON;
}
}
}
@@ -148,24 +154,20 @@ class PDFConvertor extends AbstractConvertor
$addition = 'ยท ';
}
- $continuousP = $continuousP . $this->handleText($p, $fonts, $addition, $firstOfText);
-
+ $continuousP = $this->handleText($p, $fonts, $addition, $firstOfText, true);
+ if($firstOfText) {
+ $json[$i]['tag'] = $continuousP['tag'];
+ $json[$i]['style'] = $continuousP['style'];
+ }
+
+ (isset($json[$i]['content'])) ? $json[$i]['content'] = $json[$i]['content'] . $continuousP['content'] : $json[$i]['content'] = $continuousP['content'];
+
$firstOfText = false;
$hasText = true;
}
}
-
- $html = $html . '' . $continuousP . '
';
- }
-
- if (!empty($footerImages)) {
- foreach ($footerImages as $footerImage) {
- $html = $html . '' . $footerImage . '
';
- }
+ $i++;
}
-
- $mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
- $htmlContents = $htmlContents . "$html";
}
} catch (\Exception $exception) {
$this->storage->deleteDirectory($this->directoryPath);
@@ -183,21 +185,15 @@ class PDFConvertor extends AbstractConvertor
$this->storage->delete($xmlFilePath);
}
- return $mdContents;
+ return $json;
}
- protected function handleImage($p, $caption)
+ protected function handleImage($p)
{
- $html = '';
-
- $src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME);
-
- $html = $html . '
';
- $html = $html . '';
- $html = $html . '
';
- $html = $html . '
';
-
- return $html;
+ return [
+ 'src' => pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME),
+ 'style' => 'width="' . $p['width'] . 'px" ' . 'height="' . $p['height'] . 'px"'
+ ];
}
protected function handleText($p, $fonts, $addition = null, $firstOfText = false)
@@ -232,7 +228,11 @@ class PDFConvertor extends AbstractConvertor
$tag = $this->getTag($font_size);
- return '<' . $tag . ' style="' . $style . '">' . $content . '' . $tag . '>';
+ return [
+ 'tag' => $tag,
+ 'content' => (string) $content,
+ 'style' => $style
+ ];
}
protected function getTag($size)
diff --git a/app/Jobs/IngestDocuments.php b/app/Jobs/IngestDocuments.php
index 0e80904..f0a9ac0 100644
--- a/app/Jobs/IngestDocuments.php
+++ b/app/Jobs/IngestDocuments.php
@@ -83,13 +83,10 @@ class IngestDocuments implements ShouldQueue
protected function execute()
{
- if ($this->fileResultType === 'md') {
- $this->convertToMD();
-
- return;
+ if ($this->fileResultType === 'json') {
+ $this->convertToJSON();
}
-
- $this->convertToJsonData();
+ return;
}
/**
@@ -97,25 +94,13 @@ class IngestDocuments implements ShouldQueue
*
* @throws \Exception
*/
- protected function convertToMD()
+ protected function convertToJSON()
{
$convertor = new Convertor($this->path, $this->type);
$convertor->execute();
}
- /**
- * Convert document to JSON data file.
- *
- * @throws \Exception
- */
- protected function convertToJsonData()
- {
- $convertor = new DataJsonConvertor($this->path, $this->type);
-
- $convertor->execute();
- }
-
public function failed()
{
if ( ! $this->storage) {
diff --git a/app/Jobs/SendToCore.php b/app/Jobs/SendToCore.php
index 25a83c7..05d350d 100644
--- a/app/Jobs/SendToCore.php
+++ b/app/Jobs/SendToCore.php
@@ -132,21 +132,19 @@ class SendToCore implements ShouldQueue
$images = [];
- if ($extension === 'md') {
- $allFiles = $this->storage->allFiles($this->directoryPath);
-
- foreach ($allFiles as $file) {
- // @TODO We are using this check in the 'PDFConvertor' file, refactor and improve.
- if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
- $name = pathinfo($file, PATHINFO_FILENAME);
- $type = pathinfo($file, PATHINFO_EXTENSION);
-
- $images[] = [
- 'name' => $name,
- 'type' => $type,
- 'contents' => 'data:image/' . $type . ';base64,' . base64_encode($this->storage->get($file)),
- ];
- }
+ $allFiles = $this->storage->allFiles($this->directoryPath);
+
+ foreach ($allFiles as $file) {
+ // @TODO We are using this check in the 'PDFConvertor' file, refactor and improve.
+ if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
+ $name = pathinfo($file, PATHINFO_FILENAME);
+ $type = pathinfo($file, PATHINFO_EXTENSION);
+
+ $images[] = [
+ 'name' => $name,
+ 'type' => $type,
+ 'contents' => 'data:image/' . $type . ';base64,' . base64_encode($this->storage->get($file)),
+ ];
}
}
diff --git a/composer.lock b/composer.lock
index 6f0e730..a57d33e 100644
--- a/composer.lock
+++ b/composer.lock
@@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
- "content-hash": "c8ed2965a1b6b6e180ee0bf935ffbb26",
+ "content-hash": "1ab6fcba1c6e8e05f5f853dfcb39019b",
"packages": [
{
"name": "cebe/markdown",
@@ -958,95 +958,6 @@
],
"time": "2020-05-18T15:13:39+00:00"
},
- {
- "name": "league/html-to-markdown",
- "version": "5.0.0",
- "source": {
- "type": "git",
- "url": "https://github.com/thephpleague/html-to-markdown.git",
- "reference": "c4dbebbebe0fe454b6b38e6c683a977615bd7dc2"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/thephpleague/html-to-markdown/zipball/c4dbebbebe0fe454b6b38e6c683a977615bd7dc2",
- "reference": "c4dbebbebe0fe454b6b38e6c683a977615bd7dc2",
- "shasum": ""
- },
- "require": {
- "ext-dom": "*",
- "ext-xml": "*",
- "php": "^7.2.5 || ^8.0"
- },
- "require-dev": {
- "mikehaertl/php-shellcommand": "^1.1.0",
- "phpstan/phpstan": "^0.12.82",
- "phpunit/phpunit": "^8.5 || ^9.2",
- "scrutinizer/ocular": "^1.6",
- "unleashedtech/php-coding-standard": "^2.7",
- "vimeo/psalm": "^4.6"
- },
- "bin": [
- "bin/html-to-markdown"
- ],
- "type": "library",
- "extra": {
- "branch-alias": {
- "dev-master": "5.1-dev"
- }
- },
- "autoload": {
- "psr-4": {
- "League\\HTMLToMarkdown\\": "src/"
- }
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "MIT"
- ],
- "authors": [
- {
- "name": "Colin O'Dell",
- "email": "colinodell@gmail.com",
- "homepage": "https://www.colinodell.com",
- "role": "Lead Developer"
- },
- {
- "name": "Nick Cernis",
- "email": "nick@cern.is",
- "homepage": "http://modernnerd.net",
- "role": "Original Author"
- }
- ],
- "description": "An HTML-to-markdown conversion helper for PHP",
- "homepage": "https://github.com/thephpleague/html-to-markdown",
- "keywords": [
- "html",
- "markdown"
- ],
- "support": {
- "issues": "https://github.com/thephpleague/html-to-markdown/issues",
- "source": "https://github.com/thephpleague/html-to-markdown/tree/5.0.0"
- },
- "funding": [
- {
- "url": "https://www.colinodell.com/sponsor",
- "type": "custom"
- },
- {
- "url": "https://www.paypal.me/colinpodell/10.00",
- "type": "custom"
- },
- {
- "url": "https://github.com/colinodell",
- "type": "github"
- },
- {
- "url": "https://www.patreon.com/colinodell",
- "type": "patreon"
- }
- ],
- "time": "2021-03-29T01:29:08+00:00"
- },
{
"name": "monolog/monolog",
"version": "2.1.0",
@@ -1359,207 +1270,6 @@
],
"time": "2018-07-02T15:55:56+00:00"
},
- {
- "name": "pclzip/pclzip",
- "version": "2.8.2",
- "source": {
- "type": "git",
- "url": "https://github.com/ivanlanin/pclzip.git",
- "reference": "19dd1de9d3f5fc4d7d70175b4c344dee329f45fd"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/ivanlanin/pclzip/zipball/19dd1de9d3f5fc4d7d70175b4c344dee329f45fd",
- "reference": "19dd1de9d3f5fc4d7d70175b4c344dee329f45fd",
- "shasum": ""
- },
- "type": "library",
- "autoload": {
- "classmap": [
- "pclzip.lib.php"
- ]
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "LGPL-2.1"
- ],
- "authors": [
- {
- "name": "Vincent Blavet"
- }
- ],
- "description": "A PHP library that offers compression and extraction functions for Zip formatted archives",
- "homepage": "http://www.phpconcept.net/pclzip",
- "keywords": [
- "php",
- "zip"
- ],
- "time": "2014-06-05T11:42:24+00:00"
- },
- {
- "name": "phpoffice/common",
- "version": "0.2.9",
- "source": {
- "type": "git",
- "url": "https://github.com/PHPOffice/Common.git",
- "reference": "edb5d32b1e3400a35a5c91e2539ed6f6ce925e4d"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/PHPOffice/Common/zipball/edb5d32b1e3400a35a5c91e2539ed6f6ce925e4d",
- "reference": "edb5d32b1e3400a35a5c91e2539ed6f6ce925e4d",
- "shasum": ""
- },
- "require": {
- "pclzip/pclzip": "^2.8",
- "php": ">=5.3.0"
- },
- "require-dev": {
- "phpdocumentor/phpdocumentor": "2.*",
- "phploc/phploc": "2.*",
- "phpmd/phpmd": "2.*",
- "phpunit/phpunit": "^4.8.36 || ^7.0",
- "sebastian/phpcpd": "2.*",
- "squizlabs/php_codesniffer": "2.*"
- },
- "type": "library",
- "autoload": {
- "psr-4": {
- "PhpOffice\\Common\\": "src/Common/"
- }
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "LGPL"
- ],
- "authors": [
- {
- "name": "Mark Baker"
- },
- {
- "name": "Franck Lefevre",
- "homepage": "http://rootslabs.net"
- }
- ],
- "description": "PHPOffice Common",
- "homepage": "http://phpoffice.github.io",
- "keywords": [
- "common",
- "component",
- "office",
- "php"
- ],
- "time": "2018-07-13T14:12:34+00:00"
- },
- {
- "name": "phpoffice/phpword",
- "version": "0.17.0",
- "source": {
- "type": "git",
- "url": "https://github.com/PHPOffice/PHPWord.git",
- "reference": "b8346af548d399acd9e30fc76ab0c55c2fec03a5"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/PHPOffice/PHPWord/zipball/b8346af548d399acd9e30fc76ab0c55c2fec03a5",
- "reference": "b8346af548d399acd9e30fc76ab0c55c2fec03a5",
- "shasum": ""
- },
- "require": {
- "ext-xml": "*",
- "php": "^5.3.3 || ^7.0",
- "phpoffice/common": "^0.2.9",
- "zendframework/zend-escaper": "^2.2"
- },
- "require-dev": {
- "dompdf/dompdf": "0.8.*",
- "ext-gd": "*",
- "ext-zip": "*",
- "friendsofphp/php-cs-fixer": "^2.2",
- "mpdf/mpdf": "5.7.4 || 6.* || 7.*",
- "php-coveralls/php-coveralls": "1.1.0 || ^2.0",
- "phploc/phploc": "2.* || 3.* || 4.*",
- "phpmd/phpmd": "2.*",
- "phpunit/phpunit": "^4.8.36 || ^7.0",
- "squizlabs/php_codesniffer": "^2.9",
- "tecnickcom/tcpdf": "6.*"
- },
- "suggest": {
- "dompdf/dompdf": "Allows writing PDF",
- "ext-gd2": "Allows adding images",
- "ext-xmlwriter": "Allows writing OOXML and ODF",
- "ext-xsl": "Allows applying XSL style sheet to headers, to main document part, and to footers of an OOXML template",
- "ext-zip": "Allows writing OOXML and ODF"
- },
- "type": "library",
- "extra": {
- "branch-alias": {
- "dev-develop": "0.18-dev"
- }
- },
- "autoload": {
- "psr-4": {
- "PhpOffice\\PhpWord\\": "src/PhpWord"
- }
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "LGPL-3.0"
- ],
- "authors": [
- {
- "name": "Mark Baker"
- },
- {
- "name": "Gabriel Bull",
- "email": "me@gabrielbull.com",
- "homepage": "http://gabrielbull.com/"
- },
- {
- "name": "Franck Lefevre",
- "homepage": "https://rootslabs.net/blog/"
- },
- {
- "name": "Ivan Lanin",
- "homepage": "http://ivan.lanin.org"
- },
- {
- "name": "Roman Syroeshko",
- "homepage": "http://ru.linkedin.com/pub/roman-syroeshko/34/a53/994/"
- },
- {
- "name": "Antoine de Troostembergh"
- }
- ],
- "description": "PHPWord - A pure PHP library for reading and writing word processing documents (OOXML, ODF, RTF, HTML, PDF)",
- "homepage": "http://phpoffice.github.io",
- "keywords": [
- "ISO IEC 29500",
- "OOXML",
- "Office Open XML",
- "OpenDocument",
- "OpenXML",
- "PhpOffice",
- "PhpWord",
- "Rich Text Format",
- "WordprocessingML",
- "doc",
- "docx",
- "html",
- "odf",
- "odt",
- "office",
- "pdf",
- "php",
- "reader",
- "rtf",
- "template",
- "template processor",
- "word",
- "writer"
- ],
- "time": "2019-10-01T20:43:33+00:00"
- },
{
"name": "phpoption/phpoption",
"version": "1.7.4",
@@ -3876,52 +3586,6 @@
"environment"
],
"time": "2020-06-02T14:08:54+00:00"
- },
- {
- "name": "zendframework/zend-escaper",
- "version": "2.6.1",
- "source": {
- "type": "git",
- "url": "https://github.com/zendframework/zend-escaper.git",
- "reference": "3801caa21b0ca6aca57fa1c42b08d35c395ebd5f"
- },
- "dist": {
- "type": "zip",
- "url": "https://api.github.com/repos/zendframework/zend-escaper/zipball/3801caa21b0ca6aca57fa1c42b08d35c395ebd5f",
- "reference": "3801caa21b0ca6aca57fa1c42b08d35c395ebd5f",
- "shasum": ""
- },
- "require": {
- "php": "^5.6 || ^7.0"
- },
- "require-dev": {
- "phpunit/phpunit": "^5.7.27 || ^6.5.8 || ^7.1.2",
- "zendframework/zend-coding-standard": "~1.0.0"
- },
- "type": "library",
- "extra": {
- "branch-alias": {
- "dev-master": "2.6.x-dev",
- "dev-develop": "2.7.x-dev"
- }
- },
- "autoload": {
- "psr-4": {
- "Zend\\Escaper\\": "src/"
- }
- },
- "notification-url": "https://packagist.org/downloads/",
- "license": [
- "BSD-3-Clause"
- ],
- "description": "Securely and safely escape HTML, HTML attributes, JavaScript, CSS, and URLs",
- "keywords": [
- "ZendFramework",
- "escaper",
- "zf"
- ],
- "abandoned": "laminas/laminas-escaper",
- "time": "2019-09-05T20:03:20+00:00"
}
],
"packages-dev": [
@@ -6007,8 +5671,9 @@
"prefer-stable": true,
"prefer-lowest": false,
"platform": {
- "php": "^7.2"
+ "php": "^7.2",
+ "ext-json": "*"
},
"platform-dev": [],
- "plugin-api-version": "2.0.0"
+ "plugin-api-version": "2.2.0"
}