Browse Source

Send document content as JSON.

master
Alex Puiu 2 years ago
parent
commit
fc45e0d695
  1. 2
      app/Http/Controllers/IngestController.php
  2. 64
      app/Ingest/PDFConvertor.php
  3. 23
      app/Jobs/IngestDocuments.php
  4. 28
      app/Jobs/SendToCore.php
  5. 343
      composer.lock

2
app/Http/Controllers/IngestController.php

@ -10,7 +10,7 @@ class IngestController extends Controller
{
request()->validate([
'id' => 'required',
'file_result_type' => 'required|in:md,original',
'file_result_type' => 'required|in:md,original,json',
'mime_type' => 'required',
'document' => 'required|file',
]);

64
app/Ingest/PDFConvertor.php

@ -17,7 +17,7 @@ class PDFConvertor extends AbstractConvertor
throw new \Exception('Cannot get pdf file contents.');
}
$this->storage->put("$this->directoryPath/document.md", $contents);
$this->storage->put("$this->directoryPath/document.json", json_encode($contents));
}
protected function getFileContents()
@ -87,6 +87,8 @@ class PDFConvertor extends AbstractConvertor
$mdContents = '';
$htmlContents = '';
$json = [];
$i = 0;
try {
foreach ($orderedList as $page) {
@ -103,9 +105,10 @@ class PDFConvertor extends AbstractConvertor
$basePath = $this->storage->path('');
$imageFilePath = str_replace($basePath, '', $p['src']);
$textContents = $this->applyOCR($imageFilePath);
// $textContents = $this->applyOCR($imageFilePath); @@ uncomment
$textContents = null; // remove this
if ($textContents) {
if (false) {
$imageInFooter = true;
if ($html) {
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
@ -126,14 +129,17 @@ class PDFConvertor extends AbstractConvertor
$imagesCount += 1;
$caption = "Fig. $imagesCount";
$imageHTML = $this->handleImage($p, $caption);
$imageJSON = $this->handleImage($p, $caption);
if (!$imageInFooter) {
$html = $html . $imageHTML;
$json[$i]['tag'] = 'img';
$json[$i]['src'] = $imageJSON['src'];
$json[$i]['style'] = $imageJSON['style'];
$json[$i]['details'] = $caption;
} else {
$html = $html . "<p> $caption </p>";
$footerImages[] = $imageHTML;
$footerImages[] = $imageJSON;
}
}
}
@ -148,24 +154,20 @@ class PDFConvertor extends AbstractConvertor
$addition = '· ';
}
$continuousP = $continuousP . $this->handleText($p, $fonts, $addition, $firstOfText);
$continuousP = $this->handleText($p, $fonts, $addition, $firstOfText, true);
if($firstOfText) {
$json[$i]['tag'] = $continuousP['tag'];
$json[$i]['style'] = $continuousP['style'];
}
(isset($json[$i]['content'])) ? $json[$i]['content'] = $json[$i]['content'] . $continuousP['content'] : $json[$i]['content'] = $continuousP['content'];
$firstOfText = false;
$hasText = true;
}
}
$html = $html . '<p>' . $continuousP . '</p>';
}
if (!empty($footerImages)) {
foreach ($footerImages as $footerImage) {
$html = $html . '<p>' . $footerImage . '</p>';
}
$i++;
}
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
$htmlContents = $htmlContents . "<html><head></head><body>$html</body></html>";
}
} catch (\Exception $exception) {
$this->storage->deleteDirectory($this->directoryPath);
@ -183,21 +185,15 @@ class PDFConvertor extends AbstractConvertor
$this->storage->delete($xmlFilePath);
}
return $mdContents;
return $json;
}
protected function handleImage($p, $caption)
protected function handleImage($p)
{
$html = '';
$src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME);
$html = $html . '<br>';
$html = $html . '<img width=' . $p['width'] . ' ' . 'height=' . $p['height'] . ' src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">';
$html = $html . '<br>';
$html = $html . '<br>';
return $html;
return [
'src' => pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME),
'style' => 'width="' . $p['width'] . 'px" ' . 'height="' . $p['height'] . 'px"'
];
}
protected function handleText($p, $fonts, $addition = null, $firstOfText = false)
@ -232,7 +228,11 @@ class PDFConvertor extends AbstractConvertor
$tag = $this->getTag($font_size);
return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';
return [
'tag' => $tag,
'content' => (string) $content,
'style' => $style
];
}
protected function getTag($size)

23
app/Jobs/IngestDocuments.php

@ -83,13 +83,10 @@ class IngestDocuments implements ShouldQueue
protected function execute()
{
if ($this->fileResultType === 'md') {
$this->convertToMD();
return;
if ($this->fileResultType === 'json') {
$this->convertToJSON();
}
$this->convertToJsonData();
return;
}
/**
@ -97,25 +94,13 @@ class IngestDocuments implements ShouldQueue
*
* @throws \Exception
*/
protected function convertToMD()
protected function convertToJSON()
{
$convertor = new Convertor($this->path, $this->type);
$convertor->execute();
}
/**
* Convert document to JSON data file.
*
* @throws \Exception
*/
protected function convertToJsonData()
{
$convertor = new DataJsonConvertor($this->path, $this->type);
$convertor->execute();
}
public function failed()
{
if ( ! $this->storage) {

28
app/Jobs/SendToCore.php

@ -132,21 +132,19 @@ class SendToCore implements ShouldQueue
$images = [];
if ($extension === 'md') {
$allFiles = $this->storage->allFiles($this->directoryPath);
foreach ($allFiles as $file) {
// @TODO We are using this check in the 'PDFConvertor' file, refactor and improve.
if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
$name = pathinfo($file, PATHINFO_FILENAME);
$type = pathinfo($file, PATHINFO_EXTENSION);
$images[] = [
'name' => $name,
'type' => $type,
'contents' => 'data:image/' . $type . ';base64,' . base64_encode($this->storage->get($file)),
];
}
$allFiles = $this->storage->allFiles($this->directoryPath);
foreach ($allFiles as $file) {
// @TODO We are using this check in the 'PDFConvertor' file, refactor and improve.
if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
$name = pathinfo($file, PATHINFO_FILENAME);
$type = pathinfo($file, PATHINFO_EXTENSION);
$images[] = [
'name' => $name,
'type' => $type,
'contents' => 'data:image/' . $type . ';base64,' . base64_encode($this->storage->get($file)),
];
}
}

343
composer.lock

@ -4,7 +4,7 @@
"Read more about it at https://getcomposer.org/doc/01-basic-usage.md#installing-dependencies",
"This file is @generated automatically"
],
"content-hash": "c8ed2965a1b6b6e180ee0bf935ffbb26",
"content-hash": "1ab6fcba1c6e8e05f5f853dfcb39019b",
"packages": [
{
"name": "cebe/markdown",
@ -958,95 +958,6 @@
],
"time": "2020-05-18T15:13:39+00:00"
},
{
"name": "league/html-to-markdown",
"version": "5.0.0",
"source": {
"type": "git",
"url": "https://github.com/thephpleague/html-to-markdown.git",
"reference": "c4dbebbebe0fe454b6b38e6c683a977615bd7dc2"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/thephpleague/html-to-markdown/zipball/c4dbebbebe0fe454b6b38e6c683a977615bd7dc2",
"reference": "c4dbebbebe0fe454b6b38e6c683a977615bd7dc2",
"shasum": ""
},
"require": {
"ext-dom": "*",
"ext-xml": "*",
"php": "^7.2.5 || ^8.0"
},
"require-dev": {
"mikehaertl/php-shellcommand": "^1.1.0",
"phpstan/phpstan": "^0.12.82",
"phpunit/phpunit": "^8.5 || ^9.2",
"scrutinizer/ocular": "^1.6",
"unleashedtech/php-coding-standard": "^2.7",
"vimeo/psalm": "^4.6"
},
"bin": [
"bin/html-to-markdown"
],
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "5.1-dev"
}
},
"autoload": {
"psr-4": {
"League\\HTMLToMarkdown\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"MIT"
],
"authors": [
{
"name": "Colin O'Dell",
"email": "colinodell@gmail.com",
"homepage": "https://www.colinodell.com",
"role": "Lead Developer"
},
{
"name": "Nick Cernis",
"email": "nick@cern.is",
"homepage": "http://modernnerd.net",
"role": "Original Author"
}
],
"description": "An HTML-to-markdown conversion helper for PHP",
"homepage": "https://github.com/thephpleague/html-to-markdown",
"keywords": [
"html",
"markdown"
],
"support": {
"issues": "https://github.com/thephpleague/html-to-markdown/issues",
"source": "https://github.com/thephpleague/html-to-markdown/tree/5.0.0"
},
"funding": [
{
"url": "https://www.colinodell.com/sponsor",
"type": "custom"
},
{
"url": "https://www.paypal.me/colinpodell/10.00",
"type": "custom"
},
{
"url": "https://github.com/colinodell",
"type": "github"
},
{
"url": "https://www.patreon.com/colinodell",
"type": "patreon"
}
],
"time": "2021-03-29T01:29:08+00:00"
},
{
"name": "monolog/monolog",
"version": "2.1.0",
@ -1359,207 +1270,6 @@
],
"time": "2018-07-02T15:55:56+00:00"
},
{
"name": "pclzip/pclzip",
"version": "2.8.2",
"source": {
"type": "git",
"url": "https://github.com/ivanlanin/pclzip.git",
"reference": "19dd1de9d3f5fc4d7d70175b4c344dee329f45fd"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/ivanlanin/pclzip/zipball/19dd1de9d3f5fc4d7d70175b4c344dee329f45fd",
"reference": "19dd1de9d3f5fc4d7d70175b4c344dee329f45fd",
"shasum": ""
},
"type": "library",
"autoload": {
"classmap": [
"pclzip.lib.php"
]
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"LGPL-2.1"
],
"authors": [
{
"name": "Vincent Blavet"
}
],
"description": "A PHP library that offers compression and extraction functions for Zip formatted archives",
"homepage": "http://www.phpconcept.net/pclzip",
"keywords": [
"php",
"zip"
],
"time": "2014-06-05T11:42:24+00:00"
},
{
"name": "phpoffice/common",
"version": "0.2.9",
"source": {
"type": "git",
"url": "https://github.com/PHPOffice/Common.git",
"reference": "edb5d32b1e3400a35a5c91e2539ed6f6ce925e4d"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/PHPOffice/Common/zipball/edb5d32b1e3400a35a5c91e2539ed6f6ce925e4d",
"reference": "edb5d32b1e3400a35a5c91e2539ed6f6ce925e4d",
"shasum": ""
},
"require": {
"pclzip/pclzip": "^2.8",
"php": ">=5.3.0"
},
"require-dev": {
"phpdocumentor/phpdocumentor": "2.*",
"phploc/phploc": "2.*",
"phpmd/phpmd": "2.*",
"phpunit/phpunit": "^4.8.36 || ^7.0",
"sebastian/phpcpd": "2.*",
"squizlabs/php_codesniffer": "2.*"
},
"type": "library",
"autoload": {
"psr-4": {
"PhpOffice\\Common\\": "src/Common/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"LGPL"
],
"authors": [
{
"name": "Mark Baker"
},
{
"name": "Franck Lefevre",
"homepage": "http://rootslabs.net"
}
],
"description": "PHPOffice Common",
"homepage": "http://phpoffice.github.io",
"keywords": [
"common",
"component",
"office",
"php"
],
"time": "2018-07-13T14:12:34+00:00"
},
{
"name": "phpoffice/phpword",
"version": "0.17.0",
"source": {
"type": "git",
"url": "https://github.com/PHPOffice/PHPWord.git",
"reference": "b8346af548d399acd9e30fc76ab0c55c2fec03a5"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/PHPOffice/PHPWord/zipball/b8346af548d399acd9e30fc76ab0c55c2fec03a5",
"reference": "b8346af548d399acd9e30fc76ab0c55c2fec03a5",
"shasum": ""
},
"require": {
"ext-xml": "*",
"php": "^5.3.3 || ^7.0",
"phpoffice/common": "^0.2.9",
"zendframework/zend-escaper": "^2.2"
},
"require-dev": {
"dompdf/dompdf": "0.8.*",
"ext-gd": "*",
"ext-zip": "*",
"friendsofphp/php-cs-fixer": "^2.2",
"mpdf/mpdf": "5.7.4 || 6.* || 7.*",
"php-coveralls/php-coveralls": "1.1.0 || ^2.0",
"phploc/phploc": "2.* || 3.* || 4.*",
"phpmd/phpmd": "2.*",
"phpunit/phpunit": "^4.8.36 || ^7.0",
"squizlabs/php_codesniffer": "^2.9",
"tecnickcom/tcpdf": "6.*"
},
"suggest": {
"dompdf/dompdf": "Allows writing PDF",
"ext-gd2": "Allows adding images",
"ext-xmlwriter": "Allows writing OOXML and ODF",
"ext-xsl": "Allows applying XSL style sheet to headers, to main document part, and to footers of an OOXML template",
"ext-zip": "Allows writing OOXML and ODF"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-develop": "0.18-dev"
}
},
"autoload": {
"psr-4": {
"PhpOffice\\PhpWord\\": "src/PhpWord"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"LGPL-3.0"
],
"authors": [
{
"name": "Mark Baker"
},
{
"name": "Gabriel Bull",
"email": "me@gabrielbull.com",
"homepage": "http://gabrielbull.com/"
},
{
"name": "Franck Lefevre",
"homepage": "https://rootslabs.net/blog/"
},
{
"name": "Ivan Lanin",
"homepage": "http://ivan.lanin.org"
},
{
"name": "Roman Syroeshko",
"homepage": "http://ru.linkedin.com/pub/roman-syroeshko/34/a53/994/"
},
{
"name": "Antoine de Troostembergh"
}
],
"description": "PHPWord - A pure PHP library for reading and writing word processing documents (OOXML, ODF, RTF, HTML, PDF)",
"homepage": "http://phpoffice.github.io",
"keywords": [
"ISO IEC 29500",
"OOXML",
"Office Open XML",
"OpenDocument",
"OpenXML",
"PhpOffice",
"PhpWord",
"Rich Text Format",
"WordprocessingML",
"doc",
"docx",
"html",
"odf",
"odt",
"office",
"pdf",
"php",
"reader",
"rtf",
"template",
"template processor",
"word",
"writer"
],
"time": "2019-10-01T20:43:33+00:00"
},
{
"name": "phpoption/phpoption",
"version": "1.7.4",
@ -3876,52 +3586,6 @@
"environment"
],
"time": "2020-06-02T14:08:54+00:00"
},
{
"name": "zendframework/zend-escaper",
"version": "2.6.1",
"source": {
"type": "git",
"url": "https://github.com/zendframework/zend-escaper.git",
"reference": "3801caa21b0ca6aca57fa1c42b08d35c395ebd5f"
},
"dist": {
"type": "zip",
"url": "https://api.github.com/repos/zendframework/zend-escaper/zipball/3801caa21b0ca6aca57fa1c42b08d35c395ebd5f",
"reference": "3801caa21b0ca6aca57fa1c42b08d35c395ebd5f",
"shasum": ""
},
"require": {
"php": "^5.6 || ^7.0"
},
"require-dev": {
"phpunit/phpunit": "^5.7.27 || ^6.5.8 || ^7.1.2",
"zendframework/zend-coding-standard": "~1.0.0"
},
"type": "library",
"extra": {
"branch-alias": {
"dev-master": "2.6.x-dev",
"dev-develop": "2.7.x-dev"
}
},
"autoload": {
"psr-4": {
"Zend\\Escaper\\": "src/"
}
},
"notification-url": "https://packagist.org/downloads/",
"license": [
"BSD-3-Clause"
],
"description": "Securely and safely escape HTML, HTML attributes, JavaScript, CSS, and URLs",
"keywords": [
"ZendFramework",
"escaper",
"zf"
],
"abandoned": "laminas/laminas-escaper",
"time": "2019-09-05T20:03:20+00:00"
}
],
"packages-dev": [
@ -6007,8 +5671,9 @@
"prefer-stable": true,
"prefer-lowest": false,
"platform": {
"php": "^7.2"
"php": "^7.2",
"ext-json": "*"
},
"platform-dev": [],
"plugin-api-version": "2.0.0"
"plugin-api-version": "2.2.0"
}
Loading…
Cancel
Save