diff --git a/README.md b/README.md
index 5d370b7..55d694f 100644
--- a/README.md
+++ b/README.md
@@ -70,6 +70,9 @@ cd Bin
# Dewarp
pip3 install opencv-python
+cd DEWARP_INSTALLATION_DIRECTORY
+pip3 install -r requirements.txt
+
# MAT2 (Metadata remover) - Not used at the moment
pip3 install mat2
apt-get install gir1.2-poppler-0.18
diff --git a/app/Ingest/AbstractConvertor.php b/app/Ingest/AbstractConvertor.php
new file mode 100644
index 0000000..bd06f20
--- /dev/null
+++ b/app/Ingest/AbstractConvertor.php
@@ -0,0 +1,24 @@
+storage = $storage;
+ $this->path = $path;
+ $this->directoryPath = pathinfo($path, PATHINFO_DIRNAME);
+ }
+
+ abstract public function execute();
+
+ /**
+  * Delete the file stored at $this->path from the storage disk.
+  *
+  * @return void
+  */
+ protected function deleteOriginalDocument()
+ {
+     $originalPath = $this->path;
+
+     $this->storage->delete($originalPath);
+ }
+}
diff --git a/app/Ingest/Convertor.php b/app/Ingest/Convertor.php
index d6bbce7..8a204af 100644
--- a/app/Ingest/Convertor.php
+++ b/app/Ingest/Convertor.php
@@ -5,7 +5,6 @@ namespace App\Ingest;
use Illuminate\Support\Facades\Storage;
use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process;
-use League\HTMLToMarkdown\HtmlConverter;
class Convertor
{
@@ -30,191 +29,18 @@ class Convertor
public function execute()
{
if ($this->type === 'txt') {
- return $this->path;
+ $convertor = new TextConvertor($this->storage, $this->path);
+ } else if ($this->type === 'pdf') {
+ $convertor = new PDFConvertor($this->storage, $this->path);
+ } else if ($this->type === 'docx') {
+ $convertor = new DocxConvertor($this->storage, $this->path);
+ } else {
+ $convertor = new OtherConvertor($this->storage, $this->path);
}
- if ($this->type === 'pdf') {
-// $this->convertPdfToText();
- $this->convertPdfToMD();
-// $this->getHtmlContentsFromPdfWithImages();
+ $convertor->execute();
- return $this->path;
- }
-
- if ($this->type !== 'docx') {
- $this->convertToDocx();
- }
-
- $this->convertDocumentToText();
//$this->convertToHtml();
-
- return $this->path;
- }
-
- /**
- * Convert doc,dot,rtf,odt,pdf,docx to docx
- *
- *
- * @return string|void
- */
- private function convertToDocx()
- {
- (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
-
- /**
- * Convert doc,dot,rtf,odt to docx
- */
- $process = new Process([
- 'soffice',
- '--headless',
- '--convert-to',
- 'docx',
- $this->storage->path($this->path),
- '--outdir',
- $this->storage->path('contracts')
- ]);
-
- $process->run();
-
- if (!$process->isSuccessful()) {
- throw new ProcessFailedException($process);
- }
-
- $this->storage->delete($this->path);
-
- $this->path = str_replace(".$this->type", '.docx', $this->path);
- }
-
- /**
- * Convert docx file to text
- *
- * @return void
- */
- private function convertDocumentToText()
- {
- (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
-
- $process = new Process([
- 'soffice',
- '--headless',
- '--convert-to',
- 'txt',
- $this->storage->path($this->path),
- '--outdir',
- $this->storage->path('contracts')
- ]);
-
- $process->run();
-
- if (!$process->isSuccessful()) {
- throw new ProcessFailedException($process);
- }
-
- $this->storage->delete($this->path);
-
- $this->path = str_replace(['.docx', '.bin'], '.txt', $this->path);
- }
-
- protected function convertPdfToText()
- {
- $this->prepareForConvertPDF();
-
- $images = $this->getImagesFromPDF();
-
- $contents = $this->getTextContentsFromPDF();
-
- if (!$contents && count($images) === 0) {
- throw new \Exception('Could not read from file.');
- }
-
- // Handle images and image contents.
- if (count($images) > 0) {
- foreach ($images as $image) {
- try {
- $ocr = new OCR($this->storage->path($image));
-
- $imageContents = $ocr->execute();
-
- $contents = $contents . "\n" . $imageContents;
- } catch (\Exception $exception) {
- \Illuminate\Support\Facades\Log::info('something wrong: ' . $exception->getMessage());
- }
- }
-
- $dir = str_replace('.pdf', '', $this->path);
-
- $this->storage->deleteDirectory($dir);
- }
-
- $this->storage->delete($this->path);
-
- $this->path = str_replace('.pdf', '.txt', $this->path);
-
- $this->storage->put($this->path, $contents);
- }
-
- protected function convertPdfToMD()
- {
-// $this->prepareForConvertPDF();
-
- $result = $this->getContentsFromPdf();
-
- if ( ! $result['has_images'] && ! $result['has_text']) {
- throw new \Exception('Cannot get pdf file contents.');
- }
-
- if ($result['has_text']) {
- if ($result['has_images']) {
- // Both text and images.
- throw new \Exception('Not supported for now.');
- }
-
- // Delete directory because the contents are in the '$result' variable.
- $this->storage->deleteDirectory($this->path);
-
- $mdContents = '';
-
- foreach ($result['htmls'] as $html) {
- $converter = new HtmlConverter();
- $converter->getConfig()->setOption('strip_tags', true);
-
- $contents = $converter->convert($html);
-
- $mdContents = $mdContents . $contents;
- }
-
- $this->path = "$this->path.md";
-
- $this->storage->put($this->path, $mdContents);
-
- return;
- }
-
- // Only contains images.
- $imagesContent = '';
- $files = $this->storage->allFiles($this->path);
-
- foreach ($files as $file) {
- // Only get the image files from the directory, it may contain some empty html files too.
- if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
- $ocr = new OCR($this->storage->path($file));
-
- $imagesContent = $imagesContent . $ocr->execute();
- }
- }
-
- \Illuminate\Support\Facades\Log::info('============================');
- \Illuminate\Support\Facades\Log::info($this->path);
-
- // We are done with the images processing, delete directory.
- $this->storage->deleteDirectory($this->path);
-
- $this->path = "$this->path.md";
-
- \Illuminate\Support\Facades\Log::info($this->path);
- \Illuminate\Support\Facades\Log::info('++++++++++++++++++++++++++');
-
- $this->storage->put($this->path, $imagesContent);
}
private function convertToHtml()
@@ -262,248 +88,4 @@ class Convertor
$this->path = str_replace(".$this->type", '.xml', $this->path);
}
-
- protected function prepareForConvertPDF()
- {
- (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
-
- $process = new Process([
- 'pip3',
- 'install',
- 'pdftotext',
- ]);
-
- $process->run();
-
- if (!$process->isSuccessful()) {
- throw new ProcessFailedException($process);
- }
- }
-
- protected function getImagesFromPDF()
- {
- $dir = str_replace('.pdf', '', $this->path);
-
- $this->storage->makeDirectory($dir);
-
- $process = new Process([
- 'pdfimages',
- '-p',
- $this->storage->path($this->path),
- '-tiff',
- $this->storage->path("$dir/ocr")
- ]);
-
- $process->run();
-
- if (!$process->isSuccessful()) {
- throw new ProcessFailedException($process);
- }
-
- return $this->storage->allFiles($dir);
- }
-
- protected function getTextContentsFromPDF()
- {
- $outputPath = $this->storage->path(str_replace('.pdf', '.txt', $this->path));
-
- $process = new Process([
- 'python3',
- storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'),
- '-i',
- $this->storage->path($this->path),
- '-o',
- $outputPath
- ]);
-
- $process->run();
-
- if (!$process->isSuccessful()) {
- throw new ProcessFailedException($process);
- }
-
- return file_get_contents($outputPath);
- }
-
- protected function getHtmlContentsFromPdfWithImages()
- {
- $dirName = str_replace('.pdf', '', $this->path);
- $this->storage->makeDirectory($dirName);
-
- $outputPath = $this->storage->path("$dirName/html");
-
- $process = new Process([
- 'pdftohtml',
- '-noframes',
- $this->storage->path($this->path),
- $outputPath
- ]);
-
- $process->run();
-
- if (!$process->isSuccessful()) {
- throw new ProcessFailedException($process);
- }
-
- $this->storage->delete($this->path);
-
- $this->path = $dirName;
-
- $converter = new HtmlConverter();
- $converter->getConfig()->setOption('strip_tags', true);
-
- $files = $this->storage->allFiles($this->path);
-
- $htmlFileIndex = null;
-
- foreach ($files as $index => $file) {
- // if (pathinfo($file, PATHINFO_BASENAME) === 'html-html.html') {
- // if (pathinfo($file, PATHINFO_EXTENSION) === 'html') {
- if (pathinfo($file, PATHINFO_BASENAME) === 'html.html') {
- $htmlFileIndex = $index;
-
- break;
- }
- }
-
- $htmlContents = $this->storage->get($files[$htmlFileIndex]);
- $contents = $converter->convert($htmlContents);
-
-// $this->storage->deleteDirectory($this->path);
-
- $this->path = "$this->path.md";
-
- $this->storage->put($this->path, $contents);
-
- dd(3);
- }
-
- protected function getContentsFromPdf()
- {
- $dirName = str_replace('.pdf', '', $this->path);
- $this->storage->makeDirectory($dirName);
-
- $outputPath = $this->storage->path("$dirName/html");
-
- $process = new Process([
- 'pdftohtml',
- '-xml',
- $this->storage->path($this->path),
- $outputPath
- ]);
-
- $process->run();
-
- if (!$process->isSuccessful()) {
- throw new ProcessFailedException($process);
- }
-
- $this->storage->delete($this->path);
-
- $this->path = $dirName;
-
- $contents = $this->storage->get("$this->path/html.xml");
-
- $xml = simplexml_load_string($contents);
-
- $fonts = [];
-
- foreach ($xml->page as $page) {
- foreach ($page as $p) {
- if ($p->getName() === 'fontspec') {
- $fonts[(int) $p['id']]['family'] = (string) $p['family'];
- $fonts[(int) $p['id']]['size'] = (string) $p['size'];
- $fonts[(int) $p['id']]['color'] = (string) $p['color'];
- }
- }
- }
-
- $htmls = [];
- $hasImages = false;
- $hasText = false;
-
- try {
- foreach ($xml->page as $page) {
- $html = '';
-
- $previousP = null;
-
- foreach ($page as $p) {
- if ($p->getName() == 'image') {
- $html = $html . '';
-
- $hasImages = true;
- }
-
- if ($p->getName() == 'text') {
- $id = (int) $p['font'];
- $font_size = $fonts[$id]['size'];
- $font_color = $fonts[$id]['color'];
- $font_family = $fonts[$id]['family'];
-
- $style = '';
- $style = $style . 'position: absolute;';
- $style = $style . "color: $font_color;";
- $style = $style . "font-family: $font_family;";
- $style = $style . "font-weight: 900;";
- $style = $style . "width: " . $p['width'] . "px;";
- $style = $style . "height: " . $p['height'] . "px;";
- $style = $style . "top: " . $p['top'] . "px;";
- $style = $style . "left: " . $p['left'] . "px;";
-
-// $style = $style . "font-size: $font_size" . "px;";
-
- if ($p->i) {
- $content = '' . $p->i . '';
- } else if ($p->b) {
- $content = '' . $p->b . '';
- } else {
- $content = $p;
- }
-
- // @TODO Must chain paragraphs if top are almost same.
-
- $tag = $this->getTag($p, $previousP, $font_size);
-
- $html = $html . '<' . $tag . ' style="' . $style . '">' . $content . '' . $tag . '>';
-
- $hasText = true;
- }
-
- $previousP = $p;
- }
-
- $htmls[] = '
' . $html . '';
- }
- } catch (\Exception $exception) {
- \Illuminate\Support\Facades\Log::info($exception->getTraceAsString());
- }
-
- return [
- 'has_images' => $hasImages,
- 'has_text' => $hasText,
- 'htmls' => $htmls,
- ];
- }
-
- protected function getTag($p, $previousP, $size)
- {
- if ($size > 24) {
- return 'h1';
- }
-
- if ($size > 18) {
- return 'h2';
- }
-
- if ($size > 16) {
- return 'h3';
- }
-
- if ($previousP && $p['top'] - $previousP['top'] <= 5) {
- return 'span';
- }
-
- return 'p';
- }
}
diff --git a/app/Ingest/DocumentHandler.php b/app/Ingest/DocumentHandler.php
index 25b4963..38e4dfe 100644
--- a/app/Ingest/DocumentHandler.php
+++ b/app/Ingest/DocumentHandler.php
@@ -53,8 +53,10 @@ class DocumentHandler
$type = $this->supportedFiles[$mimeType];
- $path = $storage->putFileAs("contracts", $file, "$this->id.$type");
+ $id = str_replace(' ', '_', $this->id);
- IngestDocuments::dispatch($path, $type);
+ $path = $storage->putFileAs("contracts/$id", $file, "document.$type");
+
+ IngestDocuments::dispatch($this->id, $path, $type);
}
}
diff --git a/app/Ingest/DocxConvertor.php b/app/Ingest/DocxConvertor.php
new file mode 100644
index 0000000..421a408
--- /dev/null
+++ b/app/Ingest/DocxConvertor.php
@@ -0,0 +1,46 @@
+convertToText();
+
+ $convertor = new TextConvertor($this->storage, "$this->directoryPath/document.txt");
+
+ $convertor->execute();
+ }
+
+ /**
+  * Convert the document at $this->path to plain text with headless soffice.
+  *
+  * The output directory handed to soffice is $this->directoryPath. Once the
+  * conversion succeeds the original document is deleted.
+  *
+  * @return void
+  *
+  * @throws ProcessFailedException When the soffice process is not successful.
+  */
+ protected function convertToText()
+ {
+     // A separate "export HOME=..." process cannot change the environment of
+     // the soffice process (and is not an executable to begin with), so HOME
+     // is handed to soffice through the process environment instead.
+     $home = env('USER_HOME_PATH');
+     $environment = $home ? ['HOME' => $home] : null;
+
+     $process = new Process(
+         [
+             'soffice',
+             '--headless',
+             '--convert-to',
+             'txt',
+             $this->storage->path($this->path),
+             '--outdir',
+             $this->storage->path($this->directoryPath),
+         ],
+         null,
+         $environment
+     );
+
+     $process->run();
+
+     if (!$process->isSuccessful()) {
+         throw new ProcessFailedException($process);
+     }
+
+     $this->deleteOriginalDocument();
+ }
+}
diff --git a/app/Ingest/OtherConvertor.php b/app/Ingest/OtherConvertor.php
new file mode 100644
index 0000000..53f6839
--- /dev/null
+++ b/app/Ingest/OtherConvertor.php
@@ -0,0 +1,50 @@
+convertToDocx();
+
+ $convertor = new DocxConvertor($this->storage, "$this->directoryPath/document.docx");
+
+ $convertor->execute();
+ }
+
+ /**
+  * Convert the document at $this->path to docx with headless soffice.
+  *
+  * The output directory handed to soffice is $this->directoryPath. Once the
+  * conversion succeeds the original document is deleted.
+  *
+  * @return void
+  *
+  * @throws ProcessFailedException When the soffice process is not successful.
+  */
+ private function convertToDocx()
+ {
+     // A separate "export HOME=..." process cannot change the environment of
+     // the soffice process (and is not an executable to begin with), so HOME
+     // is handed to soffice through the process environment instead.
+     $home = env('USER_HOME_PATH');
+     $environment = $home ? ['HOME' => $home] : null;
+
+     /**
+      * Convert doc,dot,rtf,odt to docx
+      */
+     $process = new Process(
+         [
+             'soffice',
+             '--headless',
+             '--convert-to',
+             'docx',
+             $this->storage->path($this->path),
+             '--outdir',
+             $this->storage->path($this->directoryPath),
+         ],
+         null,
+         $environment
+     );
+
+     $process->run();
+
+     if (!$process->isSuccessful()) {
+         throw new ProcessFailedException($process);
+     }
+
+     $this->deleteOriginalDocument();
+ }
+}
diff --git a/app/Ingest/PDFConvertor.php b/app/Ingest/PDFConvertor.php
new file mode 100644
index 0000000..2b86d7e
--- /dev/null
+++ b/app/Ingest/PDFConvertor.php
@@ -0,0 +1,271 @@
+prepareForConvertPDF();
+
+ $result = $this->getFileContents();
+
+ if ( ! $result['has_images'] && ! $result['has_text']) {
+ throw new \Exception('Cannot get pdf file contents.');
+ }
+
+ if ($result['has_text']) {
+ $mdContents = '';
+
+ foreach ($result['htmls'] as $html) {
+ $converter = new HtmlConverter();
+ $converter->getConfig()->setOption('strip_tags', true);
+
+ $contents = $converter->convert($html);
+
+ $mdContents = $mdContents . "\n\n" . $contents;
+ }
+
+ $this->storage->put("$this->directoryPath/document.md", $mdContents);
+
+ return;
+ }
+
+ // Only contains images.
+ $imagesContent = '';
+ $files = $this->storage->allFiles($this->path);
+
+ foreach ($files as $file) {
+ // Only get the image files from the directory, it may contain some empty html files too.
+
+ // @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep.
+ if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'])) {
+ $ocr = new OCR($this->storage->path($file));
+
+ $imagesContent = $imagesContent . $ocr->execute();
+
+ $this->storage->delete($file);
+ }
+ }
+
+ $this->storage->put("$this->directoryPath/document.md", $imagesContent);
+ }
+
+ /**
+  * Run `pdftohtml -xml` on the PDF and return the data parsed from its XML output.
+  *
+  * The output target passed to pdftohtml is "$this->directoryPath/html". The
+  * source PDF is deleted after a successful run.
+  *
+  * @return array Whatever getDataFromXML() returns.
+  *
+  * @throws ProcessFailedException When the pdftohtml process is not successful.
+  */
+ protected function getFileContents()
+ {
+     $target = $this->storage->path("$this->directoryPath/html");
+     $source = $this->storage->path($this->path);
+
+     $pdfToHtml = new Process(['pdftohtml', '-xml', $source, $target]);
+     $pdfToHtml->run();
+
+     if ($pdfToHtml->isSuccessful()) {
+         // Remove original document.
+         $this->storage->delete($this->path);
+
+         return $this->getDataFromXML();
+     }
+
+     throw new ProcessFailedException($pdfToHtml);
+ }
+
+ /**
+  * Read "$this->directoryPath/html.xml" and build one HTML string per page.
+  *
+  * First pass: collect every fontspec (family, size, color) keyed by its id and
+  * group all elements that carry a "top" attribute by page number and integer
+  * top value, sorted ascending by top.
+  * Second pass: walk the grouped elements, rendering text through handleText()
+  * and images through handleImage(). Because $imagesInFooter is hard-coded to
+  * true, image HTML is collected and appended after the rest of the page.
+  *
+  * Cleanup: when neither text nor images were found the whole directory is
+  * deleted, otherwise only the XML file is deleted. If an \Exception is raised
+  * while building the HTML, the directory is deleted, the trace is logged and a
+  * generic \Exception is thrown.
+  *
+  * NOTE(review): several HTML string literals in this block look truncated in
+  * this view (markup possibly lost upstream) - confirm against the repository.
+  * NOTE(review): the result of simplexml_load_string() is not checked for false.
+  *
+  * @return array With keys has_images (bool), has_text (bool) and htmls (array of strings).
+  *
+  * @throws \Exception With message "Something went wrong." when building the HTML fails.
+  */
+ protected function getDataFromXML()
+ {
+ $xmlFilePath = "$this->directoryPath/html.xml";
+
+ $contents = $this->storage->get($xmlFilePath);
+
+ $xml = simplexml_load_string($contents);
+
+ // Page number => (top position => list of elements at that position).
+ $orderedList = [];
+ // Font id => family / size / color taken from the fontspec elements.
+ $fonts = [];
+
+ foreach ($xml->page as $page) {
+ $pageNumber = (int) $page['number'][0];
+
+ $orderedList[$pageNumber] = [];
+
+ foreach ($page as $p) {
+ if ($p->getName() === 'fontspec') {
+ $fonts[(int) $p['id']]['family'] = (string) $p['family'];
+ $fonts[(int) $p['id']]['size'] = (string) $p['size'];
+ $fonts[(int) $p['id']]['color'] = (string) $p['color'];
+ }
+
+ // Only elements with a "top" attribute are grouped for rendering.
+ if (isset($p['top'])) {
+ $top = (int) $p['top'];
+
+ if ( ! array_key_exists($top, $orderedList[$pageNumber])) {
+ $orderedList[$pageNumber][$top] = [];
+ }
+
+ $orderedList[$pageNumber][$top][] = $p;
+ }
+ }
+
+ // Order the groups of this page by ascending top value.
+ ksort($orderedList[$pageNumber]);
+ }
+
+ $htmls = [];
+ $hasImages = false;
+ $hasText = false;
+
+ // Running image counter used for the "Fig. N" captions; it is not reset per page.
+ $imagesCount = 0;
+ $imagesInFooter = true;
+
+ try {
+ foreach ($orderedList as $page) {
+ $html = '';
+ $footerImages = [];
+
+ foreach ($page as $items) {
+ $continuousP = '';
+
+ foreach ($items as $p) {
+ if ($p->getName() == 'image') {
+ $hasImages = true;
+
+ $imagesCount += 1;
+ $caption = "Fig. $imagesCount";
+
+ $imageHTML = $this->handleImage($p, $caption);
+
+ if ( ! $imagesInFooter) {
+ $html = $html . $imageHTML;
+ } else {
+ $html = $html . " $caption
";
+
+ $footerImages[] = $imageHTML;
+ }
+ }
+
+ if ($p->getName() == 'text') {
+ $continuousP = $continuousP . $this->handleText($p, $fonts);
+
+ $hasText = true;
+ }
+ }
+
+ $html = $html . '' . $continuousP . '
';
+ }
+
+ if ($imagesInFooter) {
+ foreach ($footerImages as $index => $footerImage) {
+ $html = $html . '' . $footerImage . '
';
+// $html = $html . ' Fig. ' . ($index + 1) . '
';
+ }
+ }
+
+ $htmls[] = '' . $html . '';
+ }
+ } catch (\Exception $exception) {
+ $this->storage->deleteDirectory($this->directoryPath);
+
+ \Illuminate\Support\Facades\Log::info($exception->getTraceAsString());
+
+ throw new \Exception('Something went wrong.');
+ }
+
+ if ( ! $hasText && ! $hasImages) {
+ // Remove directory because we do not have any use for it anymore.
+ $this->storage->deleteDirectory($this->directoryPath);
+ } else {
+ // Remove the unnecessary 'xml' file.
+ $this->storage->delete($xmlFilePath);
+ }
+
+ return [
+ 'has_images' => $hasImages,
+ 'has_text' => $hasText,
+ 'htmls' => $htmls,
+ ];
+ }
+
+ /**
+  * Build the HTML fragment for one image element and return it as a string.
+  *
+  * $src is computed as "./" plus the base name of the element's src attribute.
+  *
+  * NOTE(review): in this view the string literals below contain only line
+  * breaks, and neither $src nor $caption is used after being set/received -
+  * the markup appears to have been lost; confirm against the repository.
+  *
+  * @param mixed  $p       Image element; its src attribute is read.
+  * @param string $caption Caption for the image (e.g. "Fig. 1" as built by the caller).
+  *
+  * @return string
+  */
+ protected function handleImage($p, $caption)
+ {
+ $html = '';
+
+ $src = './' . pathinfo($p['src'], PATHINFO_BASENAME);
+
+ $html = $html . '
';
+ $html = $html . '';
+ $html = $html . '
';
+ $html = $html . '
';
+
+ return $html;
+ }
+
+ /**
+  * Render one text element as an HTML element with inline positioning styles.
+  *
+  * The tag is chosen by getTag() from the font size. Content of an <i> or <b>
+  * child is wrapped in the matching HTML tag; otherwise the element's own text
+  * is used.
+  *
+  * @param \SimpleXMLElement $p     Text element; its font, width, height, top and left attributes are read.
+  * @param array             $fonts Font specs keyed by font id, each with 'family', 'size' and 'color'.
+  *
+  * @return string HTML of the form <tag style="...">content</tag>.
+  */
+ protected function handleText($p, $fonts)
+ {
+     $id = (int) $p['font'];
+     $font_size = $fonts[$id]['size'];
+     $font_color = $fonts[$id]['color'];
+     $font_family = $fonts[$id]['family'];
+
+     $style = 'position: absolute;'
+         . "color: $font_color;"
+         . "font-family: $font_family;"
+         . 'font-weight: 900;'
+         . 'width: ' . $p['width'] . 'px;'
+         . 'height: ' . $p['height'] . 'px;'
+         . 'top: ' . $p['top'] . 'px;'
+         . 'left: ' . $p['left'] . 'px;'
+         . "font-size: $font_size" . 'px;';
+
+     // Keep italic / bold emphasis by wrapping the child text in real tags;
+     // wrapping it in empty strings dropped the emphasis entirely.
+     if ($p->i) {
+         $content = '<i>' . $p->i . '</i>';
+     } else if ($p->b) {
+         $content = '<b>' . $p->b . '</b>';
+     } else {
+         $content = $p;
+     }
+
+     $tag = $this->getTag($font_size);
+
+     // The element must be closed with "</tag>"; emitting "tag>" left it
+     // unclosed and leaked the tag name into the text.
+     return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';
+ }
+
+ /**
+  * Pick the HTML tag for a piece of text from its font size.
+  *
+  * @param mixed $size Font size, compared with ">" against the thresholds below.
+  *
+  * @return string 'h1' above 24, 'h2' above 18, 'h3' above 16, otherwise 'span'.
+  */
+ protected function getTag($size)
+ {
+     // Checked from the largest threshold down, so the first match wins.
+     $headings = ['h1' => 24, 'h2' => 18, 'h3' => 16];
+
+     foreach ($headings as $tag => $threshold) {
+         if ($size > $threshold) {
+             return $tag;
+         }
+     }
+
+     return 'span';
+ }
+
+ /**
+  * Run `pip3 install pdftotext` before a PDF is converted.
+  *
+  * @return void
+  *
+  * @throws ProcessFailedException When the pip3 process is not successful.
+  */
+ protected function prepareForConvertPDF()
+ {
+     // A separate "export HOME=..." process cannot change the environment of
+     // the pip3 process (and is not an executable to begin with), so HOME is
+     // handed to pip3 through the process environment instead.
+     $home = env('USER_HOME_PATH');
+     $environment = $home ? ['HOME' => $home] : null;
+
+     $process = new Process(['pip3', 'install', 'pdftotext'], null, $environment);
+
+     $process->run();
+
+     if (!$process->isSuccessful()) {
+         throw new ProcessFailedException($process);
+     }
+ }
+}
diff --git a/app/Ingest/TextConvertor.php b/app/Ingest/TextConvertor.php
new file mode 100644
index 0000000..5630d6a
--- /dev/null
+++ b/app/Ingest/TextConvertor.php
@@ -0,0 +1,52 @@
+fromFile($this->storage->path($this->path));
+
+ if ( ! $content) {
+ throw new \Exception('Could not read content.');
+ }
+
+ $content = $this->convertToUTF8($content);
+
+ $this->storeContent($content);
+ }
+
+ /**
+  * Pass every leaf value of $content through mb_convert_encoding(..., 'UTF-8').
+  *
+  * No source encoding is given, so mbstring's default is used for it.
+  *
+  * @param array $content Possibly nested array of values to convert.
+  *
+  * @return array The converted copy of $content.
+  */
+ protected function convertToUTF8($content)
+ {
+     $toUtf8 = function (&$value) {
+         $value = mb_convert_encoding($value, 'UTF-8');
+     };
+
+     array_walk_recursive($content, $toUtf8);
+
+     return $content;
+ }
+
+ /**
+  * Persist the content as markdown, then delete the original document.
+  *
+  * @param mixed $content Content handed on to storeMD().
+  *
+  * @return void
+  */
+ protected function storeContent($content)
+ {
+     // Write the markdown first so the source is only removed afterwards.
+     $this->storeMD($content);
+     $this->deleteOriginalDocument();
+ }
+
+ /**
+  * Convert the content with MDConvertor and write it to "document.md" in the
+  * document directory.
+  *
+  * @param mixed $content Content passed to the MDConvertor constructor.
+  *
+  * @return void
+  */
+ protected function storeMD($content)
+ {
+     $markdown = (new MDConvertor($content))->execute();
+
+     $this->storage->put("$this->directoryPath/document.md", $markdown);
+ }
+}
diff --git a/app/Jobs/IngestDocuments.php b/app/Jobs/IngestDocuments.php
index 7720ccb..1c08f87 100644
--- a/app/Jobs/IngestDocuments.php
+++ b/app/Jobs/IngestDocuments.php
@@ -3,12 +3,10 @@
namespace App\Jobs;
use App\Ingest\Convertor;
-use App\Ingest\MDConvertor;
use App\Parser\ParseXml;
use App\Parser\DocxParser\ParseDocx;
use App\Parser\HtmlParser\ParseHtml;
use App\Parser\ParseHtmlArray;
-use App\Parser\ParseTextArray;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
@@ -20,6 +18,7 @@ class IngestDocuments implements ShouldQueue
{
use Dispatchable, InteractsWithQueue, Queueable;
+ protected $id;
private $path;
protected $type;
@@ -51,11 +50,13 @@ class IngestDocuments implements ShouldQueue
/**
* Create a new job instance.
*
+ * @param $id
* @param string $path
* @param $type
*/
- public function __construct(string $path, $type)
+ public function __construct($id, string $path, $type)
{
+ $this->id = $id;
$this->path = $path;
$this->type = $type;
@@ -76,7 +77,7 @@ class IngestDocuments implements ShouldQueue
$convertor = new Convertor($this->path, $this->type);
try {
- $this->path = $convertor->execute();
+ $convertor->execute();
} catch (\Exception $exception) {
\Illuminate\Support\Facades\Log::info($exception->getMessage());
@@ -85,30 +86,7 @@ class IngestDocuments implements ShouldQueue
return;
}
- // @TODO Replace later, the convertor will create the .md file.
- if ($this->type !== 'pdf') {
- $content = $this->getContent();
-
- if ( ! $content) {
- $this->failed();
-
- return;
- }
-
- $content = $this->convertToUTF8($content);
-
- try {
- $filePath = $this->storeContent($content);
- } catch (\Exception $e) {
- Log::error('Error writing in to the file: ' . $e->getMessage());
-
-// report($e);
- }
- } else {
- $filePath = $this->path;
- }
-
- SendToCore::dispatch($filePath);
+ SendToCore::dispatch($this->id, pathinfo($this->path, PATHINFO_DIRNAME));
}
public function failed()
@@ -124,73 +102,6 @@ class IngestDocuments implements ShouldQueue
// $this->storage->delete($this->path);
// }
- SendToCore::dispatch($this->path, true);
- }
-
- protected function getContent()
- {
- if ($this->type === 'pdf') {
- // Wait while it finishes.
- while (!$this->storage->exists($this->path)) {
- sleep(1);
- }
-
- $textParser = new ParseTextArray(true);
-
- return $textParser->fromFile($this->storage->path($this->path));
- }
-
- $textParser = new ParseTextArray();
-
- return $textParser->fromFile($this->storage->path($this->path));
- }
-
- protected function convertToUTF8($content)
- {
- array_walk_recursive(
- $content,
- function (&$entry) {
- $entry = mb_convert_encoding(
- $entry,
- 'UTF-8'
- );
- }
- );
-
- return $content;
- }
-
- protected function storeContent($content)
- {
- $result = explode('.', $this->path);
- $name = $result[0];
-
- // Or json?
- $filePath = $this->storeMD($name, $content);
-
- // Delete converted file. We now have the .md file.
- $this->storage->delete($this->path);
-
- return $filePath;
- }
-
- protected function storeMD($name, $content)
- {
- $fileName = "$name.md";
-
- $convertor = new MDConvertor($content);
-
- $this->storage->put($fileName, $convertor->execute());
-
- return $fileName;
- }
-
- protected function storeJson($name, $content)
- {
- $fileName = "$name.json";
-
- $this->storage->put($fileName, $content);
-
- return $fileName;
+ SendToCore::dispatch($this->id, pathinfo($this->path, PATHINFO_DIRNAME), true);
}
}
diff --git a/app/Jobs/SendToCore.php b/app/Jobs/SendToCore.php
index d1acd3e..b6d7779 100644
--- a/app/Jobs/SendToCore.php
+++ b/app/Jobs/SendToCore.php
@@ -18,7 +18,7 @@ class SendToCore implements ShouldQueue
private $secret;
- private $filePath;
+ private $directoryPath;
private $id;
@@ -32,19 +32,18 @@ class SendToCore implements ShouldQueue
/**
* Create a new job instance.
*
- * @param null $filePath
+ * @param $id
+ * @param null $directoryPath
* @param bool $hasFailed
*/
- public function __construct($filePath = null, $hasFailed = false)
+ public function __construct($id, $directoryPath = null, $hasFailed = false)
{
$this->url = env('WEBHOOK_CORE_URL') . '/webhooks';
$this->secret = env('WEBHOOK_CORE_SECRET');
- $this->filePath = $filePath;
- $this->hasFailed = $hasFailed;
- $string = str_replace('contracts/', '', $this->filePath);
- $result = explode('.', $string);
- $this->id = $result[0];
+ $this->id = $id;
+ $this->directoryPath = $directoryPath;
+ $this->hasFailed = $hasFailed;
}
/**
@@ -55,70 +54,99 @@ class SendToCore implements ShouldQueue
*/
public function handle()
{
- $content = '';
+ $content = [];
- // File exists, send content.
- if ($this->filePath && ! $this->hasFailed) {
+ // Directory exists, send content.
+ if ($this->directoryPath && ! $this->hasFailed) {
$this->storage = Storage::disk('local');
// @TODO Check if the file exists multiple times?
- if ( ! $this->storage->exists($this->filePath)) {
+ if ( ! $this->storage->exists($this->directoryPath)) {
throw new \Exception('File does not exist yet.');
}
- $content = $this->storage->get($this->filePath);
+ $content = $this->getContent();
}
$sent = $this->sendTheData($content);
-// if ($this->filePath && $sent) {
- if ($this->filePath) {
+// if ($this->directoryPath && $sent) {
+ if ($this->directoryPath) {
if ( ! $this->storage) {
$this->storage = Storage::disk('local');
}
- $this->storage->delete($this->filePath);
+ $this->storage->deleteDirectory($this->directoryPath);
}
}
+ /**
+  * Remove the document directory, if one was given, resolving the local
+  * storage disk first when it has not been set yet.
+  *
+  * @return void
+  */
public function failed()
{
- if ($this->filePath) {
+ if ($this->directoryPath) {
if ( ! $this->storage) {
$this->storage = Storage::disk('local');
}
- $this->storage->delete($this->filePath);
+ // The path is a directory, so it must be removed with deleteDirectory(), as handle() does; delete() is for files.
+ $this->storage->deleteDirectory($this->directoryPath);
}
}
/**
- * Send the data to the core trough webhooks
+ * Send the data to the core through webhooks
*
* @param $content
* @return bool
*/
- protected function sendTheData($content)
+ protected function sendTheData(array $content)
{
try {
WebhookCall::create()
->url($this->url)
->payload(['data' => [
'id' => $this->id,
- 'content' => $this->encodeContent($content),
- 'status' => $content ? 'success' : 'fail',
+ 'content' => $content,
+ 'status' => count($content) > 0 ? 'success' : 'fail',
]])
->useSecret($this->secret)
->dispatch();
return true;
} catch (\Exception $exception) {
- Log::error('SendToCore@sendTheData' . $exception->getMessage());
+ Log::error('SendToCore@sendTheData: ' . $exception->getMessage());
return false;
}
}
+ /**
+  * Collect the webhook content: the encoded "document.md" plus every jpg/png
+  * file found under the document directory as a base64 data URI.
+  *
+  * @return array With keys 'document' (result of encodeContent()) and 'images'
+  *               (list of arrays with 'name', 'type' and 'contents').
+  */
+ protected function getContent()
+ {
+     $document = $this->storage->get("$this->directoryPath/document.md");
+     $document = $this->encodeContent($document);
+
+     // File extension => MIME subtype. "image/jpg" is not a registered media
+     // type, so .jpg files are announced as "image/jpeg" in the data URI.
+     $mimeSubtypes = ['jpg' => 'jpeg', 'png' => 'png'];
+
+     $images = [];
+
+     foreach ($this->storage->allFiles($this->directoryPath) as $file) {
+         // @TODO We are using this check in the 'PDFConvertor' file, refactor and improve.
+         $type = pathinfo($file, PATHINFO_EXTENSION);
+
+         if (!isset($mimeSubtypes[$type])) {
+             continue;
+         }
+
+         $images[] = [
+             'name' => pathinfo($file, PATHINFO_FILENAME),
+             // The reported type stays the plain file extension.
+             'type' => $type,
+             'contents' => 'data:image/' . $mimeSubtypes[$type] . ';base64,' . base64_encode($this->storage->get($file)),
+         ];
+     }
+
+     return [
+         'document' => $document,
+         'images' => $images,
+     ];
+ }
+
protected function encodeContent($content)
{
$encoding = mb_detect_encoding($content, 'UTF-8, ISO-8859-1, WINDOWS-1252, WINDOWS-1251', true);