From 0fcac762040a2a77da9b9b9ab60a4c3c8bd76ca8 Mon Sep 17 00:00:00 2001 From: Radu Liviu Carjan Date: Thu, 20 Oct 2022 18:15:33 +0300 Subject: [PATCH] Fix ingest for PDF documents --- app/Ingest/PDFConvertor.php | 83 ++++++++++++++++++++----------------- 1 file changed, 45 insertions(+), 38 deletions(-) diff --git a/app/Ingest/PDFConvertor.php b/app/Ingest/PDFConvertor.php index 29a266f..ed19530 100644 --- a/app/Ingest/PDFConvertor.php +++ b/app/Ingest/PDFConvertor.php @@ -5,6 +5,10 @@ namespace App\Ingest; use Symfony\Component\Process\Exception\ProcessFailedException; use Symfony\Component\Process\Process; +use Illuminate\Support\Facades\Log; + +use thiagoalessio\TesseractOCR\TesseractOcrException; + class PDFConvertor extends AbstractConvertor { public function execute() @@ -17,7 +21,7 @@ class PDFConvertor extends AbstractConvertor throw new \Exception('Cannot get pdf file contents.'); } - $this->storage->put("$this->directoryPath/document.json", json_encode($contents)); + $this->storage->put("$this->directoryPath/document.html", $contents); } protected function getFileContents() @@ -85,10 +89,7 @@ class PDFConvertor extends AbstractConvertor $imagesCount = 0; - $mdContents = ''; $htmlContents = ''; - $json = []; - $i = 0; try { foreach ($orderedList as $page) { @@ -98,26 +99,28 @@ class PDFConvertor extends AbstractConvertor foreach ($page as $items) { $continuousP = ''; $firstOfText = true; - + foreach ($items as $key => $p) { if ($p->getName() == 'image') { $imageInFooter = false; $basePath = $this->storage->path(''); $imageFilePath = str_replace($basePath, '', $p['src']); - // $textContents = $this->applyOCR($imageFilePath); @@ uncomment - $textContents = null; // remove this + try { + $textContents = $this->applyOCR($imageFilePath); + } catch (TesseractOcrException $e) { + # Could not get text content from image. This means the image doesn't have text. + $textContents = ''; + } - if (false) { + if ($textContents) { $imageInFooter = true; if ($html) { - $mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n"; $htmlContents = $htmlContents . $html; $html = ''; } - $mdContents = $mdContents . $textContents . "\n\n"; $htmlContents = $htmlContents . "
$textContents
"; $this->storage->delete($imageFilePath); @@ -129,17 +132,14 @@ class PDFConvertor extends AbstractConvertor $imagesCount += 1; $caption = "Fig. $imagesCount"; - $imageJSON = $this->handleImage($p, $caption); + $imageHTML = $this->handleImage($p, $caption); if (!$imageInFooter) { - $json[$i]['tag'] = 'img'; - $json[$i]['src'] = $imageJSON['src']; - $json[$i]['style'] = $imageJSON['style']; - $json[$i]['details'] = $caption; + $html = $html . $imageHTML; } else { $html = $html . "

$caption

"; - $footerImages[] = $imageJSON; + $footerImages[] = $imageHTML; } } } @@ -148,33 +148,37 @@ class PDFConvertor extends AbstractConvertor if($p == '·') { continue; } - + $addition = null; if(isset($items[$key-1]) && $items[$key-1] == '·') { $addition = '· '; } - - $continuousP = $this->handleText($p, $fonts, $addition, $firstOfText, true); - if($firstOfText) { - $json[$i]['tag'] = $continuousP['tag']; - $json[$i]['style'] = $continuousP['style']; - } - (isset($json[$i]['content'])) ? $json[$i]['content'] = $json[$i]['content'] . $continuousP['content'] : $json[$i]['content'] = $continuousP['content']; + $continuousP = $continuousP . $this->handleText($p, $fonts, $addition, $firstOfText); $firstOfText = false; $hasText = true; } } - $i++; + + $html = $html . '

' . $continuousP . '

'; + } + + if (!empty($footerImages)) { + foreach ($footerImages as $footerImage) { + $html = $html . '

' . $footerImage . '

'; + } } + + $htmlContents = $htmlContents . "$html"; } } catch (\Exception $exception) { $this->storage->deleteDirectory($this->directoryPath); - \Illuminate\Support\Facades\Log::info($exception->getTraceAsString()); + Log::info($exception->getTraceAsString()); - throw new \Exception('Something went wrong.'); + // throw new \Exception('Something went wrong.'); + throw $exception; } if ( ! $hasText && ! $hasImages) { @@ -185,15 +189,22 @@ class PDFConvertor extends AbstractConvertor $this->storage->delete($xmlFilePath); } - return $json; + // return $mdContents; + return $htmlContents; } - protected function handleImage($p) + protected function handleImage($p, $caption) { - return [ - 'src' => pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME), - 'style' => 'width="' . $p['width'] . 'px" ' . 'height="' . $p['height'] . 'px"' - ]; + $html = ''; + + $src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME); + + $html = $html . '
'; + $html = $html . '' . $caption . ''; + $html = $html . '
'; + $html = $html . '
'; + + return $html; } protected function handleText($p, $fonts, $addition = null, $firstOfText = false) @@ -228,11 +239,7 @@ class PDFConvertor extends AbstractConvertor $tag = $this->getTag($font_size); - return [ - 'tag' => $tag, - 'content' => (string) $content, - 'style' => $style - ]; + return '<' . $tag . ' style="' . $style . '">' . $content . ''; } protected function getTag($size)