Browse Source

Fix ingest for PDF documents

master
Radu Liviu Carjan 2 years ago
parent
commit
0fcac76204
  1. 83
      app/Ingest/PDFConvertor.php

83
app/Ingest/PDFConvertor.php

@ -5,6 +5,10 @@ namespace App\Ingest;
use Symfony\Component\Process\Exception\ProcessFailedException; use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process; use Symfony\Component\Process\Process;
use Illuminate\Support\Facades\Log;
use thiagoalessio\TesseractOCR\TesseractOcrException;
class PDFConvertor extends AbstractConvertor class PDFConvertor extends AbstractConvertor
{ {
public function execute() public function execute()
@ -17,7 +21,7 @@ class PDFConvertor extends AbstractConvertor
throw new \Exception('Cannot get pdf file contents.'); throw new \Exception('Cannot get pdf file contents.');
} }
$this->storage->put("$this->directoryPath/document.json", json_encode($contents));
$this->storage->put("$this->directoryPath/document.html", $contents);
} }
protected function getFileContents() protected function getFileContents()
@ -85,10 +89,7 @@ class PDFConvertor extends AbstractConvertor
$imagesCount = 0; $imagesCount = 0;
$mdContents = '';
$htmlContents = ''; $htmlContents = '';
$json = [];
$i = 0;
try { try {
foreach ($orderedList as $page) { foreach ($orderedList as $page) {
@ -98,26 +99,28 @@ class PDFConvertor extends AbstractConvertor
foreach ($page as $items) { foreach ($page as $items) {
$continuousP = ''; $continuousP = '';
$firstOfText = true; $firstOfText = true;
foreach ($items as $key => $p) { foreach ($items as $key => $p) {
if ($p->getName() == 'image') { if ($p->getName() == 'image') {
$imageInFooter = false; $imageInFooter = false;
$basePath = $this->storage->path(''); $basePath = $this->storage->path('');
$imageFilePath = str_replace($basePath, '', $p['src']); $imageFilePath = str_replace($basePath, '', $p['src']);
// $textContents = $this->applyOCR($imageFilePath); @@ uncomment
$textContents = null; // remove this
try {
$textContents = $this->applyOCR($imageFilePath);
} catch (TesseractOcrException $e) {
# Could not get text content from image. This means the image doesn't have text.
$textContents = '';
}
if (false) {
if ($textContents) {
$imageInFooter = true; $imageInFooter = true;
if ($html) { if ($html) {
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
$htmlContents = $htmlContents . $html; $htmlContents = $htmlContents . $html;
$html = ''; $html = '';
} }
$mdContents = $mdContents . $textContents . "\n\n";
$htmlContents = $htmlContents . "<div>$textContents</div>"; $htmlContents = $htmlContents . "<div>$textContents</div>";
$this->storage->delete($imageFilePath); $this->storage->delete($imageFilePath);
@ -129,17 +132,14 @@ class PDFConvertor extends AbstractConvertor
$imagesCount += 1; $imagesCount += 1;
$caption = "Fig. $imagesCount"; $caption = "Fig. $imagesCount";
$imageJSON = $this->handleImage($p, $caption);
$imageHTML = $this->handleImage($p, $caption);
if (!$imageInFooter) { if (!$imageInFooter) {
$json[$i]['tag'] = 'img';
$json[$i]['src'] = $imageJSON['src'];
$json[$i]['style'] = $imageJSON['style'];
$json[$i]['details'] = $caption;
$html = $html . $imageHTML;
} else { } else {
$html = $html . "<p> $caption </p>"; $html = $html . "<p> $caption </p>";
$footerImages[] = $imageJSON;
$footerImages[] = $imageHTML;
} }
} }
} }
@ -148,33 +148,37 @@ class PDFConvertor extends AbstractConvertor
if($p == '·') { if($p == '·') {
continue; continue;
} }
$addition = null; $addition = null;
if(isset($items[$key-1]) && $items[$key-1] == '·') { if(isset($items[$key-1]) && $items[$key-1] == '·') {
$addition = '· '; $addition = '· ';
} }
$continuousP = $this->handleText($p, $fonts, $addition, $firstOfText, true);
if($firstOfText) {
$json[$i]['tag'] = $continuousP['tag'];
$json[$i]['style'] = $continuousP['style'];
}
(isset($json[$i]['content'])) ? $json[$i]['content'] = $json[$i]['content'] . $continuousP['content'] : $json[$i]['content'] = $continuousP['content'];
$continuousP = $continuousP . $this->handleText($p, $fonts, $addition, $firstOfText);
$firstOfText = false; $firstOfText = false;
$hasText = true; $hasText = true;
} }
} }
$i++;
$html = $html . '<p>' . $continuousP . '</p>';
}
if (!empty($footerImages)) {
foreach ($footerImages as $footerImage) {
$html = $html . '<p>' . $footerImage . '</p>';
}
} }
$htmlContents = $htmlContents . "<html><head></head><body>$html</body></html>";
} }
} catch (\Exception $exception) { } catch (\Exception $exception) {
$this->storage->deleteDirectory($this->directoryPath); $this->storage->deleteDirectory($this->directoryPath);
\Illuminate\Support\Facades\Log::info($exception->getTraceAsString());
Log::info($exception->getTraceAsString());
throw new \Exception('Something went wrong.');
// throw new \Exception('Something went wrong.');
throw $exception;
} }
if ( ! $hasText && ! $hasImages) { if ( ! $hasText && ! $hasImages) {
@ -185,15 +189,22 @@ class PDFConvertor extends AbstractConvertor
$this->storage->delete($xmlFilePath); $this->storage->delete($xmlFilePath);
} }
return $json;
// return $mdContents;
return $htmlContents;
} }
protected function handleImage($p)
protected function handleImage($p, $caption)
{ {
return [
'src' => pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME),
'style' => 'width="' . $p['width'] . 'px" ' . 'height="' . $p['height'] . 'px"'
];
$html = '';
$src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME);
$html = $html . '<br>';
$html = $html . '<img width=' . $p['width'] . ' ' . 'height=' . $p['height'] . ' src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">';
$html = $html . '<br>';
$html = $html . '<br>';
return $html;
} }
protected function handleText($p, $fonts, $addition = null, $firstOfText = false) protected function handleText($p, $fonts, $addition = null, $firstOfText = false)
@ -228,11 +239,7 @@ class PDFConvertor extends AbstractConvertor
$tag = $this->getTag($font_size); $tag = $this->getTag($font_size);
return [
'tag' => $tag,
'content' => (string) $content,
'style' => $style
];
return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';
} }
protected function getTag($size) protected function getTag($size)

Loading…
Cancel
Save