Browse Source

Fix ingest for PDF documents

master
Radu Liviu Carjan 2 years ago
parent
commit
0fcac76204
  1. 83
      app/Ingest/PDFConvertor.php

83
app/Ingest/PDFConvertor.php

@ -5,6 +5,10 @@ namespace App\Ingest;
use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process;
use Illuminate\Support\Facades\Log;
use thiagoalessio\TesseractOCR\TesseractOcrException;
class PDFConvertor extends AbstractConvertor
{
public function execute()
@ -17,7 +21,7 @@ class PDFConvertor extends AbstractConvertor
throw new \Exception('Cannot get pdf file contents.');
}
$this->storage->put("$this->directoryPath/document.json", json_encode($contents));
$this->storage->put("$this->directoryPath/document.html", $contents);
}
protected function getFileContents()
@ -85,10 +89,7 @@ class PDFConvertor extends AbstractConvertor
$imagesCount = 0;
$mdContents = '';
$htmlContents = '';
$json = [];
$i = 0;
try {
foreach ($orderedList as $page) {
@ -98,26 +99,28 @@ class PDFConvertor extends AbstractConvertor
foreach ($page as $items) {
$continuousP = '';
$firstOfText = true;
foreach ($items as $key => $p) {
if ($p->getName() == 'image') {
$imageInFooter = false;
$basePath = $this->storage->path('');
$imageFilePath = str_replace($basePath, '', $p['src']);
// $textContents = $this->applyOCR($imageFilePath); @@ uncomment
$textContents = null; // remove this
try {
$textContents = $this->applyOCR($imageFilePath);
} catch (TesseractOcrException $e) {
# Could not get text content from image. This means the image doesn't have text.
$textContents = '';
}
if (false) {
if ($textContents) {
$imageInFooter = true;
if ($html) {
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n";
$htmlContents = $htmlContents . $html;
$html = '';
}
$mdContents = $mdContents . $textContents . "\n\n";
$htmlContents = $htmlContents . "<div>$textContents</div>";
$this->storage->delete($imageFilePath);
@ -129,17 +132,14 @@ class PDFConvertor extends AbstractConvertor
$imagesCount += 1;
$caption = "Fig. $imagesCount";
$imageJSON = $this->handleImage($p, $caption);
$imageHTML = $this->handleImage($p, $caption);
if (!$imageInFooter) {
$json[$i]['tag'] = 'img';
$json[$i]['src'] = $imageJSON['src'];
$json[$i]['style'] = $imageJSON['style'];
$json[$i]['details'] = $caption;
$html = $html . $imageHTML;
} else {
$html = $html . "<p> $caption </p>";
$footerImages[] = $imageJSON;
$footerImages[] = $imageHTML;
}
}
}
@ -148,33 +148,37 @@ class PDFConvertor extends AbstractConvertor
if($p == '·') {
continue;
}
$addition = null;
if(isset($items[$key-1]) && $items[$key-1] == '·') {
$addition = '· ';
}
$continuousP = $this->handleText($p, $fonts, $addition, $firstOfText, true);
if($firstOfText) {
$json[$i]['tag'] = $continuousP['tag'];
$json[$i]['style'] = $continuousP['style'];
}
(isset($json[$i]['content'])) ? $json[$i]['content'] = $json[$i]['content'] . $continuousP['content'] : $json[$i]['content'] = $continuousP['content'];
$continuousP = $continuousP . $this->handleText($p, $fonts, $addition, $firstOfText);
$firstOfText = false;
$hasText = true;
}
}
$i++;
$html = $html . '<p>' . $continuousP . '</p>';
}
if (!empty($footerImages)) {
foreach ($footerImages as $footerImage) {
$html = $html . '<p>' . $footerImage . '</p>';
}
}
$htmlContents = $htmlContents . "<html><head></head><body>$html</body></html>";
}
} catch (\Exception $exception) {
$this->storage->deleteDirectory($this->directoryPath);
\Illuminate\Support\Facades\Log::info($exception->getTraceAsString());
Log::info($exception->getTraceAsString());
throw new \Exception('Something went wrong.');
// throw new \Exception('Something went wrong.');
throw $exception;
}
if ( ! $hasText && ! $hasImages) {
@ -185,15 +189,22 @@ class PDFConvertor extends AbstractConvertor
$this->storage->delete($xmlFilePath);
}
return $json;
// return $mdContents;
return $htmlContents;
}
protected function handleImage($p)
protected function handleImage($p, $caption)
{
return [
'src' => pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME),
'style' => 'width="' . $p['width'] . 'px" ' . 'height="' . $p['height'] . 'px"'
];
$html = '';
$src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME);
$html = $html . '<br>';
$html = $html . '<img width=' . $p['width'] . ' ' . 'height=' . $p['height'] . ' src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">';
$html = $html . '<br>';
$html = $html . '<br>';
return $html;
}
protected function handleText($p, $fonts, $addition = null, $firstOfText = false)
@ -228,11 +239,7 @@ class PDFConvertor extends AbstractConvertor
$tag = $this->getTag($font_size);
return [
'tag' => $tag,
'content' => (string) $content,
'style' => $style
];
return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';
}
protected function getTag($size)

Loading…
Cancel
Save