|
|
@ -5,6 +5,10 @@ namespace App\Ingest; |
|
|
|
use Symfony\Component\Process\Exception\ProcessFailedException; |
|
|
|
use Symfony\Component\Process\Process; |
|
|
|
|
|
|
|
use Illuminate\Support\Facades\Log; |
|
|
|
|
|
|
|
use thiagoalessio\TesseractOCR\TesseractOcrException; |
|
|
|
|
|
|
|
class PDFConvertor extends AbstractConvertor |
|
|
|
{ |
|
|
|
public function execute() |
|
|
@ -17,7 +21,7 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
throw new \Exception('Cannot get pdf file contents.'); |
|
|
|
} |
|
|
|
|
|
|
|
$this->storage->put("$this->directoryPath/document.json", json_encode($contents)); |
|
|
|
$this->storage->put("$this->directoryPath/document.html", $contents); |
|
|
|
} |
|
|
|
|
|
|
|
protected function getFileContents() |
|
|
@ -85,10 +89,7 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
|
|
|
|
$imagesCount = 0; |
|
|
|
|
|
|
|
$mdContents = ''; |
|
|
|
$htmlContents = ''; |
|
|
|
$json = []; |
|
|
|
$i = 0; |
|
|
|
|
|
|
|
try { |
|
|
|
foreach ($orderedList as $page) { |
|
|
@ -98,26 +99,28 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
foreach ($page as $items) { |
|
|
|
$continuousP = ''; |
|
|
|
$firstOfText = true; |
|
|
|
|
|
|
|
|
|
|
|
foreach ($items as $key => $p) { |
|
|
|
if ($p->getName() == 'image') { |
|
|
|
$imageInFooter = false; |
|
|
|
$basePath = $this->storage->path(''); |
|
|
|
$imageFilePath = str_replace($basePath, '', $p['src']); |
|
|
|
|
|
|
|
// $textContents = $this->applyOCR($imageFilePath); @@ uncomment
|
|
|
|
$textContents = null; // remove this
|
|
|
|
try { |
|
|
|
$textContents = $this->applyOCR($imageFilePath); |
|
|
|
} catch (TesseractOcrException $e) { |
|
|
|
# Could not get text content from image. This means the image doesn't have text.
|
|
|
|
$textContents = ''; |
|
|
|
} |
|
|
|
|
|
|
|
if (false) { |
|
|
|
if ($textContents) { |
|
|
|
$imageInFooter = true; |
|
|
|
if ($html) { |
|
|
|
$mdContents = $mdContents . $this->convertHtmlToMD($html) . "\n\n"; |
|
|
|
$htmlContents = $htmlContents . $html; |
|
|
|
|
|
|
|
$html = ''; |
|
|
|
} |
|
|
|
|
|
|
|
$mdContents = $mdContents . $textContents . "\n\n"; |
|
|
|
$htmlContents = $htmlContents . "<div>$textContents</div>"; |
|
|
|
|
|
|
|
$this->storage->delete($imageFilePath); |
|
|
@ -129,17 +132,14 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
$imagesCount += 1; |
|
|
|
$caption = "Fig. $imagesCount"; |
|
|
|
|
|
|
|
$imageJSON = $this->handleImage($p, $caption); |
|
|
|
$imageHTML = $this->handleImage($p, $caption); |
|
|
|
|
|
|
|
if (!$imageInFooter) { |
|
|
|
$json[$i]['tag'] = 'img'; |
|
|
|
$json[$i]['src'] = $imageJSON['src']; |
|
|
|
$json[$i]['style'] = $imageJSON['style']; |
|
|
|
$json[$i]['details'] = $caption; |
|
|
|
$html = $html . $imageHTML; |
|
|
|
} else { |
|
|
|
$html = $html . "<p> $caption </p>"; |
|
|
|
|
|
|
|
$footerImages[] = $imageJSON; |
|
|
|
$footerImages[] = $imageHTML; |
|
|
|
} |
|
|
|
} |
|
|
|
} |
|
|
@ -148,33 +148,37 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
if($p == '·') { |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
|
|
|
|
$addition = null; |
|
|
|
if(isset($items[$key-1]) && $items[$key-1] == '·') { |
|
|
|
$addition = '· '; |
|
|
|
} |
|
|
|
|
|
|
|
$continuousP = $this->handleText($p, $fonts, $addition, $firstOfText, true); |
|
|
|
if($firstOfText) { |
|
|
|
$json[$i]['tag'] = $continuousP['tag']; |
|
|
|
$json[$i]['style'] = $continuousP['style']; |
|
|
|
} |
|
|
|
|
|
|
|
(isset($json[$i]['content'])) ? $json[$i]['content'] = $json[$i]['content'] . $continuousP['content'] : $json[$i]['content'] = $continuousP['content']; |
|
|
|
$continuousP = $continuousP . $this->handleText($p, $fonts, $addition, $firstOfText); |
|
|
|
|
|
|
|
$firstOfText = false; |
|
|
|
$hasText = true; |
|
|
|
} |
|
|
|
} |
|
|
|
$i++; |
|
|
|
|
|
|
|
$html = $html . '<p>' . $continuousP . '</p>'; |
|
|
|
} |
|
|
|
|
|
|
|
if (!empty($footerImages)) { |
|
|
|
foreach ($footerImages as $footerImage) { |
|
|
|
$html = $html . '<p>' . $footerImage . '</p>'; |
|
|
|
} |
|
|
|
} |
|
|
|
|
|
|
|
$htmlContents = $htmlContents . "<html><head></head><body>$html</body></html>"; |
|
|
|
} |
|
|
|
} catch (\Exception $exception) { |
|
|
|
$this->storage->deleteDirectory($this->directoryPath); |
|
|
|
|
|
|
|
\Illuminate\Support\Facades\Log::info($exception->getTraceAsString()); |
|
|
|
Log::info($exception->getTraceAsString()); |
|
|
|
|
|
|
|
throw new \Exception('Something went wrong.'); |
|
|
|
// throw new \Exception('Something went wrong.');
|
|
|
|
throw $exception; |
|
|
|
} |
|
|
|
|
|
|
|
if ( ! $hasText && ! $hasImages) { |
|
|
@ -185,15 +189,22 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
$this->storage->delete($xmlFilePath); |
|
|
|
} |
|
|
|
|
|
|
|
return $json; |
|
|
|
// return $mdContents;
|
|
|
|
return $htmlContents; |
|
|
|
} |
|
|
|
|
|
|
|
/**
 * Render an extracted image as an HTML snippet.
 *
 * The snippet is a single <img> tag preceded by one <br> and followed by
 * two <br> tags. The image URL is built from the './contracts-images/'
 * prefix, the base name of $this->directoryPath and the base name of the
 * image's own 'src' value.
 *
 * @param mixed  $p       Image node; read through array access for the
 *                        'src', 'width' and 'height' entries.
 *                        NOTE(review): callers also invoke $p->getName(),
 *                        so this is presumably an XML element - confirm.
 * @param string $caption Text used for both the alt and title attributes.
 *
 * @return string HTML markup for the image.
 */
protected function handleImage($p, $caption)
{
    $src = './contracts-images/'
        . pathinfo($this->directoryPath, PATHINFO_BASENAME)
        . '/'
        . pathinfo($p['src'], PATHINFO_BASENAME);

    // These values land inside double-quoted attributes, so escape them to
    // keep a stray quote or ampersand from breaking the generated markup.
    $src = htmlspecialchars($src, ENT_QUOTES, 'UTF-8');
    $caption = htmlspecialchars((string) $caption, ENT_QUOTES, 'UTF-8');

    return '<br>'
        . '<img width=' . $p['width'] . ' height=' . $p['height']
        . ' src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">'
        . '<br>'
        . '<br>';
}
|
|
|
|
|
|
|
protected function handleText($p, $fonts, $addition = null, $firstOfText = false) |
|
|
@ -228,11 +239,7 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
|
|
|
|
$tag = $this->getTag($font_size); |
|
|
|
|
|
|
|
return [ |
|
|
|
'tag' => $tag, |
|
|
|
'content' => (string) $content, |
|
|
|
'style' => $style |
|
|
|
]; |
|
|
|
return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>'; |
|
|
|
} |
|
|
|
|
|
|
|
protected function getTag($size) |
|
|
|