You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
314 lines
9.3 KiB
314 lines
9.3 KiB
<?php
|
|
|
|
namespace App\Ingest;
|
|
|
|
use Symfony\Component\Process\Exception\ProcessFailedException;
|
|
use Symfony\Component\Process\Process;
|
|
|
|
use Illuminate\Support\Facades\Log;
|
|
|
|
use thiagoalessio\TesseractOCR\TesseractOcrException;
|
|
|
|
/**
 * Converts a stored PDF into an HTML document.
 *
 * The PDF is run through `pdftohtml -xml`; the resulting XML is re-ordered by
 * vertical position and rendered to HTML. Images are passed through OCR first:
 * images that yield text are replaced by that text, the others are embedded as
 * numbered figures.
 *
 * Relies on `$this->storage`, `$this->path` and `$this->directoryPath`, which
 * are not defined in this class (presumably provided by AbstractConvertor).
 */
class PDFConvertor extends AbstractConvertor
{
    /**
     * Convert the PDF and store the result as "document.html" inside
     * `$this->directoryPath`.
     *
     * @return void
     *
     * @throws ProcessFailedException When `pdftohtml` exits with a failure.
     * @throws \Exception             When the conversion yields no content.
     */
    public function execute()
    {
        $contents = $this->getFileContents();

        if (! $contents) {
            throw new \Exception('Cannot get pdf file contents.');
        }

        $this->storage->put("{$this->directoryPath}/document.html", $contents);
    }

    /**
     * Run `pdftohtml -xml` on the source PDF, delete the source PDF and return
     * the HTML rendered from the generated XML.
     *
     * @return string HTML produced by getDataFromXML().
     *
     * @throws ProcessFailedException When `pdftohtml` exits with a failure.
     */
    protected function getFileContents()
    {
        // pdftohtml is given "<directory>/html" as output name; the XML is
        // read back from "<directory>/html.xml" in getDataFromXML().
        $outputPath = $this->storage->path("{$this->directoryPath}/html");

        $process = new Process([
            'pdftohtml',
            '-xml',
            $this->storage->path($this->path),
            $outputPath,
        ]);

        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        // Remove original document.
        $this->storage->delete($this->path);

        return $this->getDataFromXML();
    }

    /**
     * Render "<directory>/html.xml" to HTML.
     *
     * Every page is wrapped in its own `<html><head></head><body>…</body></html>`
     * envelope; all elements sharing the same `top` coordinate end up in one
     * `<p>`. On any exception during rendering the working directory is deleted
     * and the exception is re-thrown.
     *
     * @return string Concatenated HTML of all pages.
     *
     * @throws \Exception When the XML cannot be parsed, or when rendering fails.
     */
    protected function getDataFromXML()
    {
        $xmlFilePath = "{$this->directoryPath}/html.xml";

        $xml = $this->loadXML($this->storage->get($xmlFilePath));

        if ($xml === null) {
            // Nothing usable was produced, so the working directory is of no use.
            $this->storage->deleteDirectory($this->directoryPath);

            throw new \Exception('Cannot parse the XML generated by pdftohtml.');
        }

        [$orderedList, $fonts] = $this->collectPageItems($xml);

        $hasImages = false;
        $hasText = false;
        $imagesCount = 0;
        $htmlContents = '';

        try {
            foreach ($orderedList as $rows) {
                $html = '';
                $footerImages = [];

                foreach ($rows as $items) {
                    $continuousP = '';
                    $firstOfText = true;

                    foreach ($items as $key => $p) {
                        $name = $p->getName();

                        if ($name === 'image') {
                            $imageInFooter = false;
                            $basePath = $this->storage->path('');
                            // Turn the absolute image path into one relative to the storage root.
                            $imageFilePath = str_replace($basePath, '', (string) $p['src']);

                            try {
                                $textContents = $this->applyOCR($imageFilePath);
                            } catch (TesseractOcrException $e) {
                                // Could not get text content from image. This means the image doesn't have text.
                                $textContents = '';
                            }

                            if ($textContents) {
                                $imageInFooter = true;

                                // Flush what was rendered so far so the OCR text keeps its position.
                                if ($html) {
                                    $htmlContents .= $html;
                                    $html = '';
                                }

                                // NOTE(review): OCR output is inserted as-is; the format returned by
                                // the project's OCR class is not visible here, so it is not escaped.
                                $htmlContents .= "<div>$textContents</div>";

                                // The image has been replaced by its text, the file is no longer needed.
                                $this->storage->delete($imageFilePath);

                                $hasText = true;
                            } else {
                                $hasImages = true;

                                $imagesCount += 1;
                                $caption = "Fig. $imagesCount";

                                $imageHTML = $this->handleImage($p, $caption);

                                // NOTE(review): $imageInFooter is reset to false for every image and only
                                // set to true in the OCR branch above, so the footer branch below is
                                // currently unreachable and $footerImages stays empty. Kept unchanged
                                // until the intended behaviour is confirmed.
                                if (! $imageInFooter) {
                                    $html .= $imageHTML;
                                } else {
                                    $html .= "<p> $caption </p>";
                                    $footerImages[] = $imageHTML;
                                }
                            }
                        } elseif ($name === 'text') {
                            // A lone bullet is not rendered on its own; it is prefixed to the next text.
                            if ((string) $p === '·') {
                                continue;
                            }

                            $addition = null;
                            if (isset($items[$key - 1]) && (string) $items[$key - 1] === '·') {
                                $addition = '· ';
                            }

                            $continuousP .= $this->handleText($p, $fonts, $addition, $firstOfText);

                            $firstOfText = false;
                            $hasText = true;
                        }
                    }

                    $html .= '<p>' . $continuousP . '</p>';
                }

                foreach ($footerImages as $footerImage) {
                    $html .= '<p>' . $footerImage . '</p>';
                }

                $htmlContents .= "<html><head></head><body>$html</body></html>";
            }
        } catch (\Exception $exception) {
            $this->storage->deleteDirectory($this->directoryPath);

            // Log the message together with the trace; the trace alone does not say what failed.
            Log::info($exception->getMessage() . PHP_EOL . $exception->getTraceAsString());

            throw $exception;
        }

        if (! $hasText && ! $hasImages) {
            // Remove directory because we do not have any use for it anymore.
            // NOTE(review): $htmlContents still holds the empty per-page wrappers in this case,
            // so execute() will write document.html afterwards — confirm whether that is intended.
            $this->storage->deleteDirectory($this->directoryPath);
        } else {
            // Remove the unnecessary 'xml' file.
            $this->storage->delete($xmlFilePath);
        }

        return $htmlContents;
    }

    /**
     * Build the HTML snippet for an embedded image.
     *
     * The image source is "./contracts-images/<basename of directoryPath>/<basename of image file>".
     *
     * @param \SimpleXMLElement $p       The `<image>` element (reads `src`, `width`, `height`).
     * @param string            $caption Used for both `alt` and `title`.
     *
     * @return string
     */
    protected function handleImage($p, $caption)
    {
        $src = './contracts-images/'
            . pathinfo($this->directoryPath, PATHINFO_BASENAME)
            . '/'
            . pathinfo((string) $p['src'], PATHINFO_BASENAME);

        $escapedCaption = $this->escape((string) $caption);

        $img = '<img width="' . $this->escape((string) $p['width']) . '"'
            . ' height="' . $this->escape((string) $p['height']) . '"'
            . ' src="' . $this->escape($src) . '"'
            . ' alt="' . $escapedCaption . '"'
            . ' title="' . $escapedCaption . '">';

        return '<br>' . $img . '<br>' . '<br>';
    }

    /**
     * Build the HTML for a single `<text>` element.
     *
     * The element is wrapped in the tag chosen by getTag() and styled with the
     * font referenced by its `font` attribute. Text taken from the XML is
     * HTML-escaped because it originates from the (untrusted) PDF.
     *
     * @param \SimpleXMLElement $p           The `<text>` element.
     * @param array             $fonts       Font table keyed by font id; each entry has
     *                                       'family', 'size' and 'color'.
     * @param string|null       $addition    Optional prefix (the bullet) placed before the content.
     * @param bool              $firstOfText Whether this is the first text of its line; only then
     *                                       a left padding is applied.
     *
     * @return string
     */
    protected function handleText($p, $fonts, $addition = null, $firstOfText = false)
    {
        // An unknown font id must not abort the conversion; its declarations are skipped.
        $font = $fonts[(int) $p['font']] ?? [];
        $fontSize = $font['size'] ?? null;

        $style = 'position: relative;';

        if (isset($font['color'])) {
            $style .= "color: {$font['color']};";
        }

        if (isset($font['family'])) {
            $style .= "font-family: {$font['family']};";
        }

        $style .= 'height: ' . (string) $p['height'] . 'px;';

        if ($firstOfText) {
            // The fixed 90 offset is presumably the page's left margin — TODO confirm.
            $style .= 'padding-left: ' . ((int) (string) $p['left'] - 90) . 'px;';
        }

        if ($fontSize !== null) {
            $style .= "font-size: {$fontSize}px;";
        }

        if ($p->i) {
            $content = '<i>' . $this->escape((string) $p->i) . '</i>';
        } elseif ($p->b) {
            $content = '<b>' . $this->escape((string) $p->b) . '</b>';
        } elseif ($p->a) {
            $label = (string) $p->a;
            // Use the real link target; fall back to the link text when no href is present.
            $href = isset($p->a['href']) ? (string) $p->a['href'] : $label;

            $content = $this->escape((string) $p)
                . '<a href="' . $this->escape($href) . '">' . $this->escape($label) . '</a>';
        } else {
            $content = $this->escape((string) $p);
        }

        if ($addition) {
            $content = $addition . $content;
        }

        $tag = $this->getTag($fontSize);

        return '<' . $tag . ' style="' . $this->escape($style) . '">' . $content . '</' . $tag . '>';
    }

    /**
     * Map a font size to the HTML tag used for the text.
     *
     * @param int|float|string|null $size Font size as read from the XML.
     *
     * @return string 'h1' (> 26), 'h2' (> 20), 'h3' (> 18) or 'span'.
     */
    protected function getTag($size)
    {
        // @TODO Needed to bump values up by 2, the XML loader gives different results on different servers.

        if ($size > 26) {
            return 'h1';
        }

        if ($size > 20) {
            return 'h2';
        }

        if ($size > 18) {
            return 'h3';
        }

        return 'span';
    }

    /**
     * Run OCR on an image.
     *
     * @param string $path Image path relative to the storage root.
     *
     * @return mixed Whatever OCR::execute() returns; treated as text by the caller.
     */
    protected function applyOCR($path)
    {
        $ocr = new OCR($this->storage->path($path));

        return $ocr->execute();
    }

    /**
     * Convert an HTML fragment to strict Markdown using `pandoc`.
     *
     * The fragment is written to a temporary "document.html" in the working
     * directory, which is removed again whether or not pandoc succeeds.
     *
     * @param string $contents HTML body fragment.
     *
     * @return string Markdown printed by pandoc.
     *
     * @throws \RuntimeException      When the temporary file cannot be written.
     * @throws ProcessFailedException When `pandoc` exits with a failure.
     */
    protected function convertHtmlToMD($contents)
    {
        $html = '<html><head><title></title></head><body>' . $contents . '</body></html>';
        $htmlFile = $this->storage->path($this->directoryPath) . '/document.html';

        if (file_put_contents($htmlFile, $html) === false) {
            throw new \RuntimeException('Cannot write temporary HTML file for pandoc.');
        }

        try {
            $process = new Process([
                'pandoc',
                '-f',
                'html',
                $htmlFile,
                '-t',
                'markdown_strict',
            ]);

            $process->run();

            if (! $process->isSuccessful()) {
                throw new ProcessFailedException($process);
            }

            return $process->getOutput();
        } finally {
            // Always clean up the temporary file, also when pandoc failed.
            if (is_file($htmlFile)) {
                unlink($htmlFile);
            }
        }
    }

    /**
     * Install the `pdftotext` Python package through `pip3`.
     *
     * HOME is taken from the USER_HOME_PATH environment setting and handed to
     * the pip3 process itself: an `export` run as a separate process is not an
     * executable and could never influence a later process.
     *
     * @return void
     *
     * @throws ProcessFailedException When `pip3` exits with a failure.
     */
    protected function prepareForConvertPDF()
    {
        $home = env('USER_HOME_PATH');
        $environment = $home !== null ? ['HOME' => $home] : null;

        $process = new Process(['pip3', 'install', 'pdftotext'], null, $environment);

        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }
    }

    /**
     * Parse XML without emitting libxml warnings.
     *
     * @param mixed $contents Raw XML; anything but a non-empty string is rejected.
     *
     * @return \SimpleXMLElement|null The document, or null when it cannot be parsed.
     */
    private function loadXML($contents): ?\SimpleXMLElement
    {
        if (! is_string($contents) || $contents === '') {
            return null;
        }

        $previous = libxml_use_internal_errors(true);

        try {
            $xml = simplexml_load_string($contents);
        } finally {
            libxml_clear_errors();
            libxml_use_internal_errors($previous);
        }

        return $xml === false ? null : $xml;
    }

    /**
     * Collect the font table and group every positioned element by page and
     * `top` coordinate, sorted top to bottom within each page.
     *
     * @param \SimpleXMLElement $xml Root element containing `<page>` children.
     *
     * @return array Two entries: [page number => [top => list of elements]], and
     *               [font id => ['family' => string, 'size' => string, 'color' => string]].
     */
    private function collectPageItems(\SimpleXMLElement $xml): array
    {
        $orderedList = [];
        $fonts = [];

        foreach ($xml->page as $page) {
            $pageNumber = (int) $page['number'];
            $orderedList[$pageNumber] = [];

            foreach ($page as $node) {
                if ($node->getName() === 'fontspec') {
                    $fonts[(int) $node['id']] = [
                        'family' => $this->stripSubsetPrefix((string) $node['family']),
                        'size' => (string) $node['size'],
                        'color' => (string) $node['color'],
                    ];
                }

                if (isset($node['top'])) {
                    $orderedList[$pageNumber][(int) $node['top']][] = $node;
                }
            }

            ksort($orderedList[$pageNumber]);
        }

        return [$orderedList, $fonts];
    }

    /**
     * Drop everything up to and including the first '+' of a font family name
     * (e.g. "ABCDEF+Times" becomes "Times"); names without '+' are returned unchanged.
     *
     * @param string $family
     *
     * @return string
     */
    private function stripSubsetPrefix(string $family): string
    {
        $plus = strpos($family, '+');

        if ($plus === false) {
            return $family;
        }

        return (string) substr($family, $plus + 1);
    }

    /**
     * Escape a value for use in HTML text or a quoted attribute.
     *
     * @param string $value
     *
     * @return string
     */
    private function escape(string $value): string
    {
        return htmlspecialchars($value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}
|