|
|
<?php
namespace App\Ingest;
use League\HTMLToMarkdown\HtmlConverter; use Symfony\Component\Process\Exception\ProcessFailedException; use Symfony\Component\Process\Process;
/**
 * Converts a stored PDF into a Markdown document.
 *
 * The PDF is run through the `pdftohtml -xml` command-line tool. The XML it
 * produces is turned into one HTML document per page, and those documents are
 * converted to Markdown. When the PDF yields images but no text, the extracted
 * images are passed through OCR instead. Either way the result is written to
 * "<directoryPath>/document.md".
 *
 * Relies on $this->storage, $this->path and $this->directoryPath, which are not
 * declared in this class (presumably provided by AbstractConvertor).
 */
class PDFConvertor extends AbstractConvertor
{
    /**
     * Run the conversion and store the resulting Markdown file.
     *
     * @return void
     *
     * @throws ProcessFailedException When the pdftohtml process fails.
     * @throws \Exception             When the PDF yields neither text nor images,
     *                                or when its XML cannot be processed.
     */
    public function execute()
    {
        // $this->prepareForConvertPDF();
        $result = $this->getFileContents();

        if (! $result['has_images'] && ! $result['has_text']) {
            throw new \Exception('Cannot get pdf file contents.');
        }

        $documentPath = "$this->directoryPath/document.md";

        if ($result['has_text']) {
            // One converter is enough for all pages; it is configured once.
            $converter = new HtmlConverter();
            $converter->getConfig()->setOption('strip_tags', true);

            $mdContents = '';
            foreach ($result['htmls'] as $html) {
                $mdContents .= "\n\n" . $converter->convert($html);
            }

            $this->storage->put($documentPath, $mdContents);

            return;
        }

        // Only contains images.
        // The images are looked up in the output directory: $this->path is the
        // source PDF, which getFileContents() has already deleted, while
        // pdftohtml was told to write its output under "$this->directoryPath/html".
        // NOTE(review): assumes pdftohtml places the extracted images next to
        // that output prefix - confirm against a real run.
        $imagesContent = '';

        foreach ($this->storage->allFiles($this->directoryPath) as $file) {
            // Only get the image files from the directory, it may contain some empty html files too.
            // @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep.
            if (! in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'], true)) {
                continue;
            }

            $ocr = new OCR($this->storage->path($file));
            $imagesContent .= $ocr->execute();
            $this->storage->delete($file);
        }

        $this->storage->put($documentPath, $imagesContent);
    }

    /**
     * Run pdftohtml on the source PDF, delete the PDF and parse the XML output.
     *
     * @return array Same structure as getDataFromXML().
     *
     * @throws ProcessFailedException When pdftohtml exits unsuccessfully.
     * @throws \Exception             When the generated XML cannot be processed.
     */
    protected function getFileContents()
    {
        $outputPath = $this->storage->path("$this->directoryPath/html");

        $process = new Process([
            'pdftohtml',
            '-xml',
            $this->storage->path($this->path),
            $outputPath,
        ]);
        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        // Remove original document.
        $this->storage->delete($this->path);

        return $this->getDataFromXML();
    }

    /**
     * Build one HTML document per page from "<directoryPath>/html.xml".
     *
     * Elements that carry a "top" attribute are grouped by page and by their
     * "top" value, and each group is emitted as one paragraph. Images are
     * numbered across the whole document ("Fig. N"): a caption paragraph is
     * placed where the image occurs and the image markup itself is appended at
     * the end of its page.
     *
     * Storage side effects: the output directory is deleted when the XML cannot
     * be parsed, when building the HTML fails, or when no text and no images
     * were found; otherwise only the XML file is deleted.
     *
     * @return array Array with the keys 'has_images' (bool), 'has_text' (bool)
     *               and 'htmls' (list of HTML strings, one per page).
     *
     * @throws \Exception When the XML cannot be parsed or the HTML cannot be built.
     */
    protected function getDataFromXML()
    {
        $xmlFilePath = "$this->directoryPath/html.xml";
        $xml = $this->loadPdfXml($xmlFilePath);

        if ($xml === false) {
            // Nothing usable was produced, so the output directory is removed,
            // as the other failure paths of this method do.
            $this->storage->deleteDirectory($this->directoryPath);

            throw new \Exception('Cannot parse pdf file contents.');
        }

        $orderedList = [];
        $fonts = [];

        foreach ($xml->page as $page) {
            $pageNumber = (int) $page['number'][0];
            $orderedList[$pageNumber] = [];

            foreach ($page as $p) {
                if ($p->getName() === 'fontspec') {
                    $fonts[(int) $p['id']] = [
                        'family' => (string) $p['family'],
                        'size' => (string) $p['size'],
                        'color' => (string) $p['color'],
                    ];
                }

                if (isset($p['top'])) {
                    $orderedList[$pageNumber][(int) $p['top']][] = $p;
                }
            }

            // Emit the groups of a page from the smallest "top" value to the largest.
            ksort($orderedList[$pageNumber]);
        }

        $htmls = [];
        $hasImages = false;
        $hasText = false;
        $imagesCount = 0;

        try {
            foreach ($orderedList as $page) {
                $html = '';
                $footerImages = [];

                foreach ($page as $items) {
                    $continuousP = '';

                    foreach ($items as $p) {
                        $name = $p->getName();

                        if ($name === 'image') {
                            $hasImages = true;
                            $imagesCount += 1;
                            $caption = "Fig. $imagesCount";

                            // Caption in place, image markup at the end of the page.
                            $html .= "<p> $caption </p>";
                            $footerImages[] = $this->handleImage($p, $caption);
                        } elseif ($name === 'text') {
                            $continuousP .= $this->handleText($p, $fonts);
                            $hasText = true;
                        }
                    }

                    $html .= '<p>' . $continuousP . '</p>';
                }

                foreach ($footerImages as $footerImage) {
                    $html .= '<p>' . $footerImage . '</p>';
                }

                $htmls[] = '<html><head><title></title></head><body>' . $html . '</body></html>';
            }
        } catch (\Exception $exception) {
            $this->storage->deleteDirectory($this->directoryPath);
            \Illuminate\Support\Facades\Log::info($exception->getTraceAsString());

            // Keep the generic message, but chain the cause so it is not lost.
            throw new \Exception('Something went wrong.', 0, $exception);
        }

        if (! $hasText && ! $hasImages) {
            // Remove directory because we do not have any use for it anymore.
            $this->storage->deleteDirectory($this->directoryPath);
        } else {
            // Remove the unnecessary 'xml' file.
            $this->storage->delete($xmlFilePath);
        }

        return [
            'has_images' => $hasImages,
            'has_text' => $hasText,
            'htmls' => $htmls,
        ];
    }

    /**
     * Build the HTML markup for one <image> element.
     *
     * @param \SimpleXMLElement $p       Element providing the "src", "top",
     *                                   "left", "width" and "height" attributes.
     * @param string            $caption Used as both "alt" and "title".
     *
     * @return string An <img> tag preceded by one <br> and followed by two.
     */
    protected function handleImage($p, $caption)
    {
        // Only the file name is kept, referenced relative to the document.
        $src = './' . pathinfo((string) $p['src'], PATHINFO_BASENAME);

        return sprintf(
            '<br><img style="position: absolute; top: %spx; left: %spx;" width="%s" height="%s" src="%s" alt="%s" title="%s"><br><br>',
            $this->escapeHtmlValue($p['top']),
            $this->escapeHtmlValue($p['left']),
            $this->escapeHtmlValue($p['width']),
            $this->escapeHtmlValue($p['height']),
            $this->escapeHtmlValue($src),
            $this->escapeHtmlValue($caption),
            $this->escapeHtmlValue($caption)
        );
    }

    /**
     * Build the HTML markup for one <text> element.
     *
     * The element is wrapped in the tag chosen by getTag() from its font size
     * and carries an inline style with its position, size and font settings.
     * The font-related declarations are left out when the referenced font id is
     * not present in $fonts.
     *
     * @param \SimpleXMLElement $p     Element providing the "font", "width",
     *                                 "height", "top" and "left" attributes.
     * @param array             $fonts Font definitions keyed by font id, each
     *                                 with the keys 'family', 'size' and 'color'.
     *
     * @return string
     */
    protected function handleText($p, $fonts)
    {
        $id = (int) $p['font'];
        $font = $fonts[$id] ?? null;

        $style = 'position: absolute;';
        if ($font !== null) {
            $style .= 'color: ' . $font['color'] . ';';
            $style .= 'font-family: ' . $font['family'] . ';';
        }
        $style .= 'font-weight: 900;';
        $style .= 'width: ' . $p['width'] . 'px;';
        $style .= 'height: ' . $p['height'] . 'px;';
        $style .= 'top: ' . $p['top'] . 'px;';
        $style .= 'left: ' . $p['left'] . 'px;';
        if ($font !== null) {
            $style .= 'font-size: ' . $font['size'] . 'px;';
        }

        // Walk the DOM children so that mixed content such as
        // "foo <i>bar</i> baz" keeps all of its text; SimpleXML alone only
        // exposes the text that sits directly inside a single element.
        $node = dom_import_simplexml($p);
        $content = $node instanceof \DOMNode
            ? $this->renderInlineNodes($node)
            : $this->escapeHtmlValue($p);

        $tag = $this->getTag($font === null ? 0 : $font['size']);

        return '<' . $tag . ' style="' . $this->escapeHtmlValue($style) . '">' . $content . '</' . $tag . '>';
    }

    /**
     * Pick the HTML tag for a text run based on its font size.
     *
     * @param int|float|string $size Font size; numeric strings are accepted.
     *
     * @return string 'h1', 'h2', 'h3' or 'span'.
     */
    protected function getTag($size)
    {
        // @TODO Needed to bump values up by 2, the XML loader gives different results on different servers.
        // Cast first: comparing a non-numeric string with an int behaves
        // differently between PHP 7 and PHP 8.
        $size = (float) $size;

        if ($size > 26) {
            return 'h1';
        }

        if ($size > 20) {
            return 'h2';
        }

        if ($size > 18) {
            return 'h3';
        }

        return 'span';
    }

    /**
     * Install the "pdftotext" package through pip3.
     *
     * HOME is handed to the pip3 process through its environment. Running
     * "export HOME=..." as a separate process cannot work: `export` is a shell
     * builtin, and a child process cannot change the environment of a process
     * started later.
     *
     * @return void
     *
     * @throws ProcessFailedException When pip3 exits unsuccessfully.
     */
    protected function prepareForConvertPDF()
    {
        $home = env('USER_HOME_PATH');

        $process = new Process(
            ['pip3', 'install', 'pdftotext'],
            null,
            $home === null ? null : ['HOME' => (string) $home]
        );
        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }
    }

    /**
     * Read and parse the XML file, without emitting libxml warnings.
     *
     * @param string $xmlFilePath Storage path of the XML file.
     *
     * @return \SimpleXMLElement|false False when the file is missing, empty or
     *                                 not well-formed.
     */
    private function loadPdfXml($xmlFilePath)
    {
        $contents = $this->storage->get($xmlFilePath);

        if (! is_string($contents) || $contents === '') {
            return false;
        }

        // Collect parse errors internally so a malformed file results in a
        // false return value instead of PHP warnings.
        $previous = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($contents);
        libxml_clear_errors();
        libxml_use_internal_errors($previous);

        return $xml;
    }

    /**
     * Render the children of a node as HTML: text is escaped, <b> and <i> are
     * kept, any other element is replaced by its rendered children.
     *
     * @param \DOMNode $node
     *
     * @return string
     */
    private function renderInlineNodes(\DOMNode $node)
    {
        $html = '';

        foreach ($node->childNodes as $child) {
            if ($child instanceof \DOMText) {
                $html .= $this->escapeHtmlValue($child->nodeValue);

                continue;
            }

            if (! $child instanceof \DOMElement) {
                continue;
            }

            $inner = $this->renderInlineNodes($child);
            $name = strtolower($child->nodeName);

            $html .= ($name === 'b' || $name === 'i')
                ? '<' . $name . '>' . $inner . '</' . $name . '>'
                : $inner;
        }

        return $html;
    }

    /**
     * Escape a value for use in HTML text or inside a quoted attribute.
     *
     * @param mixed $value Anything that can be cast to string.
     *
     * @return string
     */
    private function escapeHtmlValue($value)
    {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}
|