You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
275 lines
8.2 KiB
275 lines
8.2 KiB
<?php
|
|
|
|
namespace App\Ingest;
|
|
|
|
use League\HTMLToMarkdown\HtmlConverter;
|
|
use Symfony\Component\Process\Exception\ProcessFailedException;
|
|
use Symfony\Component\Process\Process;
|
|
|
|
class PDFConvertor extends AbstractConvertor
|
|
{
|
|
/**
 * Convert the PDF to Markdown and store the result as "document.md"
 * inside the conversion directory.
 *
 * @throws \Exception When the conversion produced no contents.
 */
public function execute()
{
    // Note: the prepareForConvertPDF() step is currently not invoked here.
    $markdown = $this->getFileContents();

    if ($markdown) {
        $this->storage->put("$this->directoryPath/document.md", $markdown);

        return;
    }

    throw new \Exception('Cannot get pdf file contents.');
}
|
|
|
|
/**
 * Run `pdftohtml -xml` on the stored PDF, delete the original file and
 * return the Markdown built from the generated XML.
 *
 * @return string
 *
 * @throws ProcessFailedException When pdftohtml exits with a non-zero code.
 */
protected function getFileContents()
{
    // pdftohtml receives the output path "<directory>/html" as its last argument.
    $outputBase = $this->storage->path("$this->directoryPath/html");
    $sourcePdf = $this->storage->path($this->path);

    $command = ['pdftohtml', '-xml', $sourcePdf, $outputBase];

    // mustRun() runs the command and throws ProcessFailedException on failure.
    (new Process($command))->mustRun();

    // The original document is no longer needed once it has been converted.
    $this->storage->delete($this->path);

    return $this->getDataFromXML();
}
|
|
|
|
/**
 * Build the Markdown document from the "html.xml" file in the conversion directory.
 *
 * The children of every <page> are grouped by their "top" attribute and
 * processed in ascending "top" order. <text> elements are rendered to HTML
 * and converted to Markdown. <image> elements are first passed through OCR:
 * when text is recognised it replaces the image (and the image file is
 * deleted), otherwise the image is kept as a captioned figure appended at
 * the end of its page.
 *
 * Side effects: deletes the XML file on success, or the whole conversion
 * directory when nothing usable was found or processing failed.
 *
 * @return string Markdown for the whole document ('' when neither text nor images were found).
 *
 * @throws \Exception When the XML cannot be parsed or an element cannot be processed.
 */
protected function getDataFromXML()
{
    $xmlFilePath = "$this->directoryPath/html.xml";

    $contents = $this->storage->get($xmlFilePath);

    // simplexml_load_string() returns false on malformed input and emits
    // warnings unless libxml errors are collected internally, so switch to
    // internal error handling for the duration of the parse.
    $previousErrorMode = libxml_use_internal_errors(true);
    $xml = is_string($contents) ? simplexml_load_string($contents) : false;
    libxml_clear_errors();
    libxml_use_internal_errors($previousErrorMode);

    if ($xml === false) {
        // Same clean-up as the other failure path below.
        $this->storage->deleteDirectory($this->directoryPath);

        throw new \Exception('Cannot parse pdf xml contents.');
    }

    // [page number => [top => [elements...]]]
    $orderedList = [];
    // [font id => ['family' => ..., 'size' => ..., 'color' => ...]]
    $fonts = [];

    foreach ($xml->page as $page) {
        $pageNumber = (int) $page['number'][0];

        $orderedList[$pageNumber] = [];

        foreach ($page as $p) {
            if ($p->getName() === 'fontspec') {
                $fonts[(int) $p['id']] = [
                    'family' => (string) $p['family'],
                    'size' => (string) $p['size'],
                    'color' => (string) $p['color'],
                ];
            }

            // Only positioned elements (those carrying a "top" attribute) are laid out.
            if (isset($p['top'])) {
                $orderedList[$pageNumber][(int) $p['top']][] = $p;
            }
        }

        // Process the rows of a page from top to bottom.
        ksort($orderedList[$pageNumber]);
    }

    $hasImages = false;
    $hasText = false;

    $imagesCount = 0;
    // Images are currently always collected and appended after the page's
    // text; only their caption is left at the original position.
    $imagesInFooter = true;

    $mdContents = '';

    try {
        // Image paths in the XML are turned into storage-relative paths by
        // stripping the storage root; the root does not change per image.
        $basePath = $this->storage->path('');

        foreach ($orderedList as $page) {
            $html = '';
            $footerImages = [];

            foreach ($page as $items) {
                $continuousP = '';

                foreach ($items as $p) {
                    if ($p->getName() == 'image') {
                        $imageFilePath = str_replace($basePath, '', (string) $p['src']);

                        $textContents = $this->applyOCR($imageFilePath);

                        if ($textContents) {
                            // Flush the HTML gathered so far so the recognised
                            // text lands after it in the output.
                            if ($html) {
                                $mdContents .= $this->convertHtmlToMD($html) . "\n\n";

                                $html = '';
                            }

                            $mdContents .= $textContents . "\n\n";

                            // The image has been replaced by its text.
                            $this->storage->delete($imageFilePath);

                            $hasText = true;
                        } else {
                            $hasImages = true;

                            $imagesCount += 1;
                            $caption = "Fig. $imagesCount";

                            $imageHTML = $this->handleImage($p, $caption);

                            if ( ! $imagesInFooter) {
                                $html .= $imageHTML;
                            } else {
                                $html .= "<p> $caption </p>";

                                $footerImages[] = $imageHTML;
                            }
                        }
                    }

                    if ($p->getName() == 'text') {
                        $continuousP .= $this->handleText($p, $fonts);

                        $hasText = true;
                    }
                }

                // All text sharing the same "top" value forms one paragraph.
                $html .= '<p>' . $continuousP . '</p>';
            }

            if ($imagesInFooter) {
                foreach ($footerImages as $footerImage) {
                    $html .= '<p>' . $footerImage . '</p>';
                }
            }

            $mdContents .= $this->convertHtmlToMD($html) . "\n\n";
        }
    } catch (\Exception $exception) {
        $this->storage->deleteDirectory($this->directoryPath);

        // Log the message together with the trace so the failure can be diagnosed.
        \Illuminate\Support\Facades\Log::info(
            $exception->getMessage() . "\n" . $exception->getTraceAsString()
        );

        // Keep the generic message for callers but preserve the cause.
        throw new \Exception('Something went wrong.', 0, $exception);
    }

    if ( ! $hasText && ! $hasImages) {
        // Remove directory because we do not have any use for it anymore.
        $this->storage->deleteDirectory($this->directoryPath);
    } else {
        // Remove the unnecessary 'xml' file.
        $this->storage->delete($xmlFilePath);
    }

    return $mdContents;
}
|
|
|
|
/**
 * Render an image element as an absolutely positioned <img> tag surrounded
 * by line breaks.
 *
 * @param \SimpleXMLElement $p       Image element; its src, top, left, width and height attributes are read.
 * @param string            $caption Text used for both the alt and title attributes.
 *
 * @return string
 */
protected function handleImage($p, $caption)
{
    // Only the file name is kept; the image is referenced relative to the document.
    $fileName = pathinfo($p['src'], PATHINFO_BASENAME);

    $imageTag = sprintf(
        '<img style="position: absolute; top: %spx; left: %spx;" width="%s" height="%s" src="./%s" alt="%s" title="%s">',
        $p['top'],
        $p['left'],
        $p['width'],
        $p['height'],
        $fileName,
        $caption,
        $caption
    );

    return '<br>' . $imageTag . '<br><br>';
}
|
|
|
|
/**
 * Render a text element as an absolutely positioned HTML fragment.
 *
 * The wrapping tag (h1/h2/h3/span) is chosen from the font size via getTag().
 * The element's complete inner markup is preserved: plain text, inline
 * children such as <i> and <b>, and any mix or nesting of them.
 *
 * @param \SimpleXMLElement $p     Text element; its font, top, left, width and height attributes are read.
 * @param array             $fonts Font specs keyed by font id, each holding 'family', 'size' and 'color'.
 *
 * @return string
 */
protected function handleText($p, $fonts)
{
    $id = (int) $p['font'];

    // Fall back to empty values when the element references a font id that
    // is not present in $fonts, instead of raising undefined-index notices.
    $font = isset($fonts[$id]) ? $fonts[$id] : [];
    $font_size = isset($font['size']) ? $font['size'] : '';
    $font_color = isset($font['color']) ? $font['color'] : '';
    $font_family = isset($font['family']) ? $font['family'] : '';

    $style = 'position: absolute;'
        . "color: $font_color;"
        . "font-family: $font_family;"
        . 'font-weight: 900;'
        . 'width: ' . $p['width'] . 'px;'
        . 'height: ' . $p['height'] . 'px;'
        . 'top: ' . $p['top'] . 'px;'
        . 'left: ' . $p['left'] . 'px;'
        . "font-size: $font_size" . 'px;';

    // Serialise every child node of the element. Casting the element (or
    // only its first <i>/<b> child) to a string would drop sibling text and
    // nested markup, and would emit decoded "<" / "&" characters unescaped.
    $content = '';
    $node = dom_import_simplexml($p);

    if ($node instanceof \DOMNode) {
        foreach ($node->childNodes as $child) {
            $content .= $node->ownerDocument->saveXML($child);
        }
    }

    $tag = $this->getTag($font_size);

    // The style value is attribute content, so escape quotes and angle brackets.
    $styleAttribute = htmlspecialchars($style, ENT_QUOTES, 'UTF-8');

    return '<' . $tag . ' style="' . $styleAttribute . '">' . $content . '</' . $tag . '>';
}
|
|
|
|
/**
 * Pick the HTML tag used for a piece of text from its font size.
 *
 * @param int|string $size Font size.
 *
 * @return string 'h1' above 26, 'h2' above 20, 'h3' above 18, otherwise 'span'.
 */
protected function getTag($size)
{
    // @TODO Needed to bump values up by 2, the XML loader gives different results on different servers.
    // Ordered from the largest threshold down; the first one exceeded wins.
    $thresholds = [
        'h1' => 26,
        'h2' => 20,
        'h3' => 18,
    ];

    foreach ($thresholds as $tag => $threshold) {
        if ($size > $threshold) {
            return $tag;
        }
    }

    return 'span';
}
|
|
|
|
/**
 * Run OCR on a stored file and return whatever the OCR run produces.
 *
 * @param string $path Path of the file, resolved to a full path through the storage.
 *
 * @return mixed Result of OCR::execute().
 */
protected function applyOCR($path)
{
    $absolutePath = $this->storage->path($path);

    return (new OCR($absolutePath))->execute();
}
|
|
|
|
/**
 * Convert an HTML fragment to Markdown.
 *
 * The fragment is wrapped in a minimal HTML document and converted with the
 * 'strip_tags' option enabled.
 *
 * @param string $contents HTML fragment placed inside the <body>.
 *
 * @return string
 */
protected function convertHtmlToMD($contents)
{
    $markdownConverter = new HtmlConverter();
    $markdownConverter->getConfig()->setOption('strip_tags', true);

    $document = sprintf('<html><head><title></title></head><body>%s</body></html>', $contents);

    return $markdownConverter->convert($document);
}
|
|
|
|
/**
 * Install the "pdftotext" package with pip3.
 *
 * When USER_HOME_PATH is configured, it is passed to pip3 as the HOME
 * environment variable.
 *
 * NOTE(review): the only visible call to this method is commented out in
 * execute(), and the conversion itself shells out to pdftohtml — confirm
 * this install step is still required before re-enabling it.
 *
 * @throws ProcessFailedException When pip3 exits with a non-zero code.
 */
protected function prepareForConvertPDF()
{
    $home = env('USER_HOME_PATH');

    // Running "export HOME=..." as its own process cannot change the
    // environment of a later process (and as a single array element it is
    // not even a runnable command), so HOME is handed directly to the
    // process that needs it. A null environment leaves it untouched.
    $environment = $home ? ['HOME' => $home] : null;

    $process = new Process(
        [
            'pip3',
            'install',
            'pdftotext',
        ],
        null,
        $environment
    );

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }
}
|
|
}
|