You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
273 lines
8.0 KiB
273 lines
8.0 KiB
<?php
|
|
|
|
namespace App\Ingest;
|
|
|
|
use League\HTMLToMarkdown\HtmlConverter;
|
|
use Symfony\Component\Process\Exception\ProcessFailedException;
|
|
use Symfony\Component\Process\Process;
|
|
|
|
/**
 * Converts a PDF document into a Markdown file.
 *
 * The PDF is first turned into pdftohtml's XML representation. Pages that
 * contain text are rebuilt as simple HTML and converted to Markdown; documents
 * that contain only images are passed through OCR instead. In both cases the
 * result is stored as "document.md" inside $this->directoryPath.
 *
 * Relies on $this->storage, $this->path and $this->directoryPath, which are
 * not declared in this class (presumably provided by AbstractConvertor).
 */
class PDFConvertor extends AbstractConvertor
{
    /**
     * Run the conversion and write "document.md" into the working directory.
     *
     * @return void
     *
     * @throws \Exception When neither text nor images could be extracted.
     * @throws ProcessFailedException When pdftohtml exits with an error.
     */
    public function execute()
    {
        // $this->prepareForConvertPDF();

        $result = $this->getFileContents();

        if (! $result['has_images'] && ! $result['has_text']) {
            throw new \Exception('Cannot get pdf file contents.');
        }

        $markdownPath = "$this->directoryPath/document.md";

        if ($result['has_text']) {
            // The converter configuration is identical for every page, so one
            // instance is shared instead of building a new one per iteration.
            $converter = new HtmlConverter();
            $converter->getConfig()->setOption('strip_tags', true);

            $mdContents = '';

            foreach ($result['htmls'] as $html) {
                $mdContents .= "\n\n" . $converter->convert($html);
            }

            $this->storage->put($markdownPath, $mdContents);

            return;
        }

        // Only contains images.
        $imagesContent = '';

        // getFileContents() points pdftohtml's output at $this->directoryPath and
        // deletes $this->path afterwards, so the extracted images have to be
        // looked up in the working directory, not under the original file path.
        $files = $this->storage->allFiles($this->directoryPath);

        foreach ($files as $file) {
            // Only get the image files from the directory, it may contain some empty html files too.
            // @TODO Only OCR images with text and delete them afterwards, the remaining ignore and keep.
            $extension = strtolower(pathinfo($file, PATHINFO_EXTENSION));

            if (! in_array($extension, ['jpg', 'png'], true)) {
                continue;
            }

            $ocr = new OCR($this->storage->path($file));

            $imagesContent .= $ocr->execute();

            $this->storage->delete($file);
        }

        $this->storage->put($markdownPath, $imagesContent);
    }

    /**
     * Run pdftohtml on the source PDF, remove the PDF and parse the generated XML.
     *
     * @return array{has_images: bool, has_text: bool, htmls: array<int, string>}
     *
     * @throws ProcessFailedException When pdftohtml does not exit successfully.
     * @throws \Exception When building the HTML fails.
     */
    protected function getFileContents()
    {
        // pdftohtml receives "<directory>/html" as output name; the XML is then
        // read back from "<directory>/html.xml" in getDataFromXML().
        $outputPath = $this->storage->path("$this->directoryPath/html");

        $process = new Process([
            'pdftohtml',
            '-xml',
            $this->storage->path($this->path),
            $outputPath,
        ]);

        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        // Remove original document.
        $this->storage->delete($this->path);

        return $this->getDataFromXML();
    }

    /**
     * Read the pdftohtml XML and rebuild one HTML document per page.
     *
     * Elements are grouped by page and by their "top" coordinate so that items
     * on the same row end up in the same paragraph, ordered top to bottom.
     * When nothing usable is found the whole working directory is removed;
     * otherwise only the XML file is removed.
     *
     * @return array{has_images: bool, has_text: bool, htmls: array<int, string>}
     *
     * @throws \Exception When building the HTML fails ("Something went wrong.").
     */
    protected function getDataFromXML()
    {
        $xmlFilePath = "$this->directoryPath/html.xml";

        $xml = $this->loadXML($xmlFilePath);

        if ($xml === null) {
            // Missing or malformed XML is treated like a document without any
            // content: clean up and let the caller report the failure.
            $this->storage->deleteDirectory($this->directoryPath);

            return [
                'has_images' => false,
                'has_text' => false,
                'htmls' => [],
            ];
        }

        $orderedList = [];
        $fonts = [];

        foreach ($xml->page as $page) {
            $pageNumber = (int) $page['number'];

            $orderedList[$pageNumber] = [];

            foreach ($page->children() as $p) {
                if ($p->getName() === 'fontspec') {
                    $fonts[(int) $p['id']] = [
                        'family' => (string) $p['family'],
                        'size' => (string) $p['size'],
                        'color' => (string) $p['color'],
                    ];
                }

                if (isset($p['top'])) {
                    // Elements sharing the same vertical offset form one row.
                    $orderedList[$pageNumber][(int) $p['top']][] = $p;
                }
            }

            ksort($orderedList[$pageNumber]);
        }

        $htmls = [];
        $hasImages = false;
        $hasText = false;

        $imagesCount = 0;
        // When true, only a "Fig. N" caption is left at the image's position and
        // the image markup itself is appended at the end of the page.
        $imagesInFooter = true;

        try {
            foreach ($orderedList as $page) {
                $html = '';
                $footerImages = [];

                foreach ($page as $items) {
                    $continuousP = '';

                    foreach ($items as $p) {
                        $elementName = $p->getName();

                        if ($elementName === 'image') {
                            $hasImages = true;

                            $imagesCount += 1;
                            $caption = "Fig. $imagesCount";

                            $imageHTML = $this->handleImage($p, $caption);

                            if (! $imagesInFooter) {
                                $html .= $imageHTML;
                            } else {
                                $html .= "<p> $caption </p>";

                                $footerImages[] = $imageHTML;
                            }
                        } elseif ($elementName === 'text') {
                            $continuousP .= $this->handleText($p, $fonts);

                            $hasText = true;
                        }
                    }

                    $html .= '<p>' . $continuousP . '</p>';
                }

                // Only populated when $imagesInFooter is enabled.
                foreach ($footerImages as $footerImage) {
                    $html .= '<p>' . $footerImage . '</p>';
                }

                $htmls[] = '<html><head><title></title></head><body>' . $html . '</body></html>';
            }
        } catch (\Exception $exception) {
            $this->storage->deleteDirectory($this->directoryPath);

            // Log the real cause; callers only see the generic message below.
            \Illuminate\Support\Facades\Log::error($exception->getMessage(), ['exception' => $exception]);

            // Keep the original exception reachable through getPrevious().
            throw new \Exception('Something went wrong.', 0, $exception);
        }

        if (! $hasText && ! $hasImages) {
            // Remove directory because we do not have any use for it anymore.
            $this->storage->deleteDirectory($this->directoryPath);
        } else {
            // Remove the unnecessary 'xml' file.
            $this->storage->delete($xmlFilePath);
        }

        return [
            'has_images' => $hasImages,
            'has_text' => $hasText,
            'htmls' => $htmls,
        ];
    }

    /**
     * Build the HTML snippet for a pdftohtml <image> element.
     *
     * @param \SimpleXMLElement $p       The <image> element (src, top, left, width, height).
     * @param string            $caption Text used for the "alt" and "title" attributes.
     *
     * @return string
     */
    protected function handleImage($p, $caption)
    {
        // Only the file name is kept; the image is referenced relative to the document.
        $src = './' . pathinfo((string) $p['src'], PATHINFO_BASENAME);

        $style = 'position: absolute; top: ' . $p['top'] . 'px; left: ' . $p['left'] . 'px;';
        $escapedCaption = $this->escape($caption);

        $image = '<img style="' . $this->escape($style) . '"'
            . ' width="' . $this->escape($p['width']) . '"'
            . ' height="' . $this->escape($p['height']) . '"'
            . ' src="' . $this->escape($src) . '"'
            . ' alt="' . $escapedCaption . '"'
            . ' title="' . $escapedCaption . '">';

        return '<br>' . $image . '<br><br>';
    }

    /**
     * Build the HTML snippet for a pdftohtml <text> element.
     *
     * The wrapping tag (h1/h2/h3/span) is chosen from the font size, and the
     * element's inline content is rebuilt with its text escaped.
     *
     * @param \SimpleXMLElement                                                   $p     The <text> element.
     * @param array<int, array{family: string, size: string, color: string}>     $fonts Font specs keyed by font id.
     *
     * @return string
     */
    protected function handleText($p, $fonts)
    {
        $id = (int) $p['font'];

        // A <text> element may reference a font id without a matching <fontspec>;
        // fall back to empty values instead of reading undefined array keys.
        $font = $fonts[$id] ?? ['family' => '', 'size' => '', 'color' => ''];

        $font_size = $font['size'];
        $font_color = $font['color'];
        $font_family = $font['family'];

        $style = 'position: absolute;'
            . "color: $font_color;"
            . "font-family: $font_family;"
            . 'font-weight: 900;'
            . 'width: ' . $p['width'] . 'px;'
            . 'height: ' . $p['height'] . 'px;'
            . 'top: ' . $p['top'] . 'px;'
            . 'left: ' . $p['left'] . 'px;'
            . "font-size: $font_size" . 'px;';

        // Casting a SimpleXMLElement to string yields only its direct text nodes,
        // which loses nested (<i><b>..</b></i>) and mixed ("Hello <b>world</b>")
        // content. Walking the DOM keeps every piece of text in document order.
        $content = $this->renderInlineContent(dom_import_simplexml($p));

        $tag = $this->getTag($font_size);

        return '<' . $tag . ' style="' . $this->escape($style) . '">' . $content . '</' . $tag . '>';
    }

    /**
     * Map a font size to the HTML tag used for the text.
     *
     * @param int|float|string $size Font size as read from the <fontspec> element.
     *
     * @return string 'h1', 'h2', 'h3' or 'span'.
     */
    protected function getTag($size)
    {
        // @TODO Needed to bump values up by 2, the XML loader gives different results on different servers.

        // The size arrives as a string; compare numerically so that a non-numeric
        // value falls through to 'span' instead of being compared as a string.
        $size = (float) $size;

        if ($size > 26) {
            return 'h1';
        }

        if ($size > 20) {
            return 'h2';
        }

        if ($size > 18) {
            return 'h3';
        }

        return 'span';
    }

    /**
     * Install the "pdftotext" Python package through pip3.
     *
     * @return void
     *
     * @throws ProcessFailedException When pip3 does not exit successfully.
     */
    protected function prepareForConvertPDF()
    {
        // "export" is a shell builtin and cannot be started as a process, nor
        // would it affect a later process. HOME is therefore handed to pip3
        // through the process environment instead.
        $home = env('USER_HOME_PATH');
        $environment = $home !== null ? ['HOME' => $home] : null;

        $process = new Process(
            [
                'pip3',
                'install',
                'pdftotext',
            ],
            null,
            $environment
        );

        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }
    }

    /**
     * Load and parse the XML file, returning null when it is empty or malformed.
     *
     * @param string $xmlFilePath Storage-relative path of the XML file.
     *
     * @return \SimpleXMLElement|null
     */
    private function loadXML($xmlFilePath)
    {
        $contents = $this->storage->get($xmlFilePath);

        if (! is_string($contents) || trim($contents) === '') {
            return null;
        }

        // Collect parser errors internally instead of emitting PHP warnings for
        // malformed XML, then restore the previous libxml setting.
        $previousSetting = libxml_use_internal_errors(true);
        $xml = simplexml_load_string($contents);
        libxml_clear_errors();
        libxml_use_internal_errors($previousSetting);

        return $xml === false ? null : $xml;
    }

    /**
     * Rebuild the inline HTML of a node: text is escaped, <i>/<b> are kept and
     * any other element is replaced by its (recursively rendered) content.
     *
     * @param \DOMNode $node
     *
     * @return string
     */
    private function renderInlineContent(\DOMNode $node): string
    {
        $html = '';

        foreach ($node->childNodes as $child) {
            if ($child instanceof \DOMText) {
                $html .= $this->escape($child->nodeValue);

                continue;
            }

            if (! $child instanceof \DOMElement) {
                continue;
            }

            $inner = $this->renderInlineContent($child);
            $name = strtolower($child->nodeName);

            $html .= in_array($name, ['i', 'b'], true)
                ? '<' . $name . '>' . $inner . '</' . $name . '>'
                : $inner;
        }

        return $html;
    }

    /**
     * Escape a value for use in HTML text or inside a quoted attribute.
     *
     * @param mixed $value Anything castable to string.
     *
     * @return string
     */
    private function escape($value): string
    {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    }
}
|