Browse Source

improvements: display links, images style, remove break after header dots.

dev
Alex Puiu 2 years ago
parent
commit
ba9085d5bd
  1. 52
      app/Ingest/PDFConvertor.php
  2. 1
      composer.json

52
app/Ingest/PDFConvertor.php

@@ -2,7 +2,6 @@
namespace App\Ingest;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process;
@@ -98,7 +97,7 @@ class PDFConvertor extends AbstractConvertor
foreach ($page as $items) {
$continuousP = '';
foreach ($items as $p) {
foreach ($items as $key => $p) {
if ($p->getName() == 'image') {
$basePath = $this->storage->path('');
$imageFilePath = str_replace($basePath, '', $p['src']);
@@ -138,7 +137,16 @@ class PDFConvertor extends AbstractConvertor
}
if ($p->getName() == 'text') {
$continuousP = $continuousP . $this->handleText($p, $fonts);
if($p == '·') {
continue;
}
$addition = null;
if(isset($items[$key-1]) && $items[$key-1] == '·') {
$addition = '· ';
}
$continuousP = $continuousP . $this->handleText($p, $fonts, $addition);
$hasText = true;
}
@@ -179,17 +187,17 @@ class PDFConvertor extends AbstractConvertor
{
$html = '';
$src = './' . pathinfo($p['src'], PATHINFO_BASENAME);
$src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME);
$html = $html . '<br>';
$html = $html . '<img style="position: absolute; top: ' . $p['top'] . 'px; left: ' . $p['left'] . 'px;" width="' . $p['width'] . '" height="' . $p['height'] . '" src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">';
$html = $html . '<img style="position: relative; width:' . $p['width'] . 'px; height:' . $p['height'] . 'px;" src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">';
$html = $html . '<br>';
$html = $html . '<br>';
return $html;
}
protected function handleText($p, $fonts)
protected function handleText($p, $fonts, $addition = null)
{
$id = (int) $p['font'];
$font_size = $fonts[$id]['size'];
@@ -197,7 +205,7 @@ class PDFConvertor extends AbstractConvertor
$font_family = $fonts[$id]['family'];
$style = '';
$style = $style . 'position: absolute;';
$style = $style . 'position: relative;';
$style = $style . "color: $font_color;";
$style = $style . "font-family: $font_family;";
$style = $style . "font-weight: 900;";
@@ -211,10 +219,16 @@ class PDFConvertor extends AbstractConvertor
$content = '<i>' . $p->i . '</i>';
} else if ($p->b) {
$content = '<b>' . $p->b . '</b>';
} else if ($p->a) {
$content = $p . '<a>' . $p->a . '</a>';
} else {
$content = $p;
}
if($addition) {
$content = $addition . $content;
}
$tag = $this->getTag($font_size);
return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';
@@ -236,7 +250,7 @@ class PDFConvertor extends AbstractConvertor
return 'h3';
}
return 'span';
return 'p';
}
protected function applyOCR($path)
@@ -249,11 +263,27 @@ class PDFConvertor extends AbstractConvertor
protected function convertHtmlToMD($contents)
{
$html = '<html><head><title></title></head><body>' . $contents . '</body></html>';
$filepath = $this->storage->path($this->directoryPath);
file_put_contents($filepath . '/document.html', $html);
$converter = new HtmlConverter();
$converter->getConfig()->setOption('strip_tags', true);
$process = new Process([
'pandoc',
'-f',
'html',
$filepath . '/document.html',
'-t',
'markdown_strict',
]);
$process->run();
return $converter->convert($html);
if (!$process->isSuccessful()) {
throw new ProcessFailedException($process);
}
return $process->getOutput();
}
protected function prepareForConvertPDF()

1
composer.json

@@ -13,7 +13,6 @@
"fideloper/proxy": "^4.0",
"laravel/framework": "^6.2",
"laravel/tinker": "^2.0",
"league/html-to-markdown": "^5.0",
"phpoffice/phpword": "^0.17.0",
"predis/predis": "^1.1",
"spatie/laravel-webhook-server": "^1.13",

Loading…
Cancel
Save