Browse Source

improvements: display links, images style, remove break after header dots.

dev
Alex Puiu 2 years ago
parent
commit
ba9085d5bd
  1. 52
      app/Ingest/PDFConvertor.php
  2. 1
      composer.json

52
app/Ingest/PDFConvertor.php

@@ -2,7 +2,6 @@
namespace App\Ingest; namespace App\Ingest;
use League\HTMLToMarkdown\HtmlConverter;
use Symfony\Component\Process\Exception\ProcessFailedException; use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process; use Symfony\Component\Process\Process;
@@ -98,7 +97,7 @@ class PDFConvertor extends AbstractConvertor
foreach ($page as $items) { foreach ($page as $items) {
$continuousP = ''; $continuousP = '';
foreach ($items as $p) {
foreach ($items as $key => $p) {
if ($p->getName() == 'image') { if ($p->getName() == 'image') {
$basePath = $this->storage->path(''); $basePath = $this->storage->path('');
$imageFilePath = str_replace($basePath, '', $p['src']); $imageFilePath = str_replace($basePath, '', $p['src']);
@@ -138,7 +137,16 @@ class PDFConvertor extends AbstractConvertor
} }
if ($p->getName() == 'text') { if ($p->getName() == 'text') {
$continuousP = $continuousP . $this->handleText($p, $fonts);
if($p == '·') {
continue;
}
$addition = null;
if(isset($items[$key-1]) && $items[$key-1] == '·') {
$addition = '· ';
}
$continuousP = $continuousP . $this->handleText($p, $fonts, $addition);
$hasText = true; $hasText = true;
} }
@@ -179,17 +187,17 @@ class PDFConvertor extends AbstractConvertor
{ {
$html = ''; $html = '';
$src = './' . pathinfo($p['src'], PATHINFO_BASENAME);
$src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME);
$html = $html . '<br>'; $html = $html . '<br>';
$html = $html . '<img style="position: absolute; top: ' . $p['top'] . 'px; left: ' . $p['left'] . 'px;" width="' . $p['width'] . '" height="' . $p['height'] . '" src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">';
$html = $html . '<img style="position: relative; width:' . $p['width'] . 'px; height:' . $p['height'] . 'px;" src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">';
$html = $html . '<br>'; $html = $html . '<br>';
$html = $html . '<br>'; $html = $html . '<br>';
return $html; return $html;
} }
protected function handleText($p, $fonts)
protected function handleText($p, $fonts, $addition = null)
{ {
$id = (int) $p['font']; $id = (int) $p['font'];
$font_size = $fonts[$id]['size']; $font_size = $fonts[$id]['size'];
@@ -197,7 +205,7 @@ class PDFConvertor extends AbstractConvertor
$font_family = $fonts[$id]['family']; $font_family = $fonts[$id]['family'];
$style = ''; $style = '';
$style = $style . 'position: absolute;';
$style = $style . 'position: relative;';
$style = $style . "color: $font_color;"; $style = $style . "color: $font_color;";
$style = $style . "font-family: $font_family;"; $style = $style . "font-family: $font_family;";
$style = $style . "font-weight: 900;"; $style = $style . "font-weight: 900;";
@@ -211,10 +219,16 @@ class PDFConvertor extends AbstractConvertor
$content = '<i>' . $p->i . '</i>'; $content = '<i>' . $p->i . '</i>';
} else if ($p->b) { } else if ($p->b) {
$content = '<b>' . $p->b . '</b>'; $content = '<b>' . $p->b . '</b>';
} else if ($p->a) {
$content = $p . '<a>' . $p->a . '</a>';
} else { } else {
$content = $p; $content = $p;
} }
if($addition) {
$content = $addition . $content;
}
$tag = $this->getTag($font_size); $tag = $this->getTag($font_size);
return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>'; return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>';
@@ -236,7 +250,7 @@ class PDFConvertor extends AbstractConvertor
return 'h3'; return 'h3';
} }
return 'span';
return 'p';
} }
protected function applyOCR($path) protected function applyOCR($path)
@@ -249,11 +263,27 @@ class PDFConvertor extends AbstractConvertor
protected function convertHtmlToMD($contents) protected function convertHtmlToMD($contents)
{ {
$html = '<html><head><title></title></head><body>' . $contents . '</body></html>'; $html = '<html><head><title></title></head><body>' . $contents . '</body></html>';
$filepath = $this->storage->path($this->directoryPath);
file_put_contents($filepath . '/document.html', $html);
$converter = new HtmlConverter();
$converter->getConfig()->setOption('strip_tags', true);
$process = new Process([
'pandoc',
'-f',
'html',
$filepath . '/document.html',
'-t',
'markdown_strict',
]);
$process->run();
return $converter->convert($html);
if (!$process->isSuccessful()) {
throw new ProcessFailedException($process);
}
return $process->getOutput();
} }
protected function prepareForConvertPDF() protected function prepareForConvertPDF()

1
composer.json

@@ -13,7 +13,6 @@
"fideloper/proxy": "^4.0", "fideloper/proxy": "^4.0",
"laravel/framework": "^6.2", "laravel/framework": "^6.2",
"laravel/tinker": "^2.0", "laravel/tinker": "^2.0",
"league/html-to-markdown": "^5.0",
"phpoffice/phpword": "^0.17.0", "phpoffice/phpword": "^0.17.0",
"predis/predis": "^1.1", "predis/predis": "^1.1",
"spatie/laravel-webhook-server": "^1.13", "spatie/laravel-webhook-server": "^1.13",

Loading…
Cancel
Save