|
|
@ -2,7 +2,6 @@ |
|
|
|
|
|
|
|
namespace App\Ingest; |
|
|
|
|
|
|
|
use League\HTMLToMarkdown\HtmlConverter; |
|
|
|
use Symfony\Component\Process\Exception\ProcessFailedException; |
|
|
|
use Symfony\Component\Process\Process; |
|
|
|
|
|
|
@ -98,7 +97,7 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
foreach ($page as $items) { |
|
|
|
$continuousP = ''; |
|
|
|
|
|
|
|
foreach ($items as $p) { |
|
|
|
foreach ($items as $key => $p) { |
|
|
|
if ($p->getName() == 'image') { |
|
|
|
$basePath = $this->storage->path(''); |
|
|
|
$imageFilePath = str_replace($basePath, '', $p['src']); |
|
|
@ -138,7 +137,16 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
} |
|
|
|
|
|
|
|
if ($p->getName() == 'text') { |
|
|
|
$continuousP = $continuousP . $this->handleText($p, $fonts); |
|
|
|
if($p == '·') { |
|
|
|
continue; |
|
|
|
} |
|
|
|
|
|
|
|
$addition = null; |
|
|
|
if(isset($items[$key-1]) && $items[$key-1] == '·') { |
|
|
|
$addition = '· '; |
|
|
|
} |
|
|
|
|
|
|
|
$continuousP = $continuousP . $this->handleText($p, $fonts, $addition); |
|
|
|
|
|
|
|
$hasText = true; |
|
|
|
} |
|
|
@ -179,17 +187,17 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
{ |
|
|
|
$html = ''; |
|
|
|
|
|
|
|
$src = './' . pathinfo($p['src'], PATHINFO_BASENAME); |
|
|
|
$src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME); |
|
|
|
|
|
|
|
$html = $html . '<br>'; |
|
|
|
$html = $html . '<img style="position: absolute; top: ' . $p['top'] . 'px; left: ' . $p['left'] . 'px;" width="' . $p['width'] . '" height="' . $p['height'] . '" src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">'; |
|
|
|
$html = $html . '<img style="position: relative; width:' . $p['width'] . 'px; height:' . $p['height'] . 'px;" src="' . $src . '" alt="' . $caption . '" title="' . $caption . '">'; |
|
|
|
$html = $html . '<br>'; |
|
|
|
$html = $html . '<br>'; |
|
|
|
|
|
|
|
return $html; |
|
|
|
} |
|
|
|
|
|
|
|
protected function handleText($p, $fonts) |
|
|
|
protected function handleText($p, $fonts, $addition = null) |
|
|
|
{ |
|
|
|
$id = (int) $p['font']; |
|
|
|
$font_size = $fonts[$id]['size']; |
|
|
@ -197,7 +205,7 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
$font_family = $fonts[$id]['family']; |
|
|
|
|
|
|
|
$style = ''; |
|
|
|
$style = $style . 'position: absolute;'; |
|
|
|
$style = $style . 'position: relative;'; |
|
|
|
$style = $style . "color: $font_color;"; |
|
|
|
$style = $style . "font-family: $font_family;"; |
|
|
|
$style = $style . "font-weight: 900;"; |
|
|
@ -211,10 +219,16 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
$content = '<i>' . $p->i . '</i>'; |
|
|
|
} else if ($p->b) { |
|
|
|
$content = '<b>' . $p->b . '</b>'; |
|
|
|
} else if ($p->a) { |
|
|
|
$content = $p . '<a>' . $p->a . '</a>'; |
|
|
|
} else { |
|
|
|
$content = $p; |
|
|
|
} |
|
|
|
|
|
|
|
if($addition) { |
|
|
|
$content = $addition . $content; |
|
|
|
} |
|
|
|
|
|
|
|
$tag = $this->getTag($font_size); |
|
|
|
|
|
|
|
return '<' . $tag . ' style="' . $style . '">' . $content . '</' . $tag . '>'; |
|
|
@ -236,7 +250,7 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
return 'h3'; |
|
|
|
} |
|
|
|
|
|
|
|
return 'span'; |
|
|
|
return 'p'; |
|
|
|
} |
|
|
|
|
|
|
|
protected function applyOCR($path) |
|
|
@ -249,11 +263,27 @@ class PDFConvertor extends AbstractConvertor |
|
|
|
protected function convertHtmlToMD($contents) |
|
|
|
{ |
|
|
|
$html = '<html><head><title></title></head><body>' . $contents . '</body></html>'; |
|
|
|
$filepath = $this->storage->path($this->directoryPath); |
|
|
|
|
|
|
|
file_put_contents($filepath . '/document.html', $html); |
|
|
|
|
|
|
|
$converter = new HtmlConverter(); |
|
|
|
$converter->getConfig()->setOption('strip_tags', true); |
|
|
|
$process = new Process([ |
|
|
|
'pandoc', |
|
|
|
'-f', |
|
|
|
'html', |
|
|
|
$filepath . '/document.html', |
|
|
|
'-t', |
|
|
|
'markdown_strict', |
|
|
|
]); |
|
|
|
|
|
|
|
$process->run(); |
|
|
|
|
|
|
|
return $converter->convert($html); |
|
|
|
|
|
|
|
if (!$process->isSuccessful()) { |
|
|
|
throw new ProcessFailedException($process); |
|
|
|
} |
|
|
|
|
|
|
|
return $process->getOutput(); |
|
|
|
} |
|
|
|
|
|
|
|
protected function prepareForConvertPDF() |
|
|
|