diff --git a/app/Ingest/PDFConvertor.php b/app/Ingest/PDFConvertor.php index 800b585..36af542 100644 --- a/app/Ingest/PDFConvertor.php +++ b/app/Ingest/PDFConvertor.php @@ -2,7 +2,6 @@ namespace App\Ingest; -use League\HTMLToMarkdown\HtmlConverter; use Symfony\Component\Process\Exception\ProcessFailedException; use Symfony\Component\Process\Process; @@ -98,7 +97,7 @@ class PDFConvertor extends AbstractConvertor foreach ($page as $items) { $continuousP = ''; - foreach ($items as $p) { + foreach ($items as $key => $p) { if ($p->getName() == 'image') { $basePath = $this->storage->path(''); $imageFilePath = str_replace($basePath, '', $p['src']); @@ -138,7 +137,16 @@ class PDFConvertor extends AbstractConvertor } if ($p->getName() == 'text') { - $continuousP = $continuousP . $this->handleText($p, $fonts); + if($p == '·') { + continue; + } + + $addition = null; + if(isset($items[$key-1]) && $items[$key-1] == '·') { + $addition = '· '; + } + + $continuousP = $continuousP . $this->handleText($p, $fonts, $addition); $hasText = true; } @@ -179,17 +187,17 @@ class PDFConvertor extends AbstractConvertor { $html = ''; - $src = './' . pathinfo($p['src'], PATHINFO_BASENAME); + $src = './contracts-images/' . pathinfo($this->directoryPath, PATHINFO_BASENAME) . '/' . pathinfo($p['src'], PATHINFO_BASENAME); $html = $html . '
'; - $html = $html . '' . $caption . ''; + $html = $html . '' . $caption . ''; $html = $html . '
'; $html = $html . '
'; return $html; } - protected function handleText($p, $fonts) + protected function handleText($p, $fonts, $addition = null) { $id = (int) $p['font']; $font_size = $fonts[$id]['size']; @@ -197,7 +205,7 @@ class PDFConvertor extends AbstractConvertor $font_family = $fonts[$id]['family']; $style = ''; - $style = $style . 'position: absolute;'; + $style = $style . 'position: relative;'; $style = $style . "color: $font_color;"; $style = $style . "font-family: $font_family;"; $style = $style . "font-weight: 900;"; @@ -211,10 +219,16 @@ class PDFConvertor extends AbstractConvertor $content = '' . $p->i . ''; } else if ($p->b) { $content = '' . $p->b . ''; + } else if ($p->a) { + $content = $p . '' . $p->a . ''; } else { $content = $p; } + if($addition) { + $content = $addition . $content; + } + $tag = $this->getTag($font_size); return '<' . $tag . ' style="' . $style . '">' . $content . ''; @@ -236,7 +250,7 @@ class PDFConvertor extends AbstractConvertor return 'h3'; } - return 'span'; + return 'p'; } protected function applyOCR($path) @@ -249,11 +263,27 @@ class PDFConvertor extends AbstractConvertor protected function convertHtmlToMD($contents) { $html = '' . $contents . ''; + $filepath = $this->storage->path($this->directoryPath); + + file_put_contents($filepath . '/document.html', $html); - $converter = new HtmlConverter(); - $converter->getConfig()->setOption('strip_tags', true); + $process = new Process([ + 'pandoc', + '-f', + 'html', + $filepath . '/document.html', + '-t', + 'markdown_strict', + ]); + + $process->run(); - return $converter->convert($html); + + if (!$process->isSuccessful()) { + throw new ProcessFailedException($process); + } + + return $process->getOutput(); } protected function prepareForConvertPDF() diff --git a/composer.json b/composer.json index 61a3b99..831f919 100644 --- a/composer.json +++ b/composer.json @@ -13,7 +13,6 @@ "fideloper/proxy": "^4.0", "laravel/framework": "^6.2", "laravel/tinker": "^2.0", - "league/html-to-markdown": "^5.0", "phpoffice/phpword": "^0.17.0", "predis/predis": "^1.1", "spatie/laravel-webhook-server": "^1.13",