Repo for the search-and-displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) for use with search-and-displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

314 lines
9.3 KiB

<?php
namespace App\Ingest;
use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process;
use Illuminate\Support\Facades\Log;
use thiagoalessio\TesseractOCR\TesseractOcrException;
/**
 * Converts a stored PDF into HTML.
 *
 * The PDF is first turned into XML with the `pdftohtml -xml` command line
 * tool; the XML is then walked page by page and rendered as HTML, running
 * OCR on every extracted image to recover text embedded in pictures.
 */
class PDFConvertor extends AbstractConvertor
{
    /**
     * Convert the PDF and store the result as "document.html" inside the
     * working directory.
     *
     * @throws \Exception When the conversion produced no content.
     * @throws ProcessFailedException When `pdftohtml` fails.
     */
    public function execute()
    {
        $contents = $this->getFileContents();

        if (! $contents) {
            throw new \Exception('Cannot get pdf file contents.');
        }

        $this->storage->put("$this->directoryPath/document.html", $contents);
    }

    /**
     * Run `pdftohtml -xml` on the source file, delete the source file and
     * return the HTML rendered from the generated XML.
     *
     * @return string
     *
     * @throws ProcessFailedException When `pdftohtml` exits unsuccessfully.
     * @throws \Exception When the generated XML cannot be parsed.
     */
    protected function getFileContents()
    {
        // The XML is read back from "<directoryPath>/html.xml" in getDataFromXML().
        $outputPath = $this->storage->path("$this->directoryPath/html");

        $process = new Process([
            'pdftohtml',
            '-xml',
            $this->storage->path($this->path),
            $outputPath,
        ]);
        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }

        // Remove original document.
        $this->storage->delete($this->path);

        return $this->getDataFromXML();
    }

    /**
     * Render the XML stored at "<directoryPath>/html.xml" as HTML.
     *
     * Elements of each page are grouped by their `top` attribute and emitted
     * in ascending order; every group becomes one `<p>`. Images that yield
     * text through OCR are replaced by a `<div>` holding that text (and the
     * image file is deleted); all other images are emitted as `<img>` with a
     * running "Fig. N" caption. One `<html>` wrapper is appended per page.
     *
     * When the document has neither text nor images the whole working
     * directory is deleted, otherwise only the XML file is removed.
     *
     * @return string
     *
     * @throws \Exception When the XML cannot be read or parsed.
     * @throws \Throwable Any rendering error, re-thrown after the working
     *                    directory has been deleted.
     */
    protected function getDataFromXML()
    {
        $xmlFilePath = "$this->directoryPath/html.xml";
        $contents = $this->storage->get($xmlFilePath);

        $xml = is_string($contents) ? simplexml_load_string($contents) : false;
        if ($xml === false) {
            throw new \Exception('Cannot parse the XML generated from the pdf file.');
        }

        $orderedList = [];
        $fonts = [];

        foreach ($xml->page as $page) {
            $pageNumber = (int) $page['number'];
            $orderedList[$pageNumber] = [];

            foreach ($page as $element) {
                if ($element->getName() === 'fontspec') {
                    $fonts[(int) $element['id']] = [
                        'family' => $this->stripFontSubsetPrefix((string) $element['family']),
                        'size' => (string) $element['size'],
                        'color' => (string) $element['color'],
                    ];
                }

                if (isset($element['top'])) {
                    // Elements sharing the same vertical offset form one row.
                    $orderedList[$pageNumber][(int) $element['top']][] = $element;
                }
            }

            ksort($orderedList[$pageNumber]);
        }

        $hasImages = false;
        $hasText = false;
        $imagesCount = 0;
        $htmlContents = '';

        try {
            // Image sources are made relative to the storage root.
            $basePath = $this->storage->path('');

            foreach ($orderedList as $rows) {
                $html = '';

                foreach ($rows as $items) {
                    $continuousP = '';
                    $firstOfText = true;

                    foreach ($items as $key => $p) {
                        $name = $p->getName();

                        if ($name === 'image') {
                            $imageFilePath = str_replace($basePath, '', (string) $p['src']);

                            try {
                                $textContents = $this->applyOCR($imageFilePath);
                            } catch (TesseractOcrException $e) {
                                // Could not get text content from image. This means the image doesn't have text.
                                $textContents = '';
                            }

                            if ($textContents) {
                                // Flush what was rendered so far, then emit the recognised text.
                                $htmlContents .= $html . "<div>$textContents</div>";
                                $html = '';
                                $this->storage->delete($imageFilePath);
                                $hasText = true;
                            } else {
                                $hasImages = true;
                                $imagesCount++;
                                $html .= $this->handleImage($p, "Fig. $imagesCount");
                            }

                            continue;
                        }

                        if ($name === 'text') {
                            // A lone bullet is merged into the text element that follows it.
                            if ((string) $p === '·') {
                                continue;
                            }

                            $addition = null;
                            if (isset($items[$key - 1]) && (string) $items[$key - 1] === '·') {
                                $addition = '· ';
                            }

                            $continuousP .= $this->handleText($p, $fonts, $addition, $firstOfText);
                            $firstOfText = false;
                            $hasText = true;
                        }
                    }

                    $html .= '<p>' . $continuousP . '</p>';
                }

                $htmlContents .= "<html><head></head><body>$html</body></html>";
            }
        } catch (\Throwable $exception) {
            $this->storage->deleteDirectory($this->directoryPath);
            Log::error($exception->getMessage(), ['trace' => $exception->getTraceAsString()]);

            throw $exception;
        }

        if (! $hasText && ! $hasImages) {
            // Remove directory because we do not have any use for it anymore.
            $this->storage->deleteDirectory($this->directoryPath);
        } else {
            // Remove the unnecessary 'xml' file.
            $this->storage->delete($xmlFilePath);
        }

        return $htmlContents;
    }

    /**
     * Build the `<img>` markup for an image element.
     *
     * The image is referenced under "./contracts-images/<working directory
     * name>/<image file name>".
     *
     * @param \SimpleXMLElement $p       Image element carrying `src`, `width` and `height`.
     * @param string            $caption Used for both `alt` and `title`.
     *
     * @return string
     */
    protected function handleImage($p, $caption)
    {
        $src = './contracts-images/'
            . pathinfo($this->directoryPath, PATHINFO_BASENAME)
            . '/'
            . pathinfo((string) $p['src'], PATHINFO_BASENAME);

        $caption = $this->escapeAttribute((string) $caption);

        return '<br>'
            . '<img width=' . $this->escapeAttribute((string) $p['width'])
            . ' height=' . $this->escapeAttribute((string) $p['height'])
            . ' src="' . $this->escapeAttribute($src) . '"'
            . ' alt="' . $caption . '"'
            . ' title="' . $caption . '">'
            . '<br>'
            . '<br>';
    }

    /**
     * Render a text element as a styled `h1`/`h2`/`h3`/`span`.
     *
     * Inline `<i>`, `<b>` and `<a>` children are preserved in document order
     * together with the surrounding text; text is HTML-escaped.
     *
     * @param \SimpleXMLElement $p           Text element carrying `font`, `height` and `left`.
     * @param array             $fonts       Font specs keyed by id, each with `family`, `size`, `color`.
     * @param string|null       $addition    Raw prefix prepended to the content (e.g. a bullet).
     * @param bool              $firstOfText Whether to add left padding derived from `left`.
     *
     * @return string
     */
    protected function handleText($p, $fonts, $addition = null, $firstOfText = false)
    {
        $font = $fonts[(int) $p['font']] ?? [];
        $fontSize = $font['size'] ?? '';
        $fontColor = $font['color'] ?? '';
        $fontFamily = $font['family'] ?? '';

        $style = 'position: relative;'
            . "color: $fontColor;"
            . "font-family: $fontFamily;"
            . 'height: ' . (string) $p['height'] . 'px;';

        if ($firstOfText) {
            // Negative padding is invalid CSS, so never go below zero.
            $style .= 'padding-left: ' . max(0, (int) $p['left'] - 90) . 'px;';
        }

        $style .= "font-size: {$fontSize}px;";

        $node = dom_import_simplexml($p);
        $content = $node instanceof \DOMNode
            ? $this->renderInlineContent($node)
            : htmlspecialchars((string) $p, ENT_NOQUOTES, 'UTF-8');

        if ($addition) {
            $content = $addition . $content;
        }

        $tag = $this->getTag($fontSize);

        return '<' . $tag . ' style="' . $this->escapeAttribute($style) . '">' . $content . '</' . $tag . '>';
    }

    /**
     * Pick the HTML tag for a given font size.
     *
     * @param string|int|float $size Font size in pixels.
     *
     * @return string 'h1', 'h2', 'h3' or 'span'.
     */
    protected function getTag($size)
    {
        $size = (float) $size;

        // @TODO Needed to bump values up by 2, the XML loader gives different results on different servers.
        if ($size > 26) {
            return 'h1';
        }

        if ($size > 20) {
            return 'h2';
        }

        if ($size > 18) {
            return 'h3';
        }

        return 'span';
    }

    /**
     * Run OCR on a stored image and return whatever the OCR helper yields.
     *
     * @param string $path Image path relative to the storage root.
     *
     * @return mixed Result of OCR::execute().
     */
    protected function applyOCR($path)
    {
        $ocr = new OCR($this->storage->path($path));

        return $ocr->execute();
    }

    /**
     * Convert an HTML fragment to strict Markdown using `pandoc`.
     *
     * The fragment is wrapped in a full document and written to a temporary
     * "document.html" in the working directory, which is removed afterwards
     * whether or not pandoc succeeded.
     *
     * @param string $contents HTML fragment.
     *
     * @return string Markdown produced by pandoc.
     *
     * @throws \Exception When the temporary file cannot be written.
     * @throws ProcessFailedException When `pandoc` exits unsuccessfully.
     */
    protected function convertHtmlToMD($contents)
    {
        $html = '<html><head><title></title></head><body>' . $contents . '</body></html>';
        $htmlFile = $this->storage->path($this->directoryPath) . '/document.html';

        if (file_put_contents($htmlFile, $html) === false) {
            throw new \Exception('Cannot write the temporary html file.');
        }

        try {
            $process = new Process([
                'pandoc',
                '-f',
                'html',
                $htmlFile,
                '-t',
                'markdown_strict',
            ]);
            $process->run();

            if (! $process->isSuccessful()) {
                throw new ProcessFailedException($process);
            }

            return $process->getOutput();
        } finally {
            if (is_file($htmlFile)) {
                unlink($htmlFile);
            }
        }
    }

    /**
     * Install the `pdftotext` Python package with pip3.
     *
     * HOME is passed to the pip process through its environment; a separate
     * `export` process could not affect it.
     *
     * @throws ProcessFailedException When `pip3` exits unsuccessfully.
     */
    protected function prepareForConvertPDF()
    {
        $home = env('USER_HOME_PATH');

        $process = new Process(
            ['pip3', 'install', 'pdftotext'],
            null,
            $home ? ['HOME' => $home] : null
        );
        $process->run();

        if (! $process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }
    }

    /**
     * Drop everything up to and including the first '+' of a font family
     * name; names without a '+' are returned unchanged.
     *
     * @param string $family
     *
     * @return string
     */
    private function stripFontSubsetPrefix($family)
    {
        $plusPosition = strpos($family, '+');

        if ($plusPosition === false) {
            return $family;
        }

        return (string) substr($family, $plusPosition + 1);
    }

    /**
     * Render the children of a node as HTML, keeping `<i>`, `<b>` and `<a>`
     * and flattening any other element to its content.
     *
     * @param \DOMNode $node
     *
     * @return string
     */
    private function renderInlineContent(\DOMNode $node)
    {
        $html = '';

        foreach ($node->childNodes as $child) {
            if ($child->nodeType === XML_TEXT_NODE || $child->nodeType === XML_CDATA_SECTION_NODE) {
                $html .= htmlspecialchars((string) $child->nodeValue, ENT_NOQUOTES, 'UTF-8');
                continue;
            }

            if ($child->nodeType !== XML_ELEMENT_NODE) {
                continue;
            }

            $inner = $this->renderInlineContent($child);

            switch ($child->nodeName) {
                case 'i':
                case 'b':
                    $html .= "<{$child->nodeName}>$inner</{$child->nodeName}>";
                    break;
                case 'a':
                    // Fall back to the link text when the element has no href.
                    $href = $child->getAttribute('href');
                    if ($href === '') {
                        $href = $child->textContent;
                    }
                    $html .= '<a href="' . $this->escapeAttribute($href) . '">' . $inner . '</a>';
                    break;
                default:
                    $html .= $inner;
            }
        }

        return $html;
    }

    /**
     * Escape a value for use inside an HTML attribute.
     *
     * @param string $value
     *
     * @return string
     */
    private function escapeAttribute($value)
    {
        return htmlspecialchars($value, ENT_QUOTES, 'UTF-8');
    }
}