|
|
<?php
namespace App\Ingest;
use Illuminate\Support\Facades\Storage; use Symfony\Component\Process\Exception\ProcessFailedException; use Symfony\Component\Process\Process; use League\HTMLToMarkdown\HtmlConverter;
class Convertor
{
/**
 * Storage disk that every path handled by this class is resolved against.
 *
 * @var \Illuminate\Contracts\Filesystem\Filesystem
 */
private $storage;

/**
 * Path of the file currently being worked on; the conversion methods
 * reassign it whenever they produce a new file.
 *
 * @var mixed
 */
private $path;

/**
 * Type tag of the input file; execute() compares it against 'txt', 'pdf' and 'docx'.
 *
 * @var mixed
 */
protected $type;

/**
 * Remember the file to convert and bind the converter to the 'local' disk.
 *
 * @param mixed $path Path of the source file as supplied by the caller.
 * @param mixed $type Type tag of the source file.
 */
public function __construct($path, $type)
{
    $this->type = $type;
    $this->path = $path;
    $this->storage = Storage::disk('local');
}
/**
 * Run the conversion that matches the configured type and report the resulting path.
 *
 * - 'txt'  : nothing is converted.
 * - 'pdf'  : the PDF-to-Markdown conversion is run.
 * - 'docx' : the document is converted to text.
 * - other  : the file is converted to docx first and then to text.
 *
 * @return mixed The current path once the selected conversion has finished.
 * @throws \Exception
 */
public function execute()
{
    if ($this->type !== 'txt') {
        if ($this->type === 'pdf') {
            $this->convertPdfToMD();
        } else {
            if ($this->type !== 'docx') {
                $this->convertToDocx();
            }

            $this->convertDocumentToText();
        }
    }

    return $this->path;
}
/**
 * Convert the current file (doc, dot, rtf, odt, ...) to docx with headless LibreOffice.
 *
 * soffice writes the result into the 'contracts' directory of the storage
 * disk. Afterwards the source file is deleted and the tracked path gets the
 * `.docx` extension in place of the original one.
 *
 * NOTE(review): the output directory is always 'contracts' while the new path
 * keeps the directory of the old one — presumably inputs always live in
 * 'contracts'; confirm against the callers.
 *
 * @return void
 * @throws ProcessFailedException When soffice exits unsuccessfully.
 */
private function convertToDocx()
{
    // Running `export HOME=...` as its own process cannot influence soffice,
    // so HOME is handed to the soffice process directly. The current
    // environment is merged in explicitly so PATH and the like are kept no
    // matter how the installed Process version treats a custom environment.
    $home = env('USER_HOME_PATH');
    $environment = $home ? ['HOME' => $home] + getenv() : null;

    $process = new Process([
        'soffice',
        '--headless',
        '--convert-to',
        'docx',
        $this->storage->path($this->path),
        '--outdir',
        $this->storage->path('contracts'),
    ], null, $environment);

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);

    // Swap the trailing extension only: the same characters may also occur
    // earlier in the name (e.g. "my.document.doc").
    $this->path = preg_replace(
        '/' . preg_quote(".$this->type", '/') . '\z/',
        '.docx',
        $this->path
    );
}
/**
 * Convert the current document to plain text with headless LibreOffice.
 *
 * soffice writes the result into the 'contracts' directory of the storage
 * disk. Afterwards the source file is deleted and a trailing `.docx` or
 * `.bin` extension of the tracked path is replaced by `.txt`.
 *
 * @return void
 * @throws ProcessFailedException When soffice exits unsuccessfully.
 */
private function convertDocumentToText()
{
    // Running `export HOME=...` as its own process cannot influence soffice,
    // so HOME is handed to the soffice process directly. The current
    // environment is merged in explicitly so PATH and the like are kept no
    // matter how the installed Process version treats a custom environment.
    $home = env('USER_HOME_PATH');
    $environment = $home ? ['HOME' => $home] + getenv() : null;

    $process = new Process([
        'soffice',
        '--headless',
        '--convert-to',
        'txt',
        $this->storage->path($this->path),
        '--outdir',
        $this->storage->path('contracts'),
    ], null, $environment);

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);

    // Only the extension at the very end is swapped, so a ".docx" or ".bin"
    // appearing earlier in the name is left alone.
    $this->path = preg_replace('/\.(?:docx|bin)\z/', '.txt', $this->path);
}
/**
 * Convert the current PDF to a text file made of its extracted text plus the
 * OCR output of every image found in it.
 *
 * On success the PDF is deleted, the tracked path is switched from `.pdf` to
 * `.txt` and the combined contents are written there.
 *
 * @return void
 * @throws ProcessFailedException When one of the helper processes fails.
 * @throws \Exception When the PDF yields neither text nor images.
 */
protected function convertPdfToText()
{
    $this->prepareForConvertPDF();

    // getImagesFromPDF() creates this directory whether or not the PDF holds
    // images, so it is removed on every exit path, not just when images exist.
    // The name is derived exactly as getImagesFromPDF() derives it.
    $dir = str_replace('.pdf', '', $this->path);

    try {
        $images = $this->getImagesFromPDF();
        $contents = $this->getTextContentsFromPDF();

        if (!$contents && count($images) === 0) {
            throw new \Exception('Could not read from file.');
        }

        foreach ($images as $image) {
            // OCR is best effort: a failing image is logged and skipped.
            try {
                $ocr = new OCR($this->storage->path($image));
                $contents = $contents . "\n" . $ocr->execute();
            } catch (\Exception $exception) {
                \Illuminate\Support\Facades\Log::info('something wrong: ' . $exception->getMessage());
            }
        }
    } finally {
        $this->storage->deleteDirectory($dir);
    }

    $this->storage->delete($this->path);
    $this->path = str_replace('.pdf', '.txt', $this->path);
    $this->storage->put($this->path, $contents);
}
/**
 * Convert the current PDF to a Markdown file.
 *
 * Text-only PDFs: the per-page HTML produced by getContentsFromPdf() is
 * converted to Markdown. Image-only PDFs: every jpg/png in the scratch
 * directory is run through OCR and the results are concatenated. PDFs that
 * contain both text and images are rejected.
 *
 * The scratch directory is removed on every exit path; on success the result
 * is written to "<scratch directory>.md" and the tracked path points there.
 *
 * @return void
 * @throws \Exception When the PDF has no readable contents or mixes text and images.
 */
protected function convertPdfToMD()
{
    $result = $this->getContentsFromPdf();

    // getContentsFromPdf() leaves $this->path pointing at its scratch directory.
    $directory = $this->path;

    try {
        if (!$result['has_images'] && !$result['has_text']) {
            throw new \Exception('Cannot get pdf file contents.');
        }

        if ($result['has_text']) {
            if ($result['has_images']) {
                // Both text and images.
                throw new \Exception('Not supported for now.');
            }

            $converter = new HtmlConverter();
            $converter->getConfig()->setOption('strip_tags', true);

            $pages = [];
            foreach ($result['htmls'] as $html) {
                $page = $converter->convert($html);
                if ($page !== '') {
                    $pages[] = $page;
                }
            }

            // Pages are separated by a blank line so the last line of one
            // page does not run into the first line of the next.
            $markdown = implode("\n\n", $pages);
        } else {
            // Only images: the directory may also hold other (e.g. empty
            // html/xml) files, so just the image files are picked.
            $markdown = '';
            foreach ($this->storage->allFiles($directory) as $file) {
                if (in_array(pathinfo($file, PATHINFO_EXTENSION), ['jpg', 'png'], true)) {
                    $ocr = new OCR($this->storage->path($file));
                    $markdown = $markdown . $ocr->execute();
                }
            }
        }
    } finally {
        // Everything needed is in memory by now (or the conversion failed).
        $this->storage->deleteDirectory($directory);
    }

    $this->path = "$directory.md";
    $this->storage->put($this->path, $markdown);
}
/**
 * Convert the current file to a single HTML file with embedded images using
 * headless LibreOffice.
 *
 * soffice writes the result into the 'contracts' directory of the storage
 * disk. Afterwards the source file is deleted and the tracked path gets the
 * `.html` extension in place of the original one.
 *
 * @return void
 * @throws ProcessFailedException When soffice exits unsuccessfully.
 */
private function convertToHtml()
{
    // Running `export HOME=...` as its own process cannot influence soffice,
    // so HOME is handed to the soffice process directly. The current
    // environment is merged in explicitly so PATH and the like are kept no
    // matter how the installed Process version treats a custom environment.
    $home = env('USER_HOME_PATH');
    $environment = $home ? ['HOME' => $home] + getenv() : null;

    $process = new Process([
        'soffice',
        '--headless',
        '--convert-to',
        'html:HTML:EmbedImages',
        $this->storage->path($this->path),
        '--outdir',
        $this->storage->path('contracts'),
    ], null, $environment);

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);

    // Swap the trailing extension only; the same characters may also occur
    // earlier in the name.
    $this->path = preg_replace(
        '/' . preg_quote(".$this->type", '/') . '\z/',
        '.html',
        $this->path
    );
}
/**
 * Convert the current file to XML by running `pdftohtml -xml -i` on it.
 *
 * Afterwards the source file is deleted and the tracked path gets the `.xml`
 * extension in place of the original one. No output name is passed to
 * pdftohtml, so the code relies on it writing the XML next to the source.
 *
 * NOTE(review): an earlier comment mentioned a Python script that fixes the
 * paragraphs afterwards; no such script is invoked here.
 *
 * @return void
 * @throws ProcessFailedException When pdftohtml exits unsuccessfully.
 */
private function convertToXML()
{
    $process = new Process([
        'pdftohtml',
        '-xml',
        '-i',
        $this->storage->path($this->path),
    ]);

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);

    // Swap the trailing extension only; the same characters may also occur
    // earlier in the name.
    $this->path = preg_replace(
        '/' . preg_quote(".$this->type", '/') . '\z/',
        '.xml',
        $this->path
    );
}
/**
 * Install the `pdftotext` Python package through pip3.
 *
 * @return void
 * @throws ProcessFailedException When pip3 exits unsuccessfully.
 */
protected function prepareForConvertPDF()
{
    // Running `export HOME=...` as its own process cannot influence pip3, so
    // HOME is handed to the pip3 process directly. The current environment is
    // merged in explicitly so PATH and the like are kept no matter how the
    // installed Process version treats a custom environment.
    $home = env('USER_HOME_PATH');
    $environment = $home ? ['HOME' => $home] + getenv() : null;

    $process = new Process([
        'pip3',
        'install',
        'pdftotext',
    ], null, $environment);

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }
}
/**
 * Extract the images of the current PDF with `pdfimages`.
 *
 * A directory named after the PDF path (with ".pdf" removed) is created on
 * the storage disk and handed to pdfimages as output location, using "ocr"
 * as the image root name.
 *
 * @return array All files found in that directory after the extraction.
 * @throws ProcessFailedException When pdfimages exits unsuccessfully.
 */
protected function getImagesFromPDF()
{
    $targetDir = str_replace('.pdf', '', $this->path);
    $this->storage->makeDirectory($targetDir);

    $command = [
        'pdfimages',
        '-p',
        $this->storage->path($this->path),
        '-tiff',
        $this->storage->path("$targetDir/ocr"),
    ];

    $extraction = new Process($command);
    $extraction->run();

    if ($extraction->isSuccessful()) {
        return $this->storage->allFiles($targetDir);
    }

    throw new ProcessFailedException($extraction);
}
/**
 * Extract the text of the current PDF by running the `parse-pdf.py` script
 * from the application's storage "scripts" directory with python3.
 *
 * The script is given the PDF as input (-i) and a sibling path with ".pdf"
 * replaced by ".txt" as output (-o); that output file is then read back.
 *
 * @return string|false Whatever file_get_contents() yields for the output file.
 * @throws ProcessFailedException When the script exits unsuccessfully.
 */
protected function getTextContentsFromPDF()
{
    $textFile = $this->storage->path(str_replace('.pdf', '.txt', $this->path));
    $script = storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py');

    $parser = new Process([
        'python3',
        $script,
        '-i',
        $this->storage->path($this->path),
        '-o',
        $textFile,
    ]);
    $parser->run();

    if ($parser->isSuccessful()) {
        return file_get_contents($textFile);
    }

    throw new ProcessFailedException($parser);
}
/**
 * Convert the current PDF to Markdown by way of `pdftohtml -noframes`.
 *
 * pdftohtml writes into a directory named after the PDF path (with ".pdf"
 * removed), using "html" as output name. The PDF is then deleted, the
 * generated `html.html` is converted to Markdown and stored as
 * "<directory>.md", which becomes the tracked path. The output directory
 * itself is not removed here.
 *
 * @return void
 * @throws ProcessFailedException When pdftohtml exits unsuccessfully.
 * @throws \Exception When no `html.html` file is found in the output directory.
 */
protected function getHtmlContentsFromPdfWithImages()
{
    $dirName = str_replace('.pdf', '', $this->path);
    $this->storage->makeDirectory($dirName);

    $process = new Process([
        'pdftohtml',
        '-noframes',
        $this->storage->path($this->path),
        $this->storage->path("$dirName/html"),
    ]);

    $process->run();

    if (!$process->isSuccessful()) {
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);
    $this->path = $dirName;

    // Locate the single HTML document among the generated files.
    $htmlFile = null;
    foreach ($this->storage->allFiles($this->path) as $file) {
        if (pathinfo($file, PATHINFO_BASENAME) === 'html.html') {
            $htmlFile = $file;
            break;
        }
    }

    // Fail with a clear message instead of reading an undefined array entry.
    if ($htmlFile === null) {
        throw new \Exception('Cannot find the html output of the pdf file.');
    }

    $converter = new HtmlConverter();
    $converter->getConfig()->setOption('strip_tags', true);

    $contents = $converter->convert($this->storage->get($htmlFile));

    $this->path = "$this->path.md";
    $this->storage->put($this->path, $contents);
}
/**
 * Extract the layout of the current PDF with `pdftohtml -xml` and rebuild one
 * absolutely positioned HTML document per page from it.
 *
 * Side effects: a directory named after the PDF path (with ".pdf" removed) is
 * created and used as pdftohtml output location, the PDF is deleted and
 * $this->path is pointed at that directory. The directory is removed again
 * when pdftohtml fails or its XML cannot be used.
 *
 * Building the pages is best effort: an exception raised while doing so is
 * logged and whatever was collected up to that point is returned.
 *
 * NOTE(review): for a <text> element containing <i> or <b>, only the text of
 * that child is used; any text next to it inside the same element is dropped.
 *
 * @return array{has_images: bool, has_text: bool, htmls: string[]}
 * @throws ProcessFailedException When pdftohtml exits unsuccessfully.
 * @throws \Exception When the generated XML is empty or cannot be parsed.
 */
protected function getContentsFromPdf()
{
    $dirName = str_replace('.pdf', '', $this->path);
    $this->storage->makeDirectory($dirName);

    $process = new Process([
        'pdftohtml',
        '-xml',
        $this->storage->path($this->path),
        $this->storage->path("$dirName/html"),
    ]);

    $process->run();

    if (!$process->isSuccessful()) {
        $this->storage->deleteDirectory($dirName);
        throw new ProcessFailedException($process);
    }

    $this->storage->delete($this->path);
    $this->path = $dirName;

    $contents = (string) $this->storage->get("$this->path/html.xml");

    // Keep parser problems out of PHP's warning channel and restore the
    // previous libxml setting afterwards.
    $previousSetting = libxml_use_internal_errors(true);
    $xml = $contents === '' ? false : simplexml_load_string($contents);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    if ($xml === false) {
        $this->storage->deleteDirectory($this->path);
        throw new \Exception('Cannot parse pdf file contents.');
    }

    // Font table: every <fontspec> of every page, keyed by its numeric id.
    $fonts = [];
    foreach ($xml->page as $page) {
        foreach ($page->fontspec as $spec) {
            $fonts[(int) $spec['id']] = [
                'family' => (string) $spec['family'],
                'size' => (string) $spec['size'],
                'color' => (string) $spec['color'],
            ];
        }
    }

    // Used when a <text> refers to a font id that has no <fontspec>.
    $unknownFont = ['family' => '', 'size' => '0', 'color' => ''];

    // Values read from the XML are decoded text; escape them before they are
    // placed into HTML so characters such as < & " cannot break the markup.
    $escape = static function ($value) {
        return htmlspecialchars((string) $value, ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
    };

    $htmls = [];
    $hasImages = false;
    $hasText = false;

    try {
        foreach ($xml->page as $page) {
            $html = '';
            $previousP = null;

            foreach ($page as $p) {
                $name = $p->getName();

                if ($name === 'image') {
                    $html = $html
                        . '<img style="position: absolute; top: ' . $escape($p['top'])
                        . 'px; left: ' . $escape($p['left'])
                        . 'px;" width="' . $escape($p['width'])
                        . '" height="' . $escape($p['height'])
                        . '" src="' . $escape($p['src']) . '">';

                    $hasImages = true;
                } elseif ($name === 'text') {
                    $id = (int) $p['font'];
                    $font = isset($fonts[$id]) ? $fonts[$id] : $unknownFont;

                    $style = 'position: absolute;'
                        . "color: {$font['color']};"
                        . "font-family: {$font['family']};"
                        . 'font-weight: 900;'
                        . 'width: ' . $p['width'] . 'px;'
                        . 'height: ' . $p['height'] . 'px;'
                        . 'top: ' . $p['top'] . 'px;'
                        . 'left: ' . $p['left'] . 'px;';

                    if ($p->i) {
                        $content = '<i>' . $escape($p->i) . '</i>';
                    } elseif ($p->b) {
                        $content = '<b>' . $escape($p->b) . '</b>';
                    } else {
                        $content = $escape($p);
                    }

                    // @TODO Must chain paragraphs if top are almost same.
                    $tag = $this->getTag($p, $previousP, $font['size']);

                    $html = $html
                        . '<' . $tag . ' style="' . $escape($style) . '">'
                        . $content
                        . '</' . $tag . '>';

                    $hasText = true;
                }

                $previousP = $p;
            }

            $htmls[] = '<html><head><title></title></head><body>' . $html . '</body></html>';
        }
    } catch (\Exception $exception) {
        \Illuminate\Support\Facades\Log::info(
            $exception->getMessage() . "\n" . $exception->getTraceAsString()
        );
    }

    return [
        'has_images' => $hasImages,
        'has_text' => $hasText,
        'htmls' => $htmls,
    ];
}
/**
 * Pick the HTML tag for a piece of text from its font size and its vertical
 * distance to the element before it.
 *
 * Sizes above 24, 18 and 16 map to h1, h2 and h3. Otherwise the text becomes
 * a 'span' when a previous element exists and the two 'top' values are at
 * most 5 apart in either direction, and a 'p' in every other case.
 *
 * @param mixed $p         Current element; its 'top' entry is read.
 * @param mixed $previousP Element handled before $p, or null for the first one.
 * @param mixed $size      Font size of $p; treated as a number.
 * @return string One of 'h1', 'h2', 'h3', 'span' or 'p'.
 */
protected function getTag($p, $previousP, $size)
{
    $fontSize = (float) $size;

    if ($fontSize > 24) {
        return 'h1';
    }

    if ($fontSize > 18) {
        return 'h2';
    }

    if ($fontSize > 16) {
        return 'h3';
    }

    // "Almost the same top" has to hold in both directions: a plain
    // difference would also treat text that jumps far upwards (large negative
    // difference) as a continuation of the previous element.
    if ($previousP && abs((float) $p['top'] - (float) $previousP['top']) <= 5) {
        return 'span';
    }

    return 'p';
}
}
|