|
|
<?php
namespace App\Ingest;
use Illuminate\Support\Facades\Storage; use Symfony\Component\Process\Exception\ProcessFailedException; use Symfony\Component\Process\Process;
/**
 * Converts an uploaded document stored on the "local" disk into plain text.
 *
 * Conversions shell out to LibreOffice (`soffice`) or, for PDFs, to a Python
 * helper script. After every successful step the source file is removed from
 * the disk and the tracked path is updated to point at the converted file.
 */
class Convertor
{
    /**
     * Disk that holds the source document and every converted file.
     *
     * @var \Illuminate\Contracts\Filesystem\Filesystem
     */
    private $storage;

    /**
     * Disk-relative path of the current file; rewritten after each conversion.
     *
     * @var string
     */
    private $path;

    /**
     * Declared format of the incoming document ('txt', 'pdf', 'docx', ...).
     *
     * @var string
     */
    protected $type;

    /**
     * @param string $path Path of the document, relative to the "local" disk.
     * @param string $type Declared document format, compared against 'txt', 'pdf' and 'docx'.
     */
    public function __construct($path, $type)
    {
        $this->storage = Storage::disk('local');
        $this->path = $path;
        $this->type = $type;
    }

    /**
     * Convert the document to plain text and return the path of the result.
     *
     * 'txt' is returned untouched, 'pdf' goes through the Python script, and
     * every other type is converted by LibreOffice (via docx unless the
     * document is already docx).
     *
     * @return string Disk-relative path of the text file.
     *
     * @throws ProcessFailedException When an external command exits unsuccessfully.
     */
    public function execute()
    {
        if ($this->type === 'txt') {
            return $this->path;
        }

        if ($this->type === 'pdf') {
            $this->convertPdfToText();

            return $this->path;
        }

        if ($this->type !== 'docx') {
            $this->convertToDocx();
        }

        $this->convertDocumentToText();

        return $this->path;
    }

    /**
     * Convert the current document to docx with LibreOffice.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertToDocx()
    {
        $this->convertWithOffice('docx', 'docx');
    }

    /**
     * Convert the current document to plain text with LibreOffice.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertDocumentToText()
    {
        $this->convertWithOffice('txt', 'txt');
    }

    /**
     * Convert the current PDF to plain text with the parse-pdf.py script.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertPdfToText()
    {
        // NOTE(review): installing a package on every conversion is slow and
        // needs network access; kept because existing deployments may rely on
        // it, but this belongs in provisioning rather than in request code.
        $this->runOrFail(['pip3', 'install', 'pdftotext']);

        // The script's output path and the tracked path come from the same
        // computation, so they cannot drift apart.
        $target = self::swapExtension($this->path, 'txt');

        $this->runOrFail([
            'python3',
            storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'),
            '-i',
            $this->storage->path($this->path),
            '-o',
            $this->storage->path($target),
        ]);

        $this->replaceCurrentFile($target);
    }

    /**
     * Convert the current document to HTML with embedded images.
     *
     * Not called by execute() at the moment.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertToHtml()
    {
        $this->convertWithOffice('html:HTML:EmbedImages', 'html');
    }

    /**
     * Convert the current file to XML with `pdftohtml -xml -i`.
     *
     * Not called by execute() at the moment.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertToXML()
    {
        $this->runOrFail([
            'pdftohtml',
            '-xml',
            '-i',
            $this->storage->path($this->path),
        ]);

        $this->replaceCurrentFile(self::swapExtension($this->path, 'xml'));
    }

    /**
     * Run a headless LibreOffice conversion on the current file.
     *
     * @param string $filter    Value handed to `--convert-to`.
     * @param string $extension Extension of the file the conversion produces.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertWithOffice($filter, $extension)
    {
        $source = $this->storage->path($this->path);

        $this->runOrFail([
            'soffice',
            '--headless',
            '--convert-to',
            $filter,
            $source,
            // Write next to the source so the tracked path, which keeps the
            // source directory, always points at the produced file.
            '--outdir',
            dirname($source),
        ]);

        $this->replaceCurrentFile(self::swapExtension($this->path, $extension));
    }

    /**
     * Run an external command and fail loudly when it does not succeed.
     *
     * @param array $command Executable followed by its arguments.
     *
     * @return void
     *
     * @throws ProcessFailedException When the command exits unsuccessfully.
     */
    private function runOrFail(array $command)
    {
        $process = new Process($command, null, $this->processEnvironment());
        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }
    }

    /**
     * Extra environment variables for spawned commands.
     *
     * HOME has to be handed to the child process itself: `export` is a shell
     * builtin, so spawning it as a separate process cannot influence the
     * commands that are started afterwards.
     *
     * @return array|null Null when USER_HOME_PATH is not configured, so the
     *                    child's environment is left exactly as it would be
     *                    without this class.
     */
    private function processEnvironment()
    {
        $home = env('USER_HOME_PATH');

        if (!is_string($home) || $home === '') {
            return null;
        }

        return ['HOME' => $home];
    }

    /**
     * Make the converted file the current one and drop the file it replaces.
     *
     * @param string $convertedPath Disk-relative path of the converted file.
     *
     * @return void
     */
    private function replaceCurrentFile($convertedPath)
    {
        // When both paths are equal the conversion overwrote the source in
        // place; deleting it would throw away the fresh output.
        if ($convertedPath !== $this->path) {
            $this->storage->delete($this->path);
        }

        $this->path = $convertedPath;
    }

    /**
     * Replace the final extension of a path, leaving every other part intact.
     *
     * Only the last ".ext" of the file name is touched, so a directory or
     * file name that merely contains the old extension (for example
     * "contracts/doc_file.doc") is not corrupted. A path without an
     * extension simply gets the new one appended.
     *
     * @param string $path      Path whose extension should change.
     * @param string $extension New extension, without the leading dot.
     *
     * @return string
     */
    private static function swapExtension($path, $extension)
    {
        // Strip a trailing ".something" that contains no dot or directory separator.
        $withoutExtension = preg_replace('~\.[^./\\\\]*$~', '', $path);

        return $withoutExtension . '.' . $extension;
    }
}
|