storage = Storage::disk('local');
        $this->path = $path;
        $this->type = $type;
    }

    /**
     * Convert the file at $this->path to plain text.
     *
     * - 'txt' files are returned untouched.
     * - 'pdf' files go through the Python pdftotext script.
     * - 'docx' files are converted to text by LibreOffice.
     * - every other type is first converted to docx, then to text.
     *
     * Each conversion step deletes its input file from the disk and points
     * $this->path at the converted file.
     *
     * @return mixed The path (relative to the storage disk) of the resulting file.
     *
     * @throws ProcessFailedException When an external conversion command exits with a non-zero status.
     */
    public function execute()
    {
        if ($this->type === 'txt') {
            return $this->path;
        }

        if ($this->type === 'pdf') {
            $this->convertPdfToText();

            return $this->path;
        }

        if ($this->type !== 'docx') {
            $this->convertToDocx();
        }

        $this->convertDocumentToText();

        return $this->path;
    }

    /**
     * Convert a non-docx document to docx with LibreOffice.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertToDocx()
    {
        $this->convertWithLibreOffice('docx', 'docx');
    }

    /**
     * Convert the current document (docx at this point of execute()) to plain text.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertDocumentToText()
    {
        $this->convertWithLibreOffice('txt', 'txt');
    }

    /**
     * Convert a PDF to plain text with the parse-pdf.py script.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertPdfToText()
    {
        // NOTE(review): this installs the Python dependency on every PDF
        // conversion. Kept as-is because deployments may rely on it, but it
        // looks like something that belongs in provisioning - confirm.
        $this->runCommand(['pip3', 'install', 'pdftotext']);

        // Compute the target once so the file the script writes and the path
        // stored afterwards cannot drift apart.
        $target = $this->swapExtension($this->path, 'txt');

        $this->runCommand([
            'python3',
            storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'),
            '-i',
            $this->storage->path($this->path),
            '-o',
            $this->storage->path($target),
        ]);

        $this->replaceSourceWith($target);
    }

    /**
     * Convert the current document to HTML with embedded images.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertToHtml()
    {
        $this->convertWithLibreOffice('html:HTML:EmbedImages', 'html');
    }

    /**
     * Convert the current file to XML with pdftohtml.
     *
     * Only the pdftohtml step is performed here; no paragraph post-processing
     * script is run by this method.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertToXML()
    {
        // No output name is passed, so pdftohtml picks it itself; the stored
        // path assumes it is the input name with an .xml extension - TODO confirm.
        $this->runCommand([
            'pdftohtml',
            '-xml',
            '-i',
            $this->storage->path($this->path),
        ]);

        $this->replaceSourceWith($this->swapExtension($this->path, 'xml'));
    }

    /**
     * Run a headless LibreOffice conversion of the current file into the
     * "contracts" directory of the storage disk, then switch to the result.
     *
     * @param string $format    Value passed to soffice --convert-to.
     * @param string $extension Extension (without dot) of the file soffice produces.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function convertWithLibreOffice($format, $extension)
    {
        $this->runCommand([
            'soffice',
            '--headless',
            '--convert-to',
            $format,
            $this->storage->path($this->path),
            '--outdir',
            $this->storage->path('contracts'),
        ]);

        $this->replaceSourceWith($this->swapExtension($this->path, $extension));
    }

    /**
     * Run an external command and throw when it does not exit successfully.
     *
     * HOME is handed to the child through the process environment. Running
     * "export HOME=..." as a separate process cannot work: "export" is a shell
     * builtin, and a child process cannot alter the environment of the
     * processes started after it.
     *
     * @param array $command Program name followed by its arguments.
     *
     * @return void
     *
     * @throws ProcessFailedException
     */
    private function runCommand(array $command)
    {
        // NOTE(review): Laravel's env() is documented to return null outside
        // config files once the configuration is cached - consider reading
        // this value through config() instead.
        $home = env('USER_HOME_PATH');

        // Only override HOME when a usable value is configured; null leaves
        // the inherited environment untouched.
        // Assumes symfony/process 4.0+, where a custom env is merged with the
        // inherited one rather than replacing it - TODO confirm version.
        $environment = (is_string($home) && $home !== '') ? ['HOME' => $home] : null;

        $process = new Process($command, null, $environment);
        $process->run();

        if (!$process->isSuccessful()) {
            throw new ProcessFailedException($process);
        }
    }

    /**
     * Delete the current source file and point $this->path at the converted file.
     *
     * @param string $target Path (relative to the storage disk) of the converted file.
     *
     * @return void
     */
    private function replaceSourceWith($target)
    {
        // When source and target are the same path, deleting the source would
        // delete the conversion result.
        if ($target !== $this->path) {
            $this->storage->delete($this->path);
        }

        $this->path = $target;
    }

    /**
     * Return $path with its trailing file extension replaced.
     *
     * Only the extension of the last path segment is touched, so directory or
     * file names that happen to contain the old type (for example
     * "pdf-report.pdf") are left intact. A path without an extension simply
     * gets the new one appended.
     *
     * @param string $path
     * @param string $extension New extension, without the leading dot.
     *
     * @return string
     */
    private function swapExtension($path, $extension)
    {
        $current = pathinfo($path, PATHINFO_EXTENSION);
        $stem = $current === '' ? $path : substr($path, 0, -(strlen($current) + 1));

        return $stem . '.' . $extension;
    }
}