You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
89 lines
2.1 KiB
89 lines
2.1 KiB
<?php
|
|
|
|
namespace App\Ingest;
|
|
|
|
use Illuminate\Support\Facades\Storage;
|
|
|
|
/**
 * Converts a stored document into a `document.json` file containing its data.
 *
 * Documents whose type is not 'docx' are first converted to DOCX through Office;
 * the DOCX is then read by DocxReader and the result is JSON-encoded and written
 * to "<directoryPath>/document.json" on the storage passed to the parent.
 */
class DataJsonConvertor extends AbstractConvertor
{
    /**
     * Format of the document currently referenced by $this->path (e.g. 'pdf', 'docx').
     * Set to 'docx' once convertToDocx() has succeeded.
     *
     * @var string
     */
    protected $type;

    /**
     * @param string $path Document path; handed to the parent together with the 'local' storage disk.
     * @param string $type Document format. Anything other than 'docx' is converted to DOCX first.
     */
    public function __construct($path, $type)
    {
        parent::__construct(Storage::disk('local'), $path);

        $this->type = $type;
    }

    /**
     * Convert given document to JSON file which contains the document's data.
     *
     * @throws \Exception When the DOCX conversion fails or the extracted data cannot be JSON-encoded.
     */
    public function execute()
    {
        if ($this->type !== 'docx') {
            $this->convertToDocx();
        }

        $json = $this->convertDocxToJson();

        // json_encode() returns false on failure; never write that to storage as if it were a document.
        $encoded = json_encode($json);

        if ($encoded === false) {
            throw new \Exception('Failed to encode data as JSON for file: ' . $this->path . '. ' . json_last_error_msg());
        }

        $this->storage->put("{$this->directoryPath}/document.json", $encoded);

        // Inherited helper. After a conversion $this->path points at the DOCX, so this
        // presumably removes the intermediate DOCX rather than the upload — confirm in AbstractConvertor.
        $this->deleteOriginalDocument();
    }

    /**
     * Read the document at the current path with DocxReader.
     *
     * @return mixed Whatever DocxReader::execute() returns; it is passed to json_encode() by execute().
     */
    protected function convertDocxToJson()
    {
        $reader = new DocxReader($this->storage, $this->path);

        return $reader->execute();
    }

    /**
     * Convert document to DOCX format in order to extract data.
     *
     * On success the original document is deleted, $this->path points at the DOCX
     * and $this->type becomes 'docx'. On failure the original document is left in place.
     *
     * @throws \Exception When Office reports failure or the expected DOCX file is missing.
     */
    protected function convertToDocx()
    {
        $office = new Office();

        $convertTo = 'docx';

        if ($this->fileIsPDF($this->path)) {
            // NOTE(review): "writer_pdf_Export" looks like an export-to-PDF filter name;
            // confirm this is really the filter Office::run() needs for PDF -> DOCX.
            $convertTo = 'docx:writer_pdf_Export';
        }

        $success = $office->run(
            $convertTo,
            $this->storage->path($this->path),
            $this->storage->path($this->directoryPath)
        );

        if (! $success) {
            throw new \Exception('Failed when converting from ' . $this->type . ' to DOCX for file: ' . $this->path);
        }

        // Only the file extension changes. Replacing every occurrence of the type
        // (as str_replace would) corrupts paths such as "pdfs/report.pdf".
        // Assumes the converted file lands beside the original, i.e. directoryPath
        // is the directory of $this->path — TODO confirm against AbstractConvertor.
        $docxPath = $this->docxPathFor($this->path);

        // Verify the output before touching the source so a silent conversion
        // failure does not destroy the only copy of the document.
        if (! $this->storage->exists($docxPath)) {
            throw new \Exception('Failed when converting from ' . $this->type . ' to DOCX for file: ' . $docxPath . '. The DOCX file doesnt exist.');
        }

        // If the source already had a .docx name, the "original" is the converted file itself.
        if ($docxPath !== $this->path) {
            $this->deleteOriginalDocument();
        }

        $this->setPath($docxPath);

        $this->type = 'docx';
    }

    /**
     * Tell whether the given path ends with a ".pdf" extension (case-insensitive).
     *
     * @param string $filePath
     *
     * @return bool
     */
    protected function fileIsPDF($filePath)
    {
        $suffix = '.pdf';

        return strcasecmp(substr((string) $filePath, - strlen($suffix)), $suffix) === 0;
    }

    /**
     * Build the path of the DOCX counterpart of a document by swapping its final extension.
     *
     * @param string $path
     *
     * @return string The same path with its extension replaced by "docx" ("docx" is appended when there is none).
     */
    private function docxPathFor($path)
    {
        $extension = pathinfo($path, PATHINFO_EXTENSION);

        if ($extension === '') {
            return $path . '.docx';
        }

        return substr($path, 0, - strlen($extension)) . 'docx';
    }
}
|