diff --git a/app/Ingest/Convertor.php b/app/Ingest/Convertor.php index 80391aa..1883c75 100644 --- a/app/Ingest/Convertor.php +++ b/app/Ingest/Convertor.php @@ -24,6 +24,10 @@ class Convertor public function execute() { + if ($this->type === 'txt') { + return $this->path; + } + if ($this->type === 'pdf') { $this->convertPdfToText(); @@ -107,10 +111,12 @@ class Convertor private function convertPdfToText() { + (new Process(['export HOME=' . env('USER_HOME_PATH')]))->run(); + $process = new Process([ - 'pip', + 'pip3', 'install', - "pdftotext" + 'pdftotext', ]); $process->run(); @@ -119,9 +125,6 @@ class Convertor throw new ProcessFailedException($process); } - /** - * Convert pdf to text - */ $process = new Process([ 'python3', storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'), diff --git a/app/Ingest/DocumentHandler.php b/app/Ingest/DocumentHandler.php index 59d4677..cf9a8db 100644 --- a/app/Ingest/DocumentHandler.php +++ b/app/Ingest/DocumentHandler.php @@ -18,6 +18,7 @@ class DocumentHandler const PDF_WPS_MIME_TYPE = 'application/wps-office.pdf'; const DOCXOLD_MIME_TYPE = 'application/octet-stream'; const DOCX_WPS_TYPE = 'application/wps-office.docx'; + const PLAIN_TEXT_TYPE = 'text/plain'; protected $supportedFiles = [ self::DOCX_MIME_TYPE => 'docx', @@ -28,6 +29,7 @@ class DocumentHandler self::ODT_MIME_TYPE => 'odt', self::PDF_MIME_TYPE => 'pdf', self::PDF_WPS_MIME_TYPE => 'pdf', + self::PLAIN_TEXT_TYPE => 'txt', ]; public function __construct($id, $document) diff --git a/app/Jobs/IngestDocuments.php b/app/Jobs/IngestDocuments.php index 24b81d3..a69ddd2 100644 --- a/app/Jobs/IngestDocuments.php +++ b/app/Jobs/IngestDocuments.php @@ -48,11 +48,6 @@ class IngestDocuments implements ShouldQueue */ private $parseHtmlArray; - /** - * @var \App\Parser\ParseTextArray - */ - private $parserText; - /** * Create a new job instance. * @@ -65,7 +60,6 @@ class IngestDocuments implements ShouldQueue $this->storage = Storage::disk('local'); $this->parserDocx = new ParseDocx(); - $this->parserText = new ParseTextArray(); $this->parserXml = new ParseXml(); $this->parserHtml = new ParseHtml(); $this->parseHtmlArray = new ParseHtmlArray(); @@ -87,7 +81,7 @@ class IngestDocuments implements ShouldQueue return; } -// $content = $this->convertToUTF8($content); + $content = $this->convertToUTF8($content); try { $filePath = $this->storeContent($content); @@ -129,7 +123,9 @@ class IngestDocuments implements ShouldQueue return $textParser->fromFile($this->storage->path($this->path)); } - return $this->parserText->fromFile($this->storage->path($this->path)); + $textParser = new ParseTextArray(); + + return $textParser->fromFile($this->storage->path($this->path)); } protected function convertToUTF8($content) diff --git a/app/Jobs/SendToCore.php b/app/Jobs/SendToCore.php index abc3470..903c5b5 100644 --- a/app/Jobs/SendToCore.php +++ b/app/Jobs/SendToCore.php @@ -47,6 +47,7 @@ class SendToCore implements ShouldQueue * Execute the job. * * @return void + * @throws \Illuminate\Contracts\Filesystem\FileNotFoundException */ public function handle() { @@ -83,16 +84,16 @@ class SendToCore implements ShouldQueue * Send the data to the core trough webhooks * * @param $content - * @param string $status + * @return bool */ - private function sendTheData($content) + protected function sendTheData($content) { try { WebhookCall::create() ->url($this->url) ->payload(['data' => [ 'id' => $this->id, - 'content' => $content, + 'content' => $this->encodeContent($content), 'status' => $content ? 'success' : 'fail', ]]) ->useSecret($this->secret) @@ -105,4 +106,15 @@ class SendToCore implements ShouldQueue return false; } } + + protected function encodeContent($content) + { + $encoding = mb_detect_encoding($content, 'UTF-8, ISO-8859-1, WINDOWS-1252, WINDOWS-1251', true); + + if ($encoding != 'UTF-8') { + $content = iconv($encoding, 'UTF-8//IGNORE', $content); + } + + return $content; + } }