Browse Source

Fix issue with handling PDF files. Add support for plain text files.

hidden_tags_with_bookmarks
Orzu Ionut 3 years ago
parent
commit
17aa1ab326
  1. 13
      app/Ingest/Convertor.php
  2. 2
      app/Ingest/DocumentHandler.php
  3. 12
      app/Jobs/IngestDocuments.php
  4. 18
      app/Jobs/SendToCore.php

13
app/Ingest/Convertor.php

@ -24,6 +24,10 @@ class Convertor
public function execute() public function execute()
{ {
if ($this->type === 'txt') {
return $this->path;
}
if ($this->type === 'pdf') { if ($this->type === 'pdf') {
$this->convertPdfToText(); $this->convertPdfToText();
@ -107,10 +111,12 @@ class Convertor
private function convertPdfToText() private function convertPdfToText()
{ {
(new Process(['export HOME=' . env('USER_HOME_PATH')]))->run();
$process = new Process([ $process = new Process([
'pip',
'pip3',
'install', 'install',
"pdftotext"
'pdftotext',
]); ]);
$process->run(); $process->run();
@ -119,9 +125,6 @@ class Convertor
throw new ProcessFailedException($process); throw new ProcessFailedException($process);
} }
/**
* Convert pdf to text
*/
$process = new Process([ $process = new Process([
'python3', 'python3',
storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'), storage_path('scripts' . DIRECTORY_SEPARATOR . 'parse-pdf.py'),

2
app/Ingest/DocumentHandler.php

@ -18,6 +18,7 @@ class DocumentHandler
const PDF_WPS_MIME_TYPE = 'application/wps-office.pdf'; const PDF_WPS_MIME_TYPE = 'application/wps-office.pdf';
const DOCXOLD_MIME_TYPE = 'application/octet-stream'; const DOCXOLD_MIME_TYPE = 'application/octet-stream';
const DOCX_WPS_TYPE = 'application/wps-office.docx'; const DOCX_WPS_TYPE = 'application/wps-office.docx';
const PLAIN_TEXT_TYPE = 'text/plain';
protected $supportedFiles = [ protected $supportedFiles = [
self::DOCX_MIME_TYPE => 'docx', self::DOCX_MIME_TYPE => 'docx',
@ -28,6 +29,7 @@ class DocumentHandler
self::ODT_MIME_TYPE => 'odt', self::ODT_MIME_TYPE => 'odt',
self::PDF_MIME_TYPE => 'pdf', self::PDF_MIME_TYPE => 'pdf',
self::PDF_WPS_MIME_TYPE => 'pdf', self::PDF_WPS_MIME_TYPE => 'pdf',
self::PLAIN_TEXT_TYPE => 'txt',
]; ];
public function __construct($id, $document) public function __construct($id, $document)

12
app/Jobs/IngestDocuments.php

@ -48,11 +48,6 @@ class IngestDocuments implements ShouldQueue
*/ */
private $parseHtmlArray; private $parseHtmlArray;
/**
* @var \App\Parser\ParseTextArray
*/
private $parserText;
/** /**
* Create a new job instance. * Create a new job instance.
* *
@ -65,7 +60,6 @@ class IngestDocuments implements ShouldQueue
$this->storage = Storage::disk('local'); $this->storage = Storage::disk('local');
$this->parserDocx = new ParseDocx(); $this->parserDocx = new ParseDocx();
$this->parserText = new ParseTextArray();
$this->parserXml = new ParseXml(); $this->parserXml = new ParseXml();
$this->parserHtml = new ParseHtml(); $this->parserHtml = new ParseHtml();
$this->parseHtmlArray = new ParseHtmlArray(); $this->parseHtmlArray = new ParseHtmlArray();
@ -87,7 +81,7 @@ class IngestDocuments implements ShouldQueue
return; return;
} }
// $content = $this->convertToUTF8($content);
$content = $this->convertToUTF8($content);
try { try {
$filePath = $this->storeContent($content); $filePath = $this->storeContent($content);
@ -129,7 +123,9 @@ class IngestDocuments implements ShouldQueue
return $textParser->fromFile($this->storage->path($this->path)); return $textParser->fromFile($this->storage->path($this->path));
} }
return $this->parserText->fromFile($this->storage->path($this->path));
$textParser = new ParseTextArray();
return $textParser->fromFile($this->storage->path($this->path));
} }
protected function convertToUTF8($content) protected function convertToUTF8($content)

18
app/Jobs/SendToCore.php

@ -47,6 +47,7 @@ class SendToCore implements ShouldQueue
* Execute the job. * Execute the job.
* *
* @return void * @return void
* @throws \Illuminate\Contracts\Filesystem\FileNotFoundException
*/ */
public function handle() public function handle()
{ {
@ -83,16 +84,16 @@ class SendToCore implements ShouldQueue
* Send the data to the core through webhooks * Send the data to the core through webhooks
* *
* @param $content * @param $content
* @param string $status
* @return bool
*/ */
private function sendTheData($content)
protected function sendTheData($content)
{ {
try { try {
WebhookCall::create() WebhookCall::create()
->url($this->url) ->url($this->url)
->payload(['data' => [ ->payload(['data' => [
'id' => $this->id, 'id' => $this->id,
'content' => $content,
'content' => $this->encodeContent($content),
'status' => $content ? 'success' : 'fail', 'status' => $content ? 'success' : 'fail',
]]) ]])
->useSecret($this->secret) ->useSecret($this->secret)
@ -105,4 +106,15 @@ class SendToCore implements ShouldQueue
return false; return false;
} }
} }
protected function encodeContent($content)
{
$encoding = mb_detect_encoding($content, 'UTF-8, ISO-8859-1, WINDOWS-1252, WINDOWS-1251', true);
if ($encoding != 'UTF-8') {
$content = iconv($encoding, 'UTF-8//IGNORE', $content);
}
return $content;
}
} }
Loading…
Cancel
Save