Browse Source

Text conversion improvement

hidden_tags_with_bookmarks
Orzu Ionut 4 years ago
parent
commit
4b135c4fc4
  1. 1
      README.md
  2. 4
      app/Http/Controllers/IngestController.php
  3. 12
      app/Ingest/MDConvertor.php
  4. 27
      app/Jobs/IngestDocuments.php
  5. 7
      app/Jobs/SendToCore.php
  6. 87
      app/Parser/ParseTextArray.php

1
README.md

@ -24,6 +24,7 @@ apt-get update
apt-get install software-properties-common apt-get install software-properties-common
add-apt-repository ppa:deadsnakes/ppa add-apt-repository ppa:deadsnakes/ppa
apt-get install supervisor python3.8 python3.8-dev apt-get install supervisor python3.8 python3.8-dev
apt-get install redis-server
supervisorctl restart all supervisorctl restart all
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py

4
app/Http/Controllers/IngestController.php

@ -18,9 +18,9 @@ class IngestController extends Controller
$handler->handle(); $handler->handle();
return response()->json([
return response()->json(array(
'status' => 'processing', 'status' => 'processing',
]);
));
} catch (\Exception $exception) { } catch (\Exception $exception) {
return response()->json([ return response()->json([
'status' => 'error', 'status' => 'error',

12
app/Ingest/MDConvertor.php

@ -8,7 +8,7 @@ class MDConvertor
public function __construct($content) public function __construct($content)
{ {
$this->content = json_decode($content, true);
$this->content = $content;
} }
public function execute() public function execute()
@ -26,7 +26,7 @@ class MDConvertor
' ' . ' ' .
(isset($paragraph['numbering']) ? $paragraph['numbering'] : '') . (isset($paragraph['numbering']) ? $paragraph['numbering'] : '') .
' ' . ' ' .
$paragraph['content'] .
$this->parseContent($paragraph['content']) .
"\n"; "\n";
if ( if (
@ -42,4 +42,12 @@ class MDConvertor
return $content; return $content;
} }
/**
 * Strip invisible zero-width characters from a paragraph and decode it from UTF-8.
 *
 * Removes U+200B..U+200D and U+FEFF, then passes the result through utf8_decode().
 *
 * @param string $content Paragraph text (presumably UTF-8 — confirm against the caller).
 *
 * @return string The cleaned text as returned by utf8_decode().
 */
protected function parseContent($content)
{
    // Byte-level removal of ZERO WIDTH SPACE (U+200B). A literal replacement
    // needs no regex and still works when the string is not valid UTF-8.
    $content = str_replace("\xE2\x80\x8B", '', $content);

    // With the /u modifier preg_replace() returns null if the subject is not
    // valid UTF-8. Keep the text we already have in that case instead of
    // silently turning the whole paragraph into an empty string.
    $stripped = preg_replace("/[\x{200B}-\x{200D}\x{FEFF}]/u", '', $content);
    if ($stripped !== null) {
        $content = $stripped;
    }

    // NOTE(review): utf8_decode() converts to ISO-8859-1, replacing anything
    // outside that range with "?", and is deprecated as of PHP 8.2 — confirm
    // that Latin-1 output is really what the Markdown consumer expects.
    return utf8_decode($content);
}
} }

27
app/Jobs/IngestDocuments.php

@ -62,6 +62,13 @@ class IngestDocuments implements ShouldQueue
{ {
$this->path = $path; $this->path = $path;
$this->type = $type; $this->type = $type;
$this->storage = Storage::disk('local');
$this->parserDocx = new ParseDocx();
$this->parserText = new ParseTextArray();
$this->parserXml = new ParseXml();
$this->parserHtml = new ParseHtml();
$this->parseHtmlArray = new ParseHtmlArray();
} }
/** /**
@ -71,19 +78,16 @@ class IngestDocuments implements ShouldQueue
*/ */
public function handle() public function handle()
{ {
$this->storage = Storage::disk('local');
$this->parserDocx = new ParseDocx();
$this->parserText = new ParseTextArray();
$this->parserXml = new ParseXml();
$this->parserHtml = new ParseHtml();
$this->parseHtmlArray = new ParseHtmlArray();
$convertor = new Convertor($this->path, $this->type); $convertor = new Convertor($this->path, $this->type);
$this->path = $convertor->execute(); $this->path = $convertor->execute();
$content = $this->getContent(); $content = $this->getContent();
$content = $this->convertToUTF8($content);
if ( ! $content) {
return;
}
// $content = $this->convertToUTF8($content);
try { try {
$filePath = $this->storeContent($content); $filePath = $this->storeContent($content);
@ -96,8 +100,11 @@ class IngestDocuments implements ShouldQueue
} }
} }
protected function failed()
public function failed()
{ {
Log::error('Ingest documents failed.');
// @TODO Delete docx, txt and md files.
if ($this->storage->exists($this->path)) { if ($this->storage->exists($this->path)) {
$this->storage->delete($this->path); $this->storage->delete($this->path);
} }
@ -133,7 +140,7 @@ class IngestDocuments implements ShouldQueue
} }
); );
return utf8_encode(json_encode($content));
return $content;
} }
protected function storeContent($content) protected function storeContent($content)

7
app/Jobs/SendToCore.php

@ -72,6 +72,13 @@ class SendToCore implements ShouldQueue
} }
} }
/**
 * Remove the file referenced by this job from storage.
 *
 * Does nothing when no file path is set. Presumably invoked by the queue
 * worker after the job has failed — confirm against the framework version.
 *
 * @return void
 */
public function failed()
{
    // Nothing was stored for this job, so there is nothing to clean up.
    if (! $this->filePath) {
        return;
    }

    $this->storage->delete($this->filePath);
}
/** /**
* Send the data to the core through webhooks * Send the data to the core through webhooks
* *

87
app/Parser/ParseTextArray.php

@ -125,7 +125,6 @@ class ParseTextArray
} else { } else {
Log::error('The given file dose not exists!'); Log::error('The given file dose not exists!');
} }
} }
@ -144,6 +143,7 @@ class ParseTextArray
$alreadyHandled = []; $alreadyHandled = [];
$countData = count($textAsArray); $countData = count($textAsArray);
for ($i = 0; $i < $countData; $i++) { for ($i = 0; $i < $countData; $i++) {
if (array_key_exists($i, $alreadyHandled)) { if (array_key_exists($i, $alreadyHandled)) {
continue; continue;
@ -154,16 +154,19 @@ class ParseTextArray
'content' => trim($textAsArray[ $i ]), 'content' => trim($textAsArray[ $i ]),
'spaces' => strlen($textAsArray[ $i ]) - strlen(ltrim($textAsArray[ $i ])) 'spaces' => strlen($textAsArray[ $i ]) - strlen(ltrim($textAsArray[ $i ]))
]; ];
//Remove numbering from the paragraph content //Remove numbering from the paragraph content
if ($numbering = $this->getNumbering($textAsArray[ $i ])) { if ($numbering = $this->getNumbering($textAsArray[ $i ])) {
$data[ $i ][ 'numbering' ] = $numbering; $data[ $i ][ 'numbering' ] = $numbering;
$data[ $i ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $i ][ 'content' ]), '.')); $data[ $i ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $i ][ 'content' ]), '.'));
} }
if ($this->pdf && strpos($textAsArray[ $i ], 'Page') !== false && strpos($textAsArray[ $i ], if ($this->pdf && strpos($textAsArray[ $i ], 'Page') !== false && strpos($textAsArray[ $i ],
'of') !== false) { 'of') !== false) {
$alreadyHandled[] = $i; $alreadyHandled[] = $i;
break; break;
} }
$j = $i + 1; $j = $i + 1;
if (isset($textAsArray[ $j ])) { if (isset($textAsArray[ $j ])) {
@ -171,67 +174,91 @@ class ParseTextArray
if (array_key_exists($j, $alreadyHandled)) { if (array_key_exists($j, $alreadyHandled)) {
continue; continue;
} }
if ($this->pdf && isset($textAsArray[ $j ]) && strpos($textAsArray[ $j ],
'Page') !== false && strpos($textAsArray[ $j ], 'of') !== false) {
if (
$this->pdf &&
isset($textAsArray[ $j ]) &&
strpos($textAsArray[ $j ], 'Page') !== false &&
strpos($textAsArray[ $j ], 'of') !== false
) {
$alreadyHandled[] = $j; $alreadyHandled[] = $j;
continue;
continue;
} }
//extract the content and count the number of the empty spaces from the begining
// Extract the content and count the number of the empty spaces from the beginning
$data[ $j ] = [ $data[ $j ] = [
'content' => trim($textAsArray[ $j ]), 'content' => trim($textAsArray[ $j ]),
'spaces' => strlen($textAsArray[ $j ]) - strlen(ltrim($textAsArray[ $j ])) 'spaces' => strlen($textAsArray[ $j ]) - strlen(ltrim($textAsArray[ $j ]))
]; ];
//Remove numbering from the paragraph content
// Remove numbering from the paragraph content
if ($numbering = $this->getNumbering($textAsArray[ $j ])) { if ($numbering = $this->getNumbering($textAsArray[ $j ])) {
$data[ $j ][ 'numbering' ] = $numbering; $data[ $j ][ 'numbering' ] = $numbering;
$data[ $j ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $j ][ 'content' ]),
'.'));
$data[ $j ][ 'content' ] = trim(
ltrim(str_replace($numbering, '', $data[ $j ][ 'content' ]), '.')
);
} }
//break if outh have numbering and the space is equal
if ($data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] && $this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && substr_count($data[ $i ][ 'numbering' ],
'.') == substr_count($data[ $j ][ 'numbering' ],
'.') && count(array_filter(str_split($data[ $i ][ 'numbering' ]),
'is_numeric')) == count(array_filter(str_split($data[ $j ][ 'numbering' ]),
'is_numeric'))) {
// Break if both have numbering and the space is equal
if (
$data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] &&
$this->hasNumbering($data[ $j ]) &&
$this->hasNumbering($data[ $i ]) &&
substr_count($data[ $i ][ 'numbering' ], '.') == substr_count($data[ $j ][ 'numbering' ], '.') &&
count(array_filter(str_split($data[ $i ][ 'numbering' ]), 'is_numeric')) == count(array_filter(str_split($data[ $j ][ 'numbering' ]), 'is_numeric'))) {
break; break;
} }
if ($this->hasNumbering($data[ $j ]) && ! $this->hasNumbering($data[ $i ]) && ! $data[ $i ][ 'spaces' ] && $data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] && ! in_array(substr($data[ $i ][ 'content' ],
-1), [':'])) {
if (
$this->hasNumbering($data[ $j ]) &&
! $this->hasNumbering($data[ $i ]) &&
! $data[ $i ][ 'spaces' ] &&
$data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] &&
! in_array(substr($data[ $i ][ 'content' ], -1), [':'])
) {
break; break;
} }
if ($this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1) {
if (
$this->hasNumbering($data[ $j ]) &&
$this->hasNumbering($data[ $i ]) &&
((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1
) {
break; break;
} }
if ($this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1) {
if (
$this->hasNumbering($data[ $j ]) &&
$this->hasNumbering($data[ $i ]) &&
((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1
) {
break; break;
} }
//Hardcoded breakpoints
if ($this->hasNumbering($data[ $j ]) && in_array(strtoupper(str_replace(['.', "\t", ""], '',
$data[ $j ][ 'content' ])), $this->breakPoints)) {
// Hardcoded breakpoints
if (
$this->hasNumbering($data[ $j ]) &&
in_array(strtoupper(str_replace(['.', "\t", ""], '', $data[ $j ][ 'content' ])), $this->breakPoints)
) {
break; break;
} }
//Hardcoded "Schedule break"
if (! $this->hasNumbering($data[ $j ]) && strpos(substr(trim(strtolower(utf8_encode($data[ $j ][ 'content' ]))),
0, 10), 'schedule') !== false) {
// Hardcoded "Schedule break"
if (
! $this->hasNumbering($data[ $j ]) &&
strpos(substr(trim(strtolower(utf8_encode($data[ $j ][ 'content' ]))), 0, 10), 'schedule') !== false
) {
break; break;
} }
if (! $this->hasNumbering($data[ $j ]) && strpos(substr(trim($data[ $j ][ 'content' ]), 0, 15),
'Exhibit') !== false && ! in_array(substr(trim($data[ $j ][ 'content' ]), -1), [
'.'
])) {
if (
! $this->hasNumbering($data[ $j ]) &&
strpos(substr(trim($data[ $j ][ 'content' ]), 0, 15), 'Exhibit') !== false &&
! in_array(substr(trim($data[ $j ][ 'content' ]), -1), ['.'])
) {
break; break;
} }

Loading…
Cancel
Save