Browse Source

Text conversion improvement

hidden_tags_with_bookmarks
Orzu Ionut 4 years ago
parent
commit
4b135c4fc4
  1. 1
      README.md
  2. 4
      app/Http/Controllers/IngestController.php
  3. 12
      app/Ingest/MDConvertor.php
  4. 27
      app/Jobs/IngestDocuments.php
  5. 7
      app/Jobs/SendToCore.php
  6. 87
      app/Parser/ParseTextArray.php

1
README.md

@ -24,6 +24,7 @@ apt-get update
apt-get install software-properties-common
add-apt-repository ppa:deadsnakes/ppa
apt-get install supervisor python3.8 python3.8-dev
apt-get install redis-server
supervisorctl restart all
curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py

4
app/Http/Controllers/IngestController.php

@ -18,9 +18,9 @@ class IngestController extends Controller
$handler->handle();
return response()->json([
return response()->json(array(
'status' => 'processing',
]);
));
} catch (\Exception $exception) {
return response()->json([
'status' => 'error',

12
app/Ingest/MDConvertor.php

@ -8,7 +8,7 @@ class MDConvertor
public function __construct($content)
{
$this->content = json_decode($content, true);
$this->content = $content;
}
public function execute()
@ -26,7 +26,7 @@ class MDConvertor
' ' .
(isset($paragraph['numbering']) ? $paragraph['numbering'] : '') .
' ' .
$paragraph['content'] .
$this->parseContent($paragraph['content']) .
"\n";
if (
@ -42,4 +42,12 @@ class MDConvertor
return $content;
}
/**
 * Strip zero-width characters from a paragraph and convert it to ISO-8859-1.
 *
 * @param string $content Paragraph text; presumably UTF-8 encoded (verify against caller).
 *
 * @return string The cleaned text as returned by utf8_decode().
 */
protected function parseContent($content)
{
    // Byte-level pass: removes the UTF-8 byte sequence of U+200B (zero width space).
    // It has no "u" modifier, so it also works when $content is not valid UTF-8.
    $content = preg_replace("/\xE2\x80\x8B/", "", $content);

    // Unicode pass: removes U+200B..U+200D and U+FEFF. With the "u" modifier
    // preg_replace() returns null for a malformed UTF-8 subject; in that case
    // keep the text from the first pass instead of discarding the paragraph.
    $stripped = preg_replace("/[\x{200B}-\x{200D}\x{FEFF}]/u", "", $content);
    if ($stripped !== null) {
        $content = $stripped;
    }

    // NOTE(review): utf8_decode() maps characters outside ISO-8859-1 to "?" and
    // is deprecated as of PHP 8.2 — confirm that Latin-1 output is intended here.
    return utf8_decode($content);
}
}

27
app/Jobs/IngestDocuments.php

@ -62,6 +62,13 @@ class IngestDocuments implements ShouldQueue
{
$this->path = $path;
$this->type = $type;
$this->storage = Storage::disk('local');
$this->parserDocx = new ParseDocx();
$this->parserText = new ParseTextArray();
$this->parserXml = new ParseXml();
$this->parserHtml = new ParseHtml();
$this->parseHtmlArray = new ParseHtmlArray();
}
/**
@ -71,19 +78,16 @@ class IngestDocuments implements ShouldQueue
*/
public function handle()
{
$this->storage = Storage::disk('local');
$this->parserDocx = new ParseDocx();
$this->parserText = new ParseTextArray();
$this->parserXml = new ParseXml();
$this->parserHtml = new ParseHtml();
$this->parseHtmlArray = new ParseHtmlArray();
$convertor = new Convertor($this->path, $this->type);
$this->path = $convertor->execute();
$content = $this->getContent();
$content = $this->convertToUTF8($content);
if ( ! $content) {
return;
}
// $content = $this->convertToUTF8($content);
try {
$filePath = $this->storeContent($content);
@ -96,8 +100,11 @@ class IngestDocuments implements ShouldQueue
}
}
protected function failed()
public function failed()
{
Log::error('Ingest documents failed.');
// @TODO Delete docx, txt and md files.
if ($this->storage->exists($this->path)) {
$this->storage->delete($this->path);
}
@ -133,7 +140,7 @@ class IngestDocuments implements ShouldQueue
}
);
return utf8_encode(json_encode($content));
return $content;
}
protected function storeContent($content)

7
app/Jobs/SendToCore.php

@ -72,6 +72,13 @@ class SendToCore implements ShouldQueue
}
}
/**
 * Clean up after a failed job by deleting the file referenced by
 * $this->filePath from $this->storage.
 *
 * Presumably invoked by the Laravel queue worker when the job fails
 * (the class implements ShouldQueue) — confirm against the framework version in use.
 *
 * @return void
 */
public function failed()
{
    // No file path recorded, so there is nothing to remove.
    if (! $this->filePath) {
        return;
    }

    $this->storage->delete($this->filePath);
}
/**
* Send the data to the core through webhooks
*

87
app/Parser/ParseTextArray.php

@ -125,7 +125,6 @@ class ParseTextArray
} else {
Log::error('The given file dose not exists!');
}
}
@ -144,6 +143,7 @@ class ParseTextArray
$alreadyHandled = [];
$countData = count($textAsArray);
for ($i = 0; $i < $countData; $i++) {
if (array_key_exists($i, $alreadyHandled)) {
continue;
@ -154,16 +154,19 @@ class ParseTextArray
'content' => trim($textAsArray[ $i ]),
'spaces' => strlen($textAsArray[ $i ]) - strlen(ltrim($textAsArray[ $i ]))
];
//Remove numbering from the paragraph content
if ($numbering = $this->getNumbering($textAsArray[ $i ])) {
$data[ $i ][ 'numbering' ] = $numbering;
$data[ $i ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $i ][ 'content' ]), '.'));
}
if ($this->pdf && strpos($textAsArray[ $i ], 'Page') !== false && strpos($textAsArray[ $i ],
'of') !== false) {
$alreadyHandled[] = $i;
break;
}
$j = $i + 1;
if (isset($textAsArray[ $j ])) {
@ -171,67 +174,91 @@ class ParseTextArray
if (array_key_exists($j, $alreadyHandled)) {
continue;
}
if ($this->pdf && isset($textAsArray[ $j ]) && strpos($textAsArray[ $j ],
'Page') !== false && strpos($textAsArray[ $j ], 'of') !== false) {
if (
$this->pdf &&
isset($textAsArray[ $j ]) &&
strpos($textAsArray[ $j ], 'Page') !== false &&
strpos($textAsArray[ $j ], 'of') !== false
) {
$alreadyHandled[] = $j;
continue;
continue;
}
//extract the content and count the number of the empty spaces from the begining
// Extract the content and count the number of the empty spaces from the beginning
$data[ $j ] = [
'content' => trim($textAsArray[ $j ]),
'spaces' => strlen($textAsArray[ $j ]) - strlen(ltrim($textAsArray[ $j ]))
];
//Remove numbering from the paragraph content
// Remove numbering from the paragraph content
if ($numbering = $this->getNumbering($textAsArray[ $j ])) {
$data[ $j ][ 'numbering' ] = $numbering;
$data[ $j ][ 'content' ] = trim(ltrim(str_replace($numbering, '', $data[ $j ][ 'content' ]),
'.'));
$data[ $j ][ 'content' ] = trim(
ltrim(str_replace($numbering, '', $data[ $j ][ 'content' ]), '.')
);
}
//break if outh have numbering and the space is equal
if ($data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] && $this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && substr_count($data[ $i ][ 'numbering' ],
'.') == substr_count($data[ $j ][ 'numbering' ],
'.') && count(array_filter(str_split($data[ $i ][ 'numbering' ]),
'is_numeric')) == count(array_filter(str_split($data[ $j ][ 'numbering' ]),
'is_numeric'))) {
// Break if both have numbering and the space is equal
if (
$data[ $j ][ 'spaces' ] == $data[ $i ][ 'spaces' ] &&
$this->hasNumbering($data[ $j ]) &&
$this->hasNumbering($data[ $i ]) &&
substr_count($data[ $i ][ 'numbering' ], '.') == substr_count($data[ $j ][ 'numbering' ], '.') &&
count(array_filter(str_split($data[ $i ][ 'numbering' ]), 'is_numeric')) == count(array_filter(str_split($data[ $j ][ 'numbering' ]), 'is_numeric'))) {
break;
}
if ($this->hasNumbering($data[ $j ]) && ! $this->hasNumbering($data[ $i ]) && ! $data[ $i ][ 'spaces' ] && $data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] && ! in_array(substr($data[ $i ][ 'content' ],
-1), [':'])) {
if (
$this->hasNumbering($data[ $j ]) &&
! $this->hasNumbering($data[ $i ]) &&
! $data[ $i ][ 'spaces' ] &&
$data[ $j ][ 'spaces' ] > $data[ $i ][ 'spaces' ] &&
! in_array(substr($data[ $i ][ 'content' ], -1), [':'])
) {
break;
}
if ($this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1) {
if (
$this->hasNumbering($data[ $j ]) &&
$this->hasNumbering($data[ $i ]) &&
((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1
) {
break;
}
if ($this->hasNumbering($data[ $j ]) && $this->hasNumbering($data[ $i ]) && ((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1) {
if (
$this->hasNumbering($data[ $j ]) &&
$this->hasNumbering($data[ $i ]) &&
((float) $data[ $j ][ 'numbering' ] - (float) $data[ $i ][ 'numbering' ]) >= 1
) {
break;
}
//Hardcoded breakpoints
if ($this->hasNumbering($data[ $j ]) && in_array(strtoupper(str_replace(['.', "\t", ""], '',
$data[ $j ][ 'content' ])), $this->breakPoints)) {
// Hardcoded breakpoints
if (
$this->hasNumbering($data[ $j ]) &&
in_array(strtoupper(str_replace(['.', "\t", ""], '', $data[ $j ][ 'content' ])), $this->breakPoints)
) {
break;
}
//Hardcoded "Schedule break"
if (! $this->hasNumbering($data[ $j ]) && strpos(substr(trim(strtolower(utf8_encode($data[ $j ][ 'content' ]))),
0, 10), 'schedule') !== false) {
// Hardcoded "Schedule break"
if (
! $this->hasNumbering($data[ $j ]) &&
strpos(substr(trim(strtolower(utf8_encode($data[ $j ][ 'content' ]))), 0, 10), 'schedule') !== false
) {
break;
}
if (! $this->hasNumbering($data[ $j ]) && strpos(substr(trim($data[ $j ][ 'content' ]), 0, 15),
'Exhibit') !== false && ! in_array(substr(trim($data[ $j ][ 'content' ]), -1), [
'.'
])) {
if (
! $this->hasNumbering($data[ $j ]) &&
strpos(substr(trim($data[ $j ][ 'content' ]), 0, 15), 'Exhibit') !== false &&
! in_array(substr(trim($data[ $j ][ 'content' ]), -1), ['.'])
) {
break;
}

Loading…
Cancel
Save