Repo for the search and displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) for use with search and displace operations.
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 

92 lines
2.0 KiB

<?php
namespace App\Ingest;
use Illuminate\Support\Facades\File;
use Symfony\Component\Process\Exception\ProcessFailedException;
use Symfony\Component\Process\Process;
use thiagoalessio\TesseractOCR\TesseractOCR;
/**
 * Runs optical character recognition on a single image file.
 *
 * The image is pre-processed first (currently: dewarped by an external
 * Python script) and the resulting image is then handed to Tesseract.
 */
class OCR
{
    /**
     * Path of the image the next pipeline step will read.
     * Pre-processing steps repoint this at the file they produced.
     *
     * @var string
     */
    protected $path;

    /**
     * @param string $path Path of the image to recognise.
     */
    public function __construct($path)
    {
        $this->path = $path;
    }

    /**
     * Pre-processes the image and returns the text recognised in it.
     *
     * @return mixed The value returned by TesseractOCR::run().
     *
     * @throws ProcessFailedException When a pre-processing command exits unsuccessfully.
     */
    public function execute()
    {
        $this->preProcess();

        return $this->extractText();
    }

    /**
     * Applies the image clean-up steps that run before recognition.
     *
     * Only dewarping is active; applyDeskew() is kept available but is not
     * part of the pipeline at the moment.
     */
    protected function preProcess()
    {
        $this->applyDewarp();
    }

    /**
     * Runs the bundled page_dewarp.py script on the current image.
     *
     * If "<filename>_thresh.png" exists next to the input afterwards, it
     * becomes the current image; otherwise the current image is kept.
     *
     * @throws ProcessFailedException When the script exits with a non-zero code.
     */
    protected function applyDewarp()
    {
        $dewarp = new Process([
            'python3',
            resource_path('python/dewarp/page_dewarp.py'),
            $this->path,
        ]);

        // mustRun() runs the process and throws ProcessFailedException on failure.
        $dewarp->mustRun();

        $dewarpedPath = sprintf(
            '%s/%s_thresh.png',
            pathinfo($this->path, PATHINFO_DIRNAME),
            pathinfo($this->path, PATHINFO_FILENAME)
        );

        // The file may not be created by the library for various reasons,
        // including if the image does not have text.
        if (!File::exists($dewarpedPath)) {
            return;
        }

        $this->path = $dewarpedPath;
    }

    /**
     * Runs the bundled deskew binary on the current image, writing the result
     * to "deskewed.png" in the same directory and making it the current image.
     *
     * @throws ProcessFailedException When the binary exits with a non-zero code.
     */
    protected function applyDeskew()
    {
        $binary = resource_path('libraries/deskew/Bin/deskew');
        $deskewedPath = pathinfo($this->path, PATHINFO_DIRNAME) . '/deskewed.png';

        $deskew = new Process([
            $binary,
            $this->path,
            '-o',
            $deskewedPath,
        ]);
        $deskew->mustRun();

        $this->path = $deskewedPath;
    }

    /**
     * Hands the current image to Tesseract and returns the result.
     *
     * Page segmentation mode is set to 4; the OCR engine mode is not set
     * explicitly.
     *
     * @return mixed The value returned by TesseractOCR::run().
     */
    protected function extractText()
    {
        $tesseract = new TesseractOCR($this->path);
        $tesseract->psm(4);

        return $tesseract->run();
    }
}