You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
92 lines
2.0 KiB
92 lines
2.0 KiB
<?php
|
|
|
|
namespace App\Ingest;
|
|
|
|
use Illuminate\Support\Facades\File;
|
|
use Symfony\Component\Process\Exception\ProcessFailedException;
|
|
use Symfony\Component\Process\Process;
|
|
use thiagoalessio\TesseractOCR\TesseractOCR;
|
|
|
|
/**
 * Pre-processes an image file and extracts its text with Tesseract.
 *
 * Pre-processing runs external command-line tools against the image; when a
 * tool produces a new file, the instance is repointed at that file so the
 * following steps (and the final OCR pass) read the processed image.
 */
class OCR
{
    /**
     * Path of the image currently being worked on.
     *
     * Starts as the path given to the constructor and may be replaced by
     * the output of a pre-processing step.
     */
    protected $path;

    /**
     * @param string $path Path to the image file to read.
     */
    public function __construct($path)
    {
        $this->path = $path;
    }

    /**
     * Run the pre-processing steps, then return the text read from the image.
     *
     * @return string The value produced by TesseractOCR::run().
     *
     * @throws ProcessFailedException When a pre-processing command exits unsuccessfully.
     */
    public function execute()
    {
        $this->preProcess();

        return $this->extractText();
    }

    /**
     * Apply the enabled image clean-up steps in order.
     *
     * Only dewarping is active; applyDeskew() exists but is not called here.
     */
    protected function preProcess()
    {
        $this->applyDewarp();
    }

    /**
     * Run the bundled page_dewarp.py script on the current image.
     *
     * If a "<name>_thresh.png" file exists next to the input afterwards, it
     * becomes the current image; otherwise the current path is left as is.
     *
     * @throws ProcessFailedException When the script exits unsuccessfully.
     */
    protected function applyDewarp()
    {
        $this->runOrFail([
            'python3',
            resource_path('python/dewarp/page_dewarp.py'),
            $this->path,
        ]);

        $dewarpedPath = sprintf(
            '%s/%s_thresh.png',
            pathinfo($this->path, PATHINFO_DIRNAME),
            pathinfo($this->path, PATHINFO_FILENAME)
        );

        // The script does not always produce an output file (for instance
        // when the image has no text), so only switch when one is present.
        if (File::exists($dewarpedPath)) {
            $this->path = $dewarpedPath;
        }
    }

    /**
     * Run the bundled deskew binary, writing "deskewed.png" beside the input.
     *
     * On success the current image becomes that output file.
     *
     * @throws ProcessFailedException When the binary exits unsuccessfully.
     */
    protected function applyDeskew()
    {
        $deskewedPath = pathinfo($this->path, PATHINFO_DIRNAME) . '/deskewed.png';

        $this->runOrFail([
            resource_path('libraries/deskew/Bin/deskew'),
            $this->path,
            '-o',
            $deskewedPath,
        ]);

        $this->path = $deskewedPath;
    }

    /**
     * Read the text of the current image with Tesseract.
     *
     * @return string The value produced by TesseractOCR::run().
     */
    protected function extractText()
    {
        // Page segmentation mode 4; the OCR engine mode is not overridden.
        return (new TesseractOCR($this->path))
            ->psm(4)
            ->run();
    }

    /**
     * Run an external command to completion.
     *
     * @param array $command Executable followed by its arguments.
     *
     * @throws ProcessFailedException When the command exits unsuccessfully.
     */
    private function runOrFail(array $command)
    {
        (new Process($command))->mustRun();
    }
}
|