diff --git a/.env.example b/.env.example index 5490aaf..5955a12 100644 --- a/.env.example +++ b/.env.example @@ -8,16 +8,16 @@ LOG_CHANNEL=stack BROADCAST_DRIVER=log CACHE_DRIVER=file -QUEUE_CONNECTION=sync -SESSION_DRIVER=file +QUEUE_CONNECTION=redis +SESSION_DRIVER=redis SESSION_LIFETIME=120 REDIS_HOST=127.0.0.1 REDIS_PASSWORD=null REDIS_PORT=6379 -REDIS_QUEUE= +REDIS_QUEUE=sd_ingest WEBHOOK_CORE_URL= WEBHOOK_CORE_SECRET= -USER_HOME_PATH= +USER_HOME_PATH=/tmp diff --git a/README.md b/README.md index 92553c5..e9b1ec1 100644 --- a/README.md +++ b/README.md @@ -1,67 +1,67 @@ -## About S&D Ingest - -S&D INGEST it's the module that receives row files in different formats and send's them to any module after the file's are being processed. +## Search and Displace Ingest ## :cyclone: Server Requirements: - php7.4 [https://www.php.net] [LICENSE](https://www.php.net/license/index.php) - apache [https://httpd.apache.org] [LICENSE](hhttps://www.apache.org/licenses/LICENSE-2.0) -- redis [https://redis.io] [LICENSE](https://redislabs.com/legal/licenses/) -- postgresql-server [https://www.postgresql.org] [LICENSE](https://tldrlegal.com/license/postgresql-license-(postgresql)) -- supervisor [http://supervisord.org] [LICENSE](https://github.com/Supervisor/supervisor/blob/master/LICENSES.txt) -- libraoffice [https://www.libreoffice.org] [LICENSE](https://www.libreoffice.org/about-us/licenses) -- python [https://www.python.org/] [LICENSE](https://www.python.org/download/releases/2.7/license/) -- pdftotext [https://github.com/jalan/pdftotext] [LICENSE](https://github.com/jalan/pdftotext/blob/master/LICENSE) - +- python 3.8 [https://www.python.org/] [LICENSE](https://docs.python.org/3/license.html) +- composer [https://getcomposer.org/] [LICENSE](https://github.com/composer/composer/blob/main/LICENSE) ## :zap: Build with: - Laravel Framework ^6.2 ## :rocket: Installation +### Ubuntu Packages ```bash +# LibreOffice +apt-get install python-software-properties +apt-add-repository 
ppa:libreoffice/ppa +apt-get update +apt-get install libreoffice + +# Python apt-get update apt-get install software-properties-common add-apt-repository ppa:deadsnakes/ppa apt-get install supervisor python3.8 python3.8-dev -apt-get install redis-server -supervisorctl restart all -curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py -python get-pip.py -rm -rf get-pip.py -apt install libpoppler-cpp-dev -pip install --upgrade pip -pip install pdftotext supervisor -systemctl enable supervisor -php artisan queue:deploy-supervisor -systemctl restart supervisor -composer install -npm install -cp .env.example .env -php artisan key:generate -sudo -u postgres psql -postgres=# create database mydb; -postgres=# create user myuser with encrypted password 'mypass'; -postgres=# grant all privileges on database mydb to myuser; -#update the .env with the current postgres credentials -sudo mkdir /var/log/amqp -sudo mkdir /var/log/queue -php artisan migrate -php artisan queue:deploy-supervisor -supervisorctl start all +# Redis +apt-get install redis-server # PDF Convertor +apt-get install libpoppler-cpp-dev apt-get install poppler-utils # Tesseract OCR add-apt-repository ppa:alex-p/tesseract-ocr-devel apt-get update -apt install tesseract-ocr +apt-get install tesseract-ocr # Unpaper apt-get install unpaper +# DOCX to PDF Convertor +apt-get install unoconv +``` + +### Libraries Packages +```bash +# Pip +curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py +python get-pip.py +rm -rf get-pip.py +pip install --upgrade pip + +# Pdftotext +pip install pdftotext + +# Supervisor +pip install supervisor +systemctl enable supervisor +mkdir /var/log/amqp +mkdir /var/log/queue + # Deskew cd DESKEW_INSTALLATION_DIRECTORY cd Bin @@ -72,450 +72,41 @@ pip3 install opencv-python cd DEWARP_INSTALLATION_DIRECTORY pip3 install -r requirements.txt +``` -# MAT2 (Metadata remover) - Not used at the moment -pip3 install mat2 -apt-get install gir1.2-poppler-0.18 +### Install app +```bash +# Generate 
environment file +cp .env.example .env -# DOCX to PDF Convertor -apt-get install unoconv +# Install backend packages +composer install -``` +# Generate app key +php artisan key:generate + +# Change the value for the QUEUE_CONNECTION to redis, if it is not set already -## Local Usage +# Deploy supervisor +php artisan queue:deploy-supervisor -```python -php artisan serve -php artisan queue:work +supervisorctl start all ``` + +### Search and Displace Core Setup +- Install the `Search and Displace Core` app, found here https://git.law/newroco/searchanddisplace-core +- Get the URL of the `Search and Displace Core` app and add it to the `WEBHOOK_CORE_URL` variable in `.env` +- Add in `.env` the `WEBHOOK_CORE_SECRET` value which needs to be the same value as the `WEBHOOK_CLIENT_SECRET` in + the `Search and Displace Core` app's `.env` file + ## PHP Packages +- cebe/markdown [LICENSE](https://github.com/cebe/markdown/blob/master/LICENSE) - fideloper/proxy [LICENSE](https://github.com/fideloper/TrustedProxy/blob/master/LICENSE.md) - laravel/framework [LICENSE](https://github.com/laravel/framework/blob/7.x/LICENSE.md) - laravel/tinker [LICENSE](https://github.com/laravel/tinker/blob/2.x/LICENSE.md) +- league/html-to-markdown [LICENSE](https://github.com/thephpleague/html-to-markdown/blob/master/LICENSE) - phpoffice/phpword [LICENSE](https://github.com/PHPOffice/PHPWord/blob/0.17.0/LICENSE) - predis/predis [LICENSE](https://github.com/php-enqueue/amqp-bunny/blob/master/LICENSE) - spatie/laravel-webhook-server [LICENSE](https://github.com/spatie/laravel-webhook-server/blob/master/LICENSE.md) - -## Current running process -- [DOC,DOCX,RTF etc..] 
are first being converted to docx and then converted to .txt using https://www.libreoffice.org -- [PDF] files are converted to .txt -- The resulting .txt file is processed using our own logic/alogorithm and clause breaking point to an array that looks similar to this: -```json - { - "content": "Definitions and Interpretation", - "spaces": 4, - "numbering": "1.", - "children": [ - { - "content": "In this Agreement, the following expressions shall have the meanings set opposite them, unless inconsistent with the context or otherwise specified:", - "spaces": 8, - "numbering": "1.1", - "children": [ - { - "content": "“Agreement” this agreement including all schedules, appendices and exhibits attached herein;", - "spaces": 0 - }, - { - "content": "“Associated Company” any company which is listed in Schedule 2 (as may be amended from time to time in writing) and which is in relation to either party its Parent undertaking or its subsidiary undertaking or a subsidiary undertaking of its Parent undertaking or any other person controlled by it or under the same control (where “control” is to be construed in accordance with section 1124 of the Corporation Tax Act 2010) whether direct or indirect. 
“Parent undertaking” shall have the meaning attributed thereto in Section 1162 of the Companies Act 2006;", - "spaces": 0 - }, - { - "content": "“Commencement Date” [TBC] “Confidential Information” collectively and individually, all or any document or information of any nature in any format, including oral, written or electronic form relating to either party or their Associated Companies’ or either of their businesses, including technology, Customers, Customer Information, supplier, employees, finances, data, products, services, trade secrets, processes, designs, drawings, diagrams, plans, specifications, formulae, testing procedures, computer software, reports, investigative studies, manuals, assets, costs, prices, marketing opportunities, proprietary information, Know-how, the terms of this Agreement and any other information or material relating to the information described above which (i) is disclosed by either party (or by any individual or legal entity acting in their name or on their behalf, including employees, consultants, sub-contractors, advisors of any kind and agents) or (ii) which comes to the attention of either party (or any individual or legal entity acting in their name or on their behalf, including employees, consultants, sub-contractors, advisors of any kind and agents) during the course of the carrying out of the rights or obligations under this Agreement;", - "spaces": 0 - }, - { - "content": "“Customers” customers of {P1_Name} and/or its Associated Companies from time to time who owe a Debt to {P1_Name};", - "spaces": 0 - }, - { - "content": "“Customer Data” any information given by the Customer directly to the {P2_Name} or its personnel;", - "spaces": 0 - }, - { - "content": "“Customer Information” any Customers’ personal information supplied to the {P2_Name} by or on behalf of {P1_Name} during the performance of this Agreement, including Personal Data, but excluding Customer Data;", - "spaces": 0 - }, - { - "content": "“Data Protection 
Legislation” all applicable legislation concerning the protection of individuals with regard to the processing of Personal Data and the free movement of such data including the Data Protection Act 1998 and any regulations made under such legislation and any relevant codes of practice and guidance notes issued from time to time by the Information Commissioner (or its successor);", - "spaces": 0 - }, - { - "content": "“Debt” any monies owed by the Customer to {P1_Name} which have remained unpaid by the Customer contrary to the terms and conditions between {P1_Name} and the Customer governing the repayment of sums owed;", - "spaces": 0 - }, - { - "content": "“Debt Management Plan” a plan outlined by the {P2_Name} and agreed by the Customer which details the amount and frequency of payments to be made to each of the Customer’s creditors;", - "spaces": 0 - }, - { - "content": "“Disbursement” total amount remitted to {P1_Name} on a monthly basis by the {P2_Name} for application to the Customer’s account in respect of their Offer;", - "spaces": 0 - }, - { - "content": "“European Economic Area” the European Economic Area comprising of the following countries as at the Commencement Date: Austria; Belgium; Bulgaria; Cyprus; the Czech Republic; Denmark; Estonia; Finland; France; Germany; Greece; Hungary; Ireland; Italy; Latvia; Lithuania; Luxembourg; Malta; the Netherlands; Poland; Portugal; Romania; Slovakia; Slovenia; Spain; Sweden; the United Kingdom; Iceland; Liechtenstein and Norway, as amended from time to time;", - "spaces": 0 - }, - { - "content": "“EU Model Terms” the set of model contractual clauses which the Information Commissioner has authorised for use by Data Controllers (as such term is defined in the Data Protection Legislation) established in the European Union where there is a transfer of Personal Data to Data Processors (as such term is defined in the Data Protection Legislation) outside of the European Economic Area;", - "spaces": 0 - }, - { - "content": 
"“Facility” the {P2_Name} site authorised by {P1_Name} where the processing and/or storage of Personal Data supplied by {P1_Name} pursuant to this Agreement takes place. For the purposes of this Agreement that site shall be located at {P1_Reg} or such other place as may be notified in writing to {P1_Name} from time to time;", - "spaces": 0 - }, - { - "content": "“Force Majeure” any acts, events, omissions or accidents beyond the reasonable control of either Party, including but not limited to acts of God, extreme adverse weather conditions or natural disaster, war, threat of or preparation for war, armed conflict, imposition of sanctions, embargo, breaking off of diplomatic relations or similar actions, terrorist attack, civil war, civil commotion or riots, nuclear, chemical or biological contamination or sonic boom, compliance with any law, regulation or directive, fire, explosion or accidental damage, failure of plant machinery, machinery, computers or vehicles;", - "spaces": 0 - }, - { - "content": "“Information Commissioner” the independent authority in the UK (or its successor body) which regulates information rights;", - "spaces": 0 - }, - { - "content": "“Initial Period” three (3) years from the Commencement Date;", - "spaces": 0 - }, - { - "content": "“Lending Code” a voluntary code of practice (enforced by the Lending Standards Board) which sets standards for financial institutions and provides consumers with protection and explanation on how such institutions are expected to deal with them day-to-day and in times of financial difficulties;", - "spaces": 0 - }, - { - "content": "“Notification” written notification from the {P2_Name} to {P1_Name} that it has obtained Permission;", - "spaces": 0 - }, - { - "content": "“Offer” a statement of proposed amount to be repaid by the Customer to {P1_Name} in respect of the Customer’s Debt including instalment plans;", - "spaces": 0 - }, - { - "content": "“Payment Break” instance where the Customer fails to make an 
agreed repayment to the {P2_Name} for payment to their creditors;", - "spaces": 0 - }, - { - "content": "“Permission” written confirmation (which may be confirmation by email or other electronic means) from the Customer to the {P2_Name} that they are appointing the {P2_Name} to act on the Customer’s behalf in the management of the Customer’s Debt and authorising the {P2_Name} to negotiate payment terms with {P1_Name} in respect of the Customer’s Debt and authorising the {P2_Name} to have access to Customer Information;", - "spaces": 0 - }, - { - "content": "“Personal Data” personal data as defined in the Data Protection Legislation;", - "spaces": 0 - }, - { - "content": "“Regulatory Authorities” any body who, from time to time, has competent rule-making, investigatory and/or enforcement powers in relation to the business of {P1_Name} and/or its Associated Companies, including, without limitation, the Financial Conduct Authority, the Consumer Financial Protection Bureau, the Office of Fair Trading, the Information Commissioner’s Office, the Lending Standards Board, UK and US Government departments and organisations, the Office of the Comptroller of Currency, the Federal Reserve and other governmental or non-governmental regulatory authorities in the UK, US or other competent jurisdictions;", - "spaces": 0 - }, - { - "content": "“Regulatory Requirements”", - "spaces": 0 - }, - { - "content": "(a) all applicable laws, statutes, regulations, ordinances or subordinate legislation in force from time to time to which this Agreement or a party is subject;", - "spaces": 4 - }, - { - "content": "(b) the common law as applicable to the parties from time to time;", - "spaces": 4 - }, - { - "content": "(c) all binding court orders, judgements or decrees;", - "spaces": 4 - }, - { - "content": "all applicable directives, policies, rules, orders, code of conduct or practice or applicable guidance (including the Lending Code and the Financial Conduct Authority TCF principles that 
are binding on a party and that are made or given by any government, an agency thereof, any Regulatory Authority or other regulatory authority, including in the case of the {P2_Name}, laws and rules imposed by local regulatory authorities in the country where it is located;", - "spaces": 0 - }, - { - "content": "“Working Day” any day on which banks in London are open for the transaction of normal business excluding Saturdays, Sundays and bank and public holidays in England and Wales.", - "spaces": 0 - } - ] - }, - { - "content": "In this Agreement:", - "spaces": 8, - "numbering": "1.2", - "children": [ - { - "content": "references to Recitals, Clauses and Schedules and their sub-divisions are to the Recitals to, Clauses of and Schedules to this Agreement and their sub-divisions respectively, unless specified otherwise;", - "spaces": 12, - "numbering": "1.2.1" - }, - { - "content": "the index and headings are included for convenience only and shall not affect the construction or interpretation of this Agreement;", - "spaces": 12, - "numbering": "1.2.2" - }, - { - "content": "words importing gender include the other gender and the singular includes the plural and vice versa;", - "spaces": 12, - "numbering": "1.2.3" - }, - { - "content": "references to persons include individuals, bodies corporate, firms, unincorporated associations and governmental, semi-governmental and local authorities or agencies;", - "spaces": 12, - "numbering": "1.2.4" - }, - { - "content": "references to the words “include”, “including”, “in particular” or similar words or expressions will be construed without limitation and accordingly will not limit the words preceding or following them;", - "spaces": 12, - "numbering": "1.2.5" - }, - { - "content": "where expressions used in this Agreement are not specifically defined and are capable of having a special meaning according to the usage or custom of the card services sector or banking services sector, such expressions are to be interpreted 
accordingly. Any meaning given in this Agreement to a defined term shall prevail over such other special meaning;", - "spaces": 12, - "numbering": "1.2.6" - }, - { - "content": "references to a “party” or “parties” will mean either {P1_Name} and/or the {P2_Name} as the context requires and references to a third party will mean any person other than the parties;", - "spaces": 12, - "numbering": "1.2.7" - }, - { - "content": "except where expressly stated otherwise, references to any statute, legislation, code of practice or other regulation will include any sub-ordinate legislation and any equivalent regulation in any relevant jurisdiction, as amended, modified, consolidated, re-enacted and/or replaced and in force from time to time;", - "spaces": 12, - "numbering": "1.2.8" - }, - { - "content": "any negative obligation imposed on any party shall be construed as if it were also an obligation not to permit or suffer the act or thing in question and any positive obligation imposed on any party shall be construed as if it were also an obligation to procure that the act or thing in question be done;", - "spaces": 12, - "numbering": "1.2.9" - }, - { - "content": "the Schedules and Appendices (if any) form part of this Agreement and shall be construed and have the same full force and effect as if expressly set out in the body of this Agreement. 
To the extent only of any conflict or inconsistency between the Clauses, Schedules and Appendices (if any), the Clauses will prevail and the order of precedence will be as follows:", - "spaces": 12, - "numbering": "1.2.10" - }, - { - "content": "1 the provisions of the Clauses;", - "spaces": 16, - "numbering": "1.2.10" - }, - { - "content": "2 the provisions of the Schedules; and", - "spaces": 16, - "numbering": "1.2.10" - }, - { - "content": "3 the provisions of the Appendices.", - "spaces": 16, - "numbering": "1.2.10" - } - ] - } - ] - }, - { - "content": "Obligations of the {P2_Name}", - "spaces": 4, - "numbering": "2.", - "children": [ - { - "content": "The {P2_Name} shall obtain the Permission from the Customer before proceeding with the Debt Management Plan.", - "spaces": 8, - "numbering": "2.1" - }, - { - "content": "Subject at all times to the {P2_Name} being in receipt of the applicable Permission, {P2_Name} shall provide the corresponding Notification to {P1_Name} before or at the time of making the first Offer to {P1_Name}. 
In the absence of such Permission or Notification {P1_Name} shall not be obliged to provide any Customer Information to the {P2_Name}.", - "spaces": 8, - "numbering": "2.2" - }, - { - "content": "{P1_Name} may request, and the {P2_Name} shall provide, any Permission to {P1_Name} within two (2) Working Days of such request by {P1_Name} to enable {P1_Name} to verify the Permissions stated in the Notifications provided that in the event that {P1_Name} requests ten (10) or more Permissions in any 12 hour period then the {P2_Name} shall provide such Permissions as promptly as is reasonably possible.", - "spaces": 8, - "numbering": "2.3" - }, - { - "content": "Any delay or failure by the {P2_Name} to comply with Clause 2.3 shall be deemed a material breach of this Agreement and the provisions of clause 9.3 shall apply.", - "spaces": 8, - "numbering": "2.4" - }, - { - "content": "Subject to Clause 2.1 and in accordance with the Debt Management Plan, the {P2_Name} shall make an Offer to {P1_Name} for the repayment of the Debt detailing the amount and frequency of proposed payments. 
Such Offer will be made in accordance with the Lending Code guidelines and based upon the principle of equitable distribution of available income (after priority payments) in line with the amount outstanding to each creditor.", - "spaces": 8, - "numbering": "2.5" - }, - { - "content": "Upon receipt of the Offer from the {P2_Name} {P1_Name} may either;", - "spaces": 8, - "numbering": "2.6", - "children": [ - { - "content": "accept the Offer; or", - "spaces": 12, - "numbering": "2.6.1" - }, - { - "content": "reject the Offer where it considers the offer to be unreasonable by written notice to the {P2_Name}.", - "spaces": 12, - "numbering": "2.6.2" - } - ] - }, - { - "content": "In the event that {P1_Name} accepts an Offer, then the {P2_Name} shall arrange for the Disbursement to be repaid to {P1_Name} in accordance with the Offer within five (5) Working Days of receipt by the {P2_Name} of cleared funds from the Customer.", - "spaces": 8, - "numbering": "2.7" - }, - { - "content": "In the event that {P1_Name} rejects the Offer, then the {P2_Name} shall review the Debt Management Plan and the {P2_Name} may make a new Offer to {P1_Name}.", - "spaces": 8, - "numbering": "2.8" - }, - { - "content": "For the avoidance of doubt nothing in this Agreement constitutes an obligation on {P1_Name} to accept any unreasonable Offer made by the {P2_Name}.", - "spaces": 8, - "numbering": "2.9" - }, - { - "content": "The {P2_Name} shall notify {P1_Name} in writing as soon as reasonably possible:", - "spaces": 8, - "numbering": "2.10", - "children": [ - { - "content": "upon becoming aware of any withdrawal of a Permission or any amendment thereto made by a Customer; and", - "spaces": 12, - "numbering": "2.10.1" - }, - { - "content": "of any circumstance or event which is reasonably likely to materially affect the {P2_Name}’s ability to comply with its obligations under this Agreement.", - "spaces": 12, - "numbering": "2.10.2" - } - ] - }, - { - "content": "Failure by the {P2_Name} to 
notify {P1_Name} pursuant to Clause 2.10.1 shall be deemed a material breach of this Agreement and the provisions of clause 9.3 shall apply.", - "spaces": 8, - "numbering": "2.11" - }, - { - "content": "The {P2_Name} shall;", - "spaces": 8, - "numbering": "2.12", - "children": [ - { - "content": "at all times act in accordance with and subject to any limitations set out in (i) the Permission and (ii) the requirements of this Agreement;", - "spaces": 12, - "numbering": "2.12.1" - }, - { - "content": "comply with the reporting and review requirements set out in Schedule I.", - "spaces": 12, - "numbering": "2.12.2" - }, - { - "content": "be at all times courteous and business like in its contact with the Customers;", - "spaces": 12, - "numbering": "2.12.3" - }, - { - "content": "use its reasonable commercial endeavours to comply with any reasonable and lawful directions, orders and instructions which {P1_Name} may from time to time give to it in accordance with or to give effect to the provisions of this Agreement;", - "spaces": 12, - "numbering": "2.12.4" - }, - { - "content": "identify, procure and keep in force all permits, certificates, licences, approvals, authorisations and consents which may be necessary in connection with the performance of its obligations under this Agreement;", - "spaces": 12, - "numbering": "2.12.5" - }, - { - "content": "in performing its obligations under this Agreement, ensure that it is knowledgeable about and shall continue to be knowledgable about all Regulatory Requirements and that it shall comply with all Regulatory Requirements and (i) maintain evidence of its compliance with Regulatory Requirements, (ii) take all necessary steps required to comply with such Regulatory Requirements promptly upon becoming aware it is not so complying, and (iii) take all necessary steps to remedy any previous breaches of such Regulatory Requirements;", - "spaces": 12, - "numbering": "2.12.6" - }, - { - "content": "where permitted to do so, promptly 
notify {P1_Name} in the event that a regulatory body who regulates {P1_Name} or the {P2_Name} conducts an audit or investigation of the {P2_Name} and disclose to {P1_Name} (subject always to the provisions of confidentiality set out at Clause 7) details of any adverse regulatory findings; and", - "spaces": 12, - "numbering": "2.12.7" - }, - { - "content": "co-operate with {P1_Name} and assist them in their dealings with Regulatory Authorities to the extent reasonably required in relation to this Agreement including implementing such measures as are reasonably necessary and appropriate to effect compliance with Regulatory Requirements.", - "spaces": 12, - "numbering": "2.12.8" - } - ] - }, - { - "content": "{P1_Name} acknowledges and accepts that the {P2_Name} may give advice and assistance and provide services and products beyond the scope of the Debt Management Plan to Customers and that the {P2_Name} will not disclose any Customer Data to {P1_Name} without the Customer’s prior consent (which the {P2_Name} is under no obligation to seek).", - "spaces": 8, - "numbering": "2.13" - }, - { - "content": "Any failure or inability of a Customer to agree to or comply with a Debt Management Plan or any other advice or assistance given by the {P2_Name} pursuant to this Agreement shall not cause the {P2_Name} to be in breach of the terms of this Agreement and shall not prevent the {P2_Name} from providing advice for debt negotiations, counselling and management solutions outside of the Services.", - "spaces": 8, - "numbering": "2.14" - }, - { - "content": "The Parties acknowledge that the {P2_Name} is not acting as an agent of {P1_Name} and that it is not a debt collection agent of {P1_Name}.", - "spaces": 8, - "numbering": "2.15" - } - ] - }, - { - "content": "Rights and Obligations of {P1_Name}", - "spaces": 4, - "numbering": "3.", - "children": [ - { - "content": "During the term of this Agreement {P1_Name} shall provide such information and assistance as is reasonably 
required for the {P2_Name} to perform its obligations under this Agreement.", - "spaces": 8, - "numbering": "2.16" - }, - { - "content": "For the period of six months from termination or expiry of this Agreement {P1_Name} shall not, without the prior written agreement of the {P2_Name}, employ or engage on any basis or offer such employment or engagement to any of the {P2_Name}’s personnel provided that employment or engagement of any member of the {P2_Name}’s personnel pursuant to a bona fide recruitment campaign shall not be a breach of this clause.", - "spaces": 8, - "numbering": "2.17" - }, - { - "content": "{P1_Name} represents and warrants that:", - "spaces": 8, - "numbering": "2.18", - "children": [ - { - "content": "it has the requisite power and authority required by any applicable law or otherwise to enter into this Agreement and to carry out the obligations contemplated by the Agreement reliably and professionally and that the execution and performance of this Agreement has been duly authorised by the required corporate action by {P1_Name};", - "spaces": 12, - "numbering": "2.18.1" - }, - { - "content": "it has and shall maintain during the continuance of this Agreement all necessary rights, licences and consents necessary to provide the Customer Information to the {P2_Name} and to perform its obligations under this Agreement.", - "spaces": 12, - "numbering": "2.18.2" - } - ] - }, - { - "content": "If {P1_Name} notifies the {P2_Name} in writing that amendments are required to be made to this Agreement (including any Schedule hereto) to ensure {P1_Name}’s compliance with its obligations to a Regulatory Authority and/or any Regulatory Requirements (including changes required in order to comply with any rules or guidance (including guidance as to interpretation of such rules) issued or published by or on behalf of such Regulatory Authorities or coming into force from time to time), the {P2_Name} shall be obliged to make such amendments as soon as reasonably 
practicable and in shall use reasonable commercial endeavours to ensure that such changes are made in sufficient time so as to ensure that {P1_Name} is complying with such obligations.", - "spaces": 8, - "numbering": "2.19" - }, - { - "content": "In the event the {P2_Name} is unable to comply with any amendments as notified to it by {P1_Name} pursuant to Clause 3.1 or fails to comply within a reasonable time then {P1_Name} may terminate this Agreement immediately.", - "spaces": 8, - "numbering": "2.20" - }, - { - "content": "{P1_Name} shall comply with the reporting and review requirements set out in Schedule I.", - "spaces": 8, - "numbering": "2.21" - }, - { - "content": "Notwithstanding Clause 2.21 above {P1_Name} shall not make any changes to a Customer’s {P1_Name} account without direct contact with the Customer. For the avoidance of doubt the Permission shall only relate to the provision of information regarding a Customer’s {P1_Name} account.", - "spaces": 8, - "numbering": "2.22" - } - ] - }, - { - "content": "Conditions", - "spaces": 4, - "numbering": "4.", - "children": [ - { - "content": "It is a condition of this Agreement that each party is entitled to enter into this Agreement and to perform its obligations set out herein.", - "spaces": 8, - "numbering": "2.23" - } - ] - }, -``` - +- spatie/pdf-to-text [LICENSE](https://github.com/spatie/pdf-to-text/blob/main/LICENSE.md) +- thiagoalessio/tesseract_ocr [LICENSE](https://github.com/thiagoalessio/tesseract-ocr-for-php/blob/main/MIT-LICENSE) diff --git a/app/Console/Commands/DeployWorker.php b/app/Console/Commands/DeployWorker.php index 44d7af8..0bed20a 100644 --- a/app/Console/Commands/DeployWorker.php +++ b/app/Console/Commands/DeployWorker.php @@ -44,14 +44,15 @@ class DeployWorker extends Command { $workerName = 'queue-worker-'.str_replace(' ', '-', strtolower(env('APP_NAME'))).'-'.str_replace(' ', '-', strtolower(env('APP_ENV'))); $workerFile = $workerName.'.conf'; + try { 
Storage::disk('supervisor')->put($workerFile, '[program:'.$workerName.'] process_name=%(program_name)s_%(process_num)02d'); - Storage::disk('supervisor')->append($workerFile, 'command=php '.base_path().'/artisan queue:work'); + Storage::disk('supervisor')->append($workerFile, 'command=php '.base_path().'/artisan queue:listen --queue=sd_ingest,default --tries=2 --timeout=180'); Storage::disk('supervisor')->append($workerFile, 'autostart=true autorestart=true user=www-data -numprocs=1 +numprocs=3 redirect_stderr=true stdout_logfile=/var/log/queue/'.$workerName.'.log'); } catch (Exception $e) { @@ -59,7 +60,9 @@ stdout_logfile=/var/log/queue/'.$workerName.'.log'); return; } + $this->info('supervisor script installed'); + try { exec('sudo supervisorctl reread'); exec('sudo supervisorctl update'); @@ -70,7 +73,7 @@ stdout_logfile=/var/log/queue/'.$workerName.'.log'); return; } - $this->info('queue worker started'); + $this->info('queue worker started'); } } diff --git a/app/Console/Commands/TestMachine.php b/app/Console/Commands/TestMachine.php deleted file mode 100644 index 47118a6..0000000 --- a/app/Console/Commands/TestMachine.php +++ /dev/null @@ -1,57 +0,0 @@ -output->text($current.'=>'.$next); - $this->output->text( implode( PHP_EOL, $output ) ); - $i++; - } - } -} diff --git a/app/Jobs/IngestDocuments.php b/app/Jobs/IngestDocuments.php index 413e309..0e80904 100644 --- a/app/Jobs/IngestDocuments.php +++ b/app/Jobs/IngestDocuments.php @@ -4,11 +4,6 @@ namespace App\Jobs; use App\Ingest\Convertor; use App\Ingest\DataJsonConvertor; -use App\Ingest\DocxReader; -use App\Parser\ParseXml; -use App\Parser\DocxParser\ParseDocx; -use App\Parser\HtmlParser\ParseHtml; -use App\Parser\ParseHtmlArray; use Illuminate\Bus\Queueable; use Illuminate\Contracts\Queue\ShouldQueue; use Illuminate\Foundation\Bus\Dispatchable; diff --git a/app/Parser/DocxParser/Footer.php b/app/Parser/DocxParser/Footer.php deleted file mode 100644 index fad2d09..0000000 --- 
a/app/Parser/DocxParser/Footer.php +++ /dev/null @@ -1,17 +0,0 @@ - [ - // 'content' => '<'.$heading.(($inlineStyle) ? ' style="'.$inlineStyle.'"' : '').'>'.$element->getText().''.$heading.'>', - // 'type' => 'title', - //], - // 'type' => 'title', - // 'depth' => (int) $element->getDepth()]; - - } -} diff --git a/app/Parser/DocxParser/Footnote.php b/app/Parser/DocxParser/Footnote.php deleted file mode 100644 index f4f9686..0000000 --- a/app/Parser/DocxParser/Footnote.php +++ /dev/null @@ -1,41 +0,0 @@ -getElements($section); - foreach ($sectionElements as $element) { - try { - $handler = $this->getHandler($element); - } catch (Exception $e) { - throw new Exception($e->getMessage()); - } - finally { - $data = $handler->handle($element); - if ($data) { - $result[] = $handler->handle($element); - } - } - - - } - - if (count($result) > 0) { - //dd($result); - return $result; - } - - return; - } -} diff --git a/app/Parser/DocxParser/Header.php b/app/Parser/DocxParser/Header.php deleted file mode 100644 index 0f3ad30..0000000 --- a/app/Parser/DocxParser/Header.php +++ /dev/null @@ -1,11 +0,0 @@ -getText(); - //if (! 
is_string($text)) { - // dd($element); - //} - - return [ - 'content' => $this->buildHtmlLink($element, $text), - 'type' => 'link' - ]; - } - - - private function buildHtmlLink($element, $text) - { - return "".$text.""; - } -} diff --git a/app/Parser/DocxParser/ListItemRun.php b/app/Parser/DocxParser/ListItemRun.php deleted file mode 100644 index c4b90c1..0000000 --- a/app/Parser/DocxParser/ListItemRun.php +++ /dev/null @@ -1,77 +0,0 @@ -getElements($list); - - if (count($listElements)) { - foreach ($listElements as $index => $element) { - - //dd($element->getFontStyle()); - - try { - $handler = $this->getHandler($element); - $data = $handler->handle($element); - - if ($data && isset($data[ 'content' ]) && strlen(trim(strip_tags($data[ 'content' ])))) { - $styleName = $list->getParagraphStyle()->getStyleName(); - - if ($index === 0) { - $result[] = [ - 'content' => $data, - 'type' => 'listItemRun', - 'depth' => (int) $list->getDepth(), - 'styleDepth' => $this->getStyleListDepth($styleName), - 'styleName' => $styleName, - 'index' => $list->getElementIndex(), - 'children' => [] - - ]; - } else { - if (isset($result[ count($result) - 1 ])) { - $result[ count($result) - 1 ][ 'content' ][ 'content' ] .= ' '.$data[ 'content' ]; - } else { - $result[] = [ - 'content' => $data, - 'type' => 'listItemRun', - 'depth' => (int) $list->getDepth(), - 'styleDepth' => $this->getStyleListDepth($styleName), - 'styleName' => $styleName, - 'index' => $list->getElementIndex(), - 'children' => [] - ]; - } - } - } - } catch (Exception $e) { - throw new Exception($e->getMessage()); - } - - - } - if ($result) { - if (count($result) === 1) { - $result = reset($result); - $result[ 'content' ][ 'content' ] = '
'.$result[ 'content' ][ 'content' ].'
'; - } - - } - } - - return $result; - } - -} diff --git a/app/Parser/DocxParser/PageBreak.php b/app/Parser/DocxParser/PageBreak.php deleted file mode 100644 index 0ca4373..0000000 --- a/app/Parser/DocxParser/PageBreak.php +++ /dev/null @@ -1,11 +0,0 @@ -parseLoadedDocx($docxFileLoader); - } catch (\Exception $exception) { - dd($exception); - throw new \Exception($exception->getMessage()); - } - - } - - - private function parseLoadedDocx($docx) - { - $styles = 0; - foreach ($docx->getSections() as $page) { - - $handler = $this->getHandler($page); - $paragraphs = $handler->handle($page); - if ($paragraphs) { - foreach ($paragraphs as $index => $paragraph) { - try { - if ($paragraph && $paragraph[ 'type' ] !== 'textBreak' && (isset($paragraph[ 'content' ][ 'type' ]) && $paragraph[ 'content' ][ 'type' ] !== 'textBreak') || $paragraph[ 'type' ] == 'table') { - $result[] = $paragraph; - if (isset($paragraph[ 'styleName' ])) { - $styles++; - } - } - } catch (\Exception $e) { - dd($e); - } - - } - } - } - $depthTypeType = count($result) / 2 <= $styles ? 'styleDepth' : 'depth'; - - return $this->setTheNumbering($result, null, $depthTypeType); - } - - - private function setTheNumbering($paragraphs, $parentNumbering = null, $depthType = 'depth') - { - $result = []; - $paragraphs = $this->buildTheChildrens($paragraphs, $depthType); - for ($index = 0; $index < count($paragraphs); $index++) { - $paragraph = $paragraphs[ $index ]; - try { - if ($paragraph[ 'type' ] !== 'table' && ($paragraph[ $depthType ] === 0 || $parentNumbering) && strpos($paragraph[ 'styleName' ], - 'BodyText') === false) { - - $paragraph[ 'content' ][ 'numbering' ] = ($parentNumbering) ? $parentNumbering.((int) $index + 1).'.' : $this->currentNumberingIndex.'.'; - $paragraph[ 'content' ][ 'numbering_row' ] = ($parentNumbering) ? 
((int) $index + 1) : $this->currentNumberingIndex; - - if ($paragraph[ 'children' ] && count($paragraph[ 'children' ])) { - $paragraph[ 'children' ] = $this->setTheNumbering($paragraph[ 'children' ], - $paragraph[ 'content' ][ 'numbering' ], $depthType); - - } - - if (! $parentNumbering) { - - $this->currentNumberingIndex++; - } - - - } elseif (isset($paragraph[ 'content' ][ 'numbering' ]) && isset($paragraph[ 'children' ]) && count($paragraph[ 'children' ])) { - $paragraphs[ $index ] = $this->setChildrenNumbering($paragraphs[ $index ]); - } elseif (isset($paragraphs[ $index ][ 'content' ][ 'numbering' ]) && isset(last($result)[ 'content' ][ 'numbering' ]) && $paragraphs[ $index ][ 'content' ][ 'numbering' ] == last($result)[ 'content' ][ 'numbering' ]) { - - - } - } catch (\Exception $e) { - dd($e); - } - $result[] = $paragraphs[ $index ]; - - } - - return $result; - } - - - /** - * @param $parent - * - * @return mixed - */ - private function setChildrenNumbering($parent) - { - - $numbering = 1; - for ($j = 0; $j < count($parent[ 'children' ]); $j++) { - $children = $parent[ 'children' ][ $j ]; - - if ($children[ 'type' ] == 'listItemRun' || isset($children[ 'content' ][ 'numbering' ])) { - $parentNumber = $parent[ 'content' ][ 'numbering' ]; - $parent[ 'children' ][ $j ][ 'content' ][ 'numbering' ] = (substr(trim($parentNumber), - strlen(trim($parentNumber)) - 1) == '.') ? 
$parentNumber.$numbering : $parentNumber.'.'.$numbering; - if (count($parent[ 'children' ][ $j ][ 'children' ])) { - - $parent[ 'children' ][ $j ] = $this->setChildrenNumbering($parent[ 'children' ][ $j ]); - } - - $numbering++; - } - } - - return $parent; - } - - - /** - * @param $paragraphs - * - * @return array - */ - private function buildTheChildrens($paragraphs, $depthType) - { - $alreadyHandledIndexes = []; - $result = []; - - for ($i = 0; $i < count($paragraphs); $i++) { - - if (in_array($i, $alreadyHandledIndexes)) { - continue; - } - $j = $i + 1; - - for ($j; $j < count($paragraphs); $j++) { - - if (in_array($j, $alreadyHandledIndexes)) { - continue; - } - - if (isset($paragraphs[ $j ][ 'content' ][ 'content' ]) && $paragraphs[ $j ][ 'content' ][ 'content' ] === '') { - $alreadyHandledIndexes[] = $j; - $j++; - } - - if (isset($paragraphs[ $i ][ $depthType ]) && isset($paragraphs[ $j ][ $depthType ]) && $paragraphs[ $i ][ $depthType ] !== null && $paragraphs[ $j ][ $depthType ] !== null && $paragraphs[ $i ][ $depthType ] < $paragraphs[ $j ][ $depthType ]) { - - $paragraphs[ $i ] = $this->handlePossibleChild($paragraphs[ $i ], $paragraphs[ $j ], $i, - $depthType); - - - } elseif (isset($paragraphs[ $j ][ 'styleName' ]) && $paragraphs[ $j ][ 'styleName' ] === 'ListParagraph' && $paragraphs[ $i ][ $depthType ] === null && substr(strip_tags($paragraphs[ $i ][ 'content' ][ 'content' ]), - -1) === ':') { - $paragraphs[ $i ] = $this->handlePossibleChild($paragraphs[ $i ], $paragraphs[ $j ], $i, - $depthType); - - } elseif (isset($paragraphs[ $j + 1 ]) && isset($paragraphs[ $j + 1 ][ 'content' ][ 'content' ]) && isset($paragraphs[ $j ]) && isset($paragraphs[ $j ][ 'content' ][ 'content' ]) && substr(strip_tags($paragraphs[ $j ][ 'content' ][ 'content' ]), - -1) === ':' && (isset($paragraphs[ $j + 1 ]) && ctype_lower(substr(trim(strip_tags($paragraphs[ $j + 1 ][ 'content' ][ 'content' ])), - 0, - 1)) || (isset($paragraphs[ $j + 1 ]) && 
substr(trim(strip_tags($paragraphs[ $j + 1 ][ 'content' ][ 'content' ])), - strlen(trim(strip_tags($paragraphs[ $j + 1 ][ 'content' ][ 'content' ]))) - 1) == ';'))) { - $k = $j + 1; - $alreadyHandledIndexes[] = $k; - while (isset($paragraphs[ $k ]) && substr(str_replace('and', '', - trim(strip_tags(str_replace('and', '', $paragraphs[ $k ][ 'content' ][ 'content' ])))), - strlen(str_replace('and', '', trim(strip_tags(str_replace('and', '', - $paragraphs[ $k ][ 'content' ][ 'content' ]))))) - 1) == ';') { - $paragraphs[ $j ][ 'children' ][] = $paragraphs[ $k ]; - $alreadyHandledIndexes[] = $k++; - - } - - $paragraphs[ $i ] = $this->handlePossibleChild($paragraphs[ $i ], $paragraphs[ $j ], $i, - $depthType); - - - } elseif (isset($paragraphs[ $i ][ 'styleName' ]) && $paragraphs[ $i ][ $depthType ] !== $paragraphs[ $j ][ $depthType ] && strpos($paragraphs[ $i ][ 'styleName' ], - 'Heading2') !== false && ((isset($paragraphs[ $j ][ 'depth' ]) || ($paragraphs[ $j ][ 'type' ] == 'textRun' && isset($paragraphs[ $j ][ 'content' ][ 'numbering' ])) && is_null($paragraphs[ $j ][ 'styleName' ])))) { - - $paragraphs[ $i ] = $this->handlePossibleChild($paragraphs[ $i ], $paragraphs[ $j ], $i, - $depthType); - - - } else { - - break; - } - - $alreadyHandledIndexes[] = $j; - - } - $result[] = $paragraphs[ $i ]; - $alreadyHandledIndexes[] = $i; - - } - - return $result; - } - - - /** - * @param $parent - * @param $child - * @param $i - * - * @return mixed - */ - private function handlePossibleChild($parent, $child, $i, $depthType) - { - - // Must iterate through parent children - if (isset($parent[ 'children' ]) && count($parent[ 'children' ]) === 0) { - if ($parent[ $depthType ] < $child[ $depthType ] || $parent[ $depthType ] === null) { - $parent[ 'children' ][] = $child; - } elseif (strpos($parent[ 'styleName' ], - 'Heading') !== false && isset($child[ 'content' ][ 'numbering' ]) && substr_count($child[ 'content' ][ 'numbering' ], - '.') == 1) { - $parent[ 'children' ][] = $child; 
- } else { - return $parent; - } - - return $parent; - } - - $lastParentChild = last($parent[ 'children' ]); - // Possible to be either child or grandchild - if ($lastParentChild[ $depthType ] && $child[ $depthType ] > $lastParentChild[ $depthType ]) { - - $lastParentChild = $this->handlePossibleChild($lastParentChild, $child, $i, $depthType); - - } else { - - if ($child[ $depthType ] === $lastParentChild[ $depthType ]) { - $parent[ 'children' ][] = $child; - - return $parent; - } - - if (((isset($lastParentChild[ 'styleDepth' ]) && $lastParentChild[ 'styleDepth' ] === $child[ 'depth' ])) && $lastParentChild[ 'index' ] !== $child[ 'index' ]) { - - $parent[ 'children' ][] = $child; - - return $parent; - } - } - - $parent[ 'children' ][ count($parent[ 'children' ]) - 1 ] = $lastParentChild; - - return $parent; - - } - -} diff --git a/app/Parser/DocxParser/PreserveText.php b/app/Parser/DocxParser/PreserveText.php deleted file mode 100644 index e5e4778..0000000 --- a/app/Parser/DocxParser/PreserveText.php +++ /dev/null @@ -1,32 +0,0 @@ -getText(); - if (is_array($text)) { - $text = implode(' ', $text); - } - - return [ - 'content' => [ - 'content' => preg_replace("/\{[^)]+\}/", '{REF_NUMBER}', $text, 1), - 'type' => 'text' - ], - 'type' => 'preserveText', - 'index' => $element->getElementIndex(), - 'children' => [], - 'styleName' => 'Level2Number', - 'styleDepth' => 1, - 'depth' => 0 - ]; - } -} diff --git a/app/Parser/DocxParser/Section.php b/app/Parser/DocxParser/Section.php deleted file mode 100644 index 84cbdd8..0000000 --- a/app/Parser/DocxParser/Section.php +++ /dev/null @@ -1,41 +0,0 @@ -getElements($section); - foreach ($sectionElements as $element) { - - try { - $handler = $this->getHandler($element); - } catch (Exception $e) { - throw new Exception($e->getMessage()); - } - $data = $handler->handle($element); - if($data){ - $result[] = $handler->handle($element); - } - - } - } - - - if (count($result) > 0) { - return $result; - } - - return; - } -} diff --git 
a/app/Parser/DocxParser/Table.php b/app/Parser/DocxParser/Table.php deleted file mode 100644 index 28150b2..0000000 --- a/app/Parser/DocxParser/Table.php +++ /dev/null @@ -1,35 +0,0 @@ -getRows() as $row) { - $handlerName = "\App\Parser\DocxParser\\".substr(strrchr(__CLASS__, "\\"), - 1).'\\'.$this->getReflectionClass($row); - $handler = new $handlerName; - $data = $handler->handle($row); - if ($data) { - $result [] = $handler->handle($row); - } - } - - //dd($table->getNestedLevel(),get_class_methods($table)); - // - return [ - 'content' => '', - 'children' => $result, - 'styleDepth' => $table->getNestedLevel() + 1, - 'depth' => $table->getNestedLevel() + 1, - 'type' => 'table', - ]; - } -} diff --git a/app/Parser/DocxParser/Table/Cell.php b/app/Parser/DocxParser/Table/Cell.php deleted file mode 100644 index 10dd132..0000000 --- a/app/Parser/DocxParser/Table/Cell.php +++ /dev/null @@ -1,41 +0,0 @@ -getElements($cell); - foreach ($this->getElements($cell) as $index => $element) { - if (! 
$element instanceof TextBreak) { - try { - $handler = $this->getHandler($element); - } catch (Exception $e) { - throw new Exception($e->getMessage()); - } - $data = $handler->handle($element); - $data['width']= $cell->getWidth(); - $result[] = $data; - } - - } - - return [ - 'content' => '', - 'children' => $result, - 'depth' => null, - 'type' => 'cell', - ]; - - } -} diff --git a/app/Parser/DocxParser/Table/Row.php b/app/Parser/DocxParser/Table/Row.php deleted file mode 100644 index a035c01..0000000 --- a/app/Parser/DocxParser/Table/Row.php +++ /dev/null @@ -1,41 +0,0 @@ -getCells(); - $result = []; - foreach ($rows as $index => $cell) { - $handler = new Cell(); - $result[] = $handler->handle($cell); - - - } - - return [ - 'content' => '', - 'children' => $result, - 'depth' => null, - 'height' => $row->getHeight(), - 'isTblHeader' => $row->getStyle()->isTblHeader(), - 'index' => $row->getElementIndex(), - 'type' => 'row', - ]; - - - } -} diff --git a/app/Parser/DocxParser/Text.php b/app/Parser/DocxParser/Text.php deleted file mode 100644 index 83b3595..0000000 --- a/app/Parser/DocxParser/Text.php +++ /dev/null @@ -1,147 +0,0 @@ -getElementData($textElement); - $data[ 'type' ] = 'text'; - - return $data; - } - - - /** - * @param $textElement - * - * @return array - */ - private function getElementData($textElement) - { - $text = $textElement->getText(); - //if (strpos($text, 'PPOINTMENT AND GRANT OF LICENSE') !== false) { - // dd($textElement->getParent()->getDepth()); - //} - $textData = $this->getNumberingFromText($text); - - if (strlen($textData[ 'content' ])) { - $textData[ 'content' ] = $this->styleTheText($textData[ 'content' ], $textElement); - } - - - - return $textData; - - } - - - /** - * @param $text - * - * @return array - */ - private function getNumberingFromText($text) - { - - $data = []; - preg_match('/^([0-9.])([^(A-Z)(a-z) ]*)/', trim($text), $match); - - if ($match && isset($match[ 0 ]) && $match[ 0 ] !== '.') { - $data[ 'content' ] = 
trim(str_replace($match[ 0 ], '', $text)); - $data[ 'numbering' ] = $match[ 0 ]; - } else { - $data[ 'content' ] = trim(preg_replace('/\t+/', '', $text)); - } - - return $data; - } - - - private function styleTheText($textString, $textObject) - { - - $textStyle = [ - 'font' => $textObject->getFontStyle(), - 'paragraph' => $textObject->getParagraphStyle() - ]; - - $fontStyle = $textStyle[ 'font' ]->getStyleValues(); - $inlineStyle = $this->getInlineStyles(array_merge($fontStyle[ 'style' ], $fontStyle[ 'basic' ])); - - return ''.$this->getStyledText($textString, - $fontStyle[ 'style' ]).''; - } - - - /** - * @param $styles - * - * @return string - */ - private function getInlineStyles($styles) - { - $styleString = ''; - $acceptedInline = [ - "dStrike" => 'text-decoration: line-through;text-decoration-style: double;', - "smallCaps" => 'text-transform: lowercase;', - "allCaps" => 'text-transform: capitalize;', - "fgColor" => 'background-color:'.$styles[ 'fgColor' ].';', - "hidden" => 'display:none;', - "size" => 'font-size:'.$styles[ 'size' ].'pt;', - "color" => 'color:#'.$styles[ 'color' ].';' - ]; - - foreach ($styles as $style => $value) { - if (array_key_exists($style, $acceptedInline) && $value && ! 
in_array($value, ['none', 'auto'])) { - $styleString .= $acceptedInline[ $style ]; - } - } - - return $styleString; - } - - - /** - * @param $text - * @param $styles - * - * @return string - */ - private function getStyledText($text, $styles) - { - $mappedStyle = [ - 'bold' => 'strong', - 'italic' => 'i', - 'underline' => 'u', - 'strike' => 'strike', - "super" => 'sup', - "sub" => 'sub', - ]; - foreach ($styles as $style => $active) { - if (array_key_exists($style, $mappedStyle) && $active && $active !== 'none') { - $text = $this->appendHtmlStyle($text, $mappedStyle[ $style ]); - } - } - - return $text; - } - - - /** - * @param $text - * @param $styleType - * - * @return string - */ - private function appendHtmlStyle($text, $styleType) - { - return "<$styleType>$text$styleType>"; - } -} diff --git a/app/Parser/DocxParser/TextBreak.php b/app/Parser/DocxParser/TextBreak.php deleted file mode 100644 index b7bdd8a..0000000 --- a/app/Parser/DocxParser/TextBreak.php +++ /dev/null @@ -1,17 +0,0 @@ - ''.$result[ 'content' ][ 'content' ].'
'; - } - - } - } - - return $result; - - } -} diff --git a/app/Parser/DocxParser/Title.php b/app/Parser/DocxParser/Title.php deleted file mode 100644 index 05143be..0000000 --- a/app/Parser/DocxParser/Title.php +++ /dev/null @@ -1,72 +0,0 @@ -getText(); - if (! is_string($title)) { - $handler = $this->getHandler($title); - - return $handler->handle($title); - } - - //dd($element->getText(),get_class_methods($element),$element->getDepth()); - - $style = $this->getTitleStyle($element); - $headings = [ - 'Title' => 'h1', - 'Subtitle' => 'h2', - 'Heading1' => 'h1', - 'Heading2' => 'h2', - 'Heading3' => 'h3', - 'Heading4' => 'h4', - 'Heading5' => 'h5', - ]; - $fontStyle = $style[ 'font' ]->getStyleValues(); - $inlineStyle = $this->getInlineStyles(array_merge($fontStyle[ 'style' ], $fontStyle[ 'basic' ])); - $heading = array_key_exists($style[ 'heading' ], $headings) ? $headings[ $style[ 'heading' ] ] : 'h5'; - - return [ - 'content' => [ - 'content' => '<'.$heading.(($inlineStyle) ? ' style="'.$inlineStyle.'"' : '').'>'.$element->getText().''.$heading.'>', - 'type' => 'title', - ], - 'type' => 'title', - 'depth' => null, - 'styleDepth' => $this->getStyleListDepth($element->getStyle()), - 'styleName' => $element->getStyle(), - 'index' => $element->getElementIndex(), - 'children' => [] - ]; - - } - - - private function getTitleStyle($element) - { - if (strpos($element->getStyle(), 'Heading') !== false) { - $font = Style::getStyle(str_replace('Heading', 'Heading_', $element->getStyle())); - } else { - $font = Style::getStyle($element->getStyle()); - } - - return [ - 'font' => $font, - 'heading' => $element->getStyle() - ]; - } -} diff --git a/app/Parser/DocxParser/Traits/Helper.php b/app/Parser/DocxParser/Traits/Helper.php deleted file mode 100644 index 070cc61..0000000 --- a/app/Parser/DocxParser/Traits/Helper.php +++ /dev/null @@ -1,117 +0,0 @@ -getReflectionClass($element); - } catch (\Exception $exception) { - throw new \Exception($exception->getMessage()); - } - 
$handleClass = 'App\Parser\DocxParser\\'.$reflectClass; - if (class_exists($handleClass)) { - return new $handleClass; - } else { - throw new \Exception("Handler class $handleClass dose not exists!"); - } - } - - - /** - * @param $element - * - * @return string - */ - public function getReflectionClass($element) - { - try { - $reflectClass = new ReflectionClass($element); - } catch (\ReflectionException $e) { - throwException($e); - } - - return $reflectClass->getShortName(); - } - - - /** - * Get the child elements of an element - * - * @param $element - * - * @return mixed - */ - public function getElements($element) - { - return $element->getElements(); - } - - - /** - * Check if an element has childrens - * - * @param $element - * - * @return bool - */ - public function hasElements($element) - { - return (bool) count($this->getElements($element)); - } - - - /** - * @param $styles - * - * @return string - */ - private function getInlineStyles($styles) - { - $styleString = ''; - $acceptedInline = [ - "dStrike" => 'text-decoration: line-through;text-decoration-style: double;', - "smallCaps" => 'text-transform: lowercase;', - "allCaps" => 'text-transform: capitalize;', - "fgColor" => 'background-color:'.$styles[ 'fgColor' ].';', - "hidden" => 'display:none;', - "size" => 'font-size:'.$styles[ 'size' ].'pt;', - "color" => 'color:#'.$styles[ 'color' ].';' - ]; - - foreach ($styles as $style => $value) { - if (array_key_exists($style, $acceptedInline) && $value && ! 
in_array($value, ['none', 'auto'])) { - $styleString .= $acceptedInline[ $style ]; - } - } - - return $styleString; - } - - - public function getStyleListDepth($styleName) - { - - $getNumberFromStyleName = filter_var($styleName, FILTER_SANITIZE_NUMBER_FLOAT, FILTER_FLAG_ALLOW_FRACTION); - if (is_numeric($getNumberFromStyleName) && strpos(strtolower($styleName), 'definition') === false) { - $depth = (int) $getNumberFromStyleName - 1; - - } else { - $depth = null; - - } - - return $depth; - } -} diff --git a/app/Parser/HtmlParser/ParseHtml.php b/app/Parser/HtmlParser/ParseHtml.php deleted file mode 100644 index 3b599be..0000000 --- a/app/Parser/HtmlParser/ParseHtml.php +++ /dev/null @@ -1,527 +0,0 @@ -loadHTML($htmlString); - $htmlDom->preserveWhiteSpace = false; - - return $this->parseLoadedHtml($htmlDom); - } catch (\Exception $exception) { - dd($exception); - } - } - - - private function parseLoadedHtml($htmlDom) - { - $response = []; - $page = $htmlDom->getElementsByTagName("body")[ 0 ]; - $dataStructuredArray = $this->buildTheParsedResponse($this->domToArray($page)); - foreach ($dataStructuredArray as $index => $item) { - if (isset($item[ '_type' ]) && $item[ '_type' ] !== 'table') { - $data = $this->handleChildrens($item); - if (isset($data[ 'content' ])) { - - $data[ 'content' ] = $this->closetags($data[ 'content' ]); - $data[ 'clean_content' ] = preg_replace("/(\r\n|\t|\r|\n)+/", " ", strip_tags($data[ 'content' ])); - $response[] = $data; - } - } - - } - - return $this->fixChildrenStructure($response); - } - - - private function domToArray($root) - { - $result = []; - - //handle classic node - if ($root->nodeType == XML_ELEMENT_NODE) { - $result[ '_type' ] = $root->nodeName; - if ($root->nodeName === 'ol') { - if ($root->hasAttribute('start')) { - $result[ '_startFrom' ] = $root->getAttribute('start'); - } else { - $result[ '_startFrom' ] = 1; - } - } - $result[ '_numberOfChildren' ] = $root->childNodes->length; - if ($root->hasChildNodes()) { - $children = 
$root->childNodes; - for ($i = 0; $i < $children->length; $i++) { - $child = $this->domToArray($children->item($i)); - - //don't keep textnode with only spaces and newline - if (! empty($child)) { - $result[ '_children' ][] = $child; - } - } - } - - //handle text node - } elseif ($root->nodeType == XML_TEXT_NODE || $root->nodeType == XML_CDATA_SECTION_NODE) { - $value = $root->nodeValue; - if (! empty($value)) { - $cleanText = preg_replace("/(\r\n|\t|\r|\n)+/", " ", $value); - if (! empty(str_replace(' ', '', $cleanText))) { - $result[ '_type' ] = '_text'; - $result[ '_content' ] = ltrim($cleanText); - } - - } - } - - //list attributes - if ($root->hasAttributes()) { - foreach ($root->attributes as $attribute) { - $result[ '_attributes' ][ $attribute->name ] = $attribute->value; - } - } - - return $result; - } - - - private function buildTheParsedResponse(array $htmElementsAsArray): array - { - $parsedResponse = []; - foreach ($htmElementsAsArray[ '_children' ] as $index => $elementArray) { - $data = []; - if ($elementArray[ '_type' ] === '_text') { - $data[ '_type' ] = $elementArray[ '_type' ]; - $data[ 'content' ] = $this->parseParagraph($elementArray); - } elseif (isset($elementArray[ '_children' ])) { - - - $parsedResponseData = $this->buildTheParsedResponse($elementArray); - if (! empty($parsedResponseData)) { - $data[ '_type' ] = $elementArray[ '_type' ]; - if (in_array($elementArray[ '_type' ], ['ul', 'ol'])) { - if (isset($elementArray[ '_startFrom' ])) { - $data[ 'start' ] = $elementArray[ '_startFrom' ]; - } - $data [ 'children' ] = $parsedResponseData; - } else { - - $data [ 'content' ] = $parsedResponseData; - } - - } - - } - if (! empty($data)) { - if (isset($elementArray[ '_attributes' ])) { - $data[ '_attributes' ] = $elementArray[ '_attributes' ]; - } - $parsedResponse[] = $data; - } - } - - return $parsedResponse; - } - - - private function remove_empty_tags_recursive($str, $repto = null) - { - //** Return if string not given or empty. - if (! 
is_string($str) || trim($str) == '') { - return $str; - } - - //** Recursive empty HTML tags. - return preg_replace( - - //** Pattern written by Junaid Atari. - '/<([^<\/>]*)>([\s]*?|(?R))<\/\1>/imsU', - - //** Replace with nothing if string empty. - ! is_string($repto) ? '' : $repto, - - //** Source string - $str); - } - - - private function closetags($text) - { - $tagstack = []; - $stacksize = 0; - $tagqueue = ''; - $newtext = ''; - // Known single-entity/self-closing tags. - $single_tags = [ - 'area', - 'base', - 'basefont', - 'br', - 'col', - 'command', - 'embed', - 'frame', - 'hr', - 'img', - 'input', - 'isindex', - 'link', - 'meta', - 'param', - 'source' - ]; - // Tags that can be immediately nested within themselves. - $nestable_tags = ['blockquote', 'div', 'object', 'q', 'span']; - - // WP bug fix for comments - in case you REALLY meant to type '< !--'. - $text = str_replace('< !--', '< !--', $text); - // WP bug fix for LOVE <3 (and other situations with '<' before a number). - $text = preg_replace('#<([0-9]{1})#', '<$1', $text); - - /** - * Matches supported tags. - * - * To get the pattern as a string without the comments paste into a PHP - * REPL like `php -a`. - * - * @see https://html.spec.whatwg.org/#elements-2 - * @see https://w3c.github.io/webcomponents/spec/custom/#valid-custom-element-name - * - * @example - * ~# php -a - * php > $s = [paste copied contents of expression below including parentheses]; - * php > echo $s; - */ - $tag_pattern = ('#<'. // Start with an opening bracket. - '(/?)'. // Group 1 - If it's a closing tag it'll have a leading slash. - '('. // Group 2 - Tag name. - // Custom element tags have more lenient rules than HTML tag names. - '(?:[a-z](?:[a-z0-9._]*)-(?:[a-z0-9._-]+)+)'.'|'.// Traditional tag rules approximate HTML tag names. - '(?:[\w:]+)'.')'.'(?:'.// We either immediately close the tag with its '>' and have nothing here. - '\s*'.'(/?)'. // Group 3 - "attributes" for empty tag. 
- '|'.// Or we must start with space characters to separate the tag name from the attributes (or whitespace). - '(\s+)'. // Group 4 - Pre-attribute whitespace. - '([^>]*)'. // Group 5 - Attributes. - ')'.'>#' // End with a closing bracket. - ); - - while (preg_match($tag_pattern, $text, $regex)) { - $full_match = $regex[ 0 ]; - $has_leading_slash = ! empty($regex[ 1 ]); - $tag_name = $regex[ 2 ]; - $tag = strtolower($tag_name); - $is_single_tag = in_array($tag, $single_tags, true); - $pre_attribute_ws = isset($regex[ 4 ]) ? $regex[ 4 ] : ''; - $attributes = trim(isset($regex[ 5 ]) ? $regex[ 5 ] : $regex[ 3 ]); - $has_self_closer = '/' === substr($attributes, -1); - - $newtext .= $tagqueue; - - $i = strpos($text, $full_match); - $l = strlen($full_match); - - // Clear the shifter. - $tagqueue = ''; - if ($has_leading_slash) { // End tag. - // If too many closing tags. - if ($stacksize <= 0) { - $tag = ''; - // Or close to be safe $tag = '/' . $tag. - - // If stacktop value = tag close value, then pop. - } elseif ($tagstack[ $stacksize - 1 ] === $tag) { // Found closing tag. - $tag = ''.$tag.'>'; // Close tag. - array_pop($tagstack); - $stacksize--; - } else { // Closing tag not at top, search for it. - for ($j = $stacksize - 1; $j >= 0; $j--) { - if ($tagstack[ $j ] === $tag) { - // Add tag to tagqueue. - for ($k = $stacksize - 1; $k >= $j; $k--) { - $tagqueue .= ''.array_pop($tagstack).'>'; - $stacksize--; - } - break; - } - } - $tag = ''; - } - } else { // Begin tag. - if ($has_self_closer) { // If it presents itself as a self-closing tag... - // ...but it isn't a known single-entity self-closing tag, then don't let it be treated as such - // and immediately close it with a closing tag (the tag will encapsulate no text as a result). - if (! $is_single_tag) { - $attributes = trim(substr($attributes, 0, -1)).">$tag"; - } - } elseif ($is_single_tag) { // Else if it's a known single-entity tag but it doesn't close itself, do so. 
- $pre_attribute_ws = ' '; - $attributes .= '/'; - } else { // It's not a single-entity tag. - // If the top of the stack is the same as the tag we want to push, close previous tag. - if ($stacksize > 0 && ! in_array($tag, $nestable_tags, - true) && $tagstack[ $stacksize - 1 ] === $tag) { - $tagqueue = ''.array_pop($tagstack).'>'; - $stacksize--; - } - $stacksize = array_push($tagstack, $tag); - } - - // Attributes. - if ($has_self_closer && $is_single_tag) { - // We need some space - avoid