Repository for the search and displace ingest module, which takes ODF, DOCX and PDF files and transforms them into Markdown (.md) for use with search and displace operations.
You cannot select more than 25 topics.
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
|
|
import pdftotext,json,sys, getopt
def main(argv):
    """Extract the text of a PDF file and write it to a UTF-8 text file.

    Args:
        argv: Command-line arguments without the program name (for example
            ``sys.argv[1:]``). Recognised options are ``-h`` (print usage),
            ``-i``/``--ifile`` (input PDF path) and ``-o``/``--ofile``
            (output text path).

    Raises:
        SystemExit: With status 2 when the options cannot be parsed or when
            the input or output path is missing; with the default (success)
            status after ``-h`` prints the usage line.
    """
    usage = 'parse-pdf.py -i <inputfile> -o <outputfile>'
    inputfile = ''
    outputfile = ''

    try:
        opts, _positional = getopt.getopt(argv, "hi:o:", ["ifile=", "ofile="])
    except getopt.GetoptError:
        print(usage)
        sys.exit(2)

    for opt, arg in opts:
        if opt == '-h':
            print(usage)
            sys.exit()
        elif opt in ("-i", "--ifile"):
            inputfile = arg
        elif opt in ("-o", "--ofile"):
            outputfile = arg

    # Both paths are required; without this guard open('') fails with an
    # unhelpful FileNotFoundError instead of telling the user what to pass.
    if not inputfile or not outputfile:
        print(usage)
        sys.exit(2)

    # Load your PDF
    with open(inputfile, "rb") as pdf_file:
        pdf = pdftotext.PDF(pdf_file)

    # The context manager closes the output file even if the write fails.
    # Items of the PDF object (page texts, per the pdftotext docs) are
    # separated by a blank line.
    with open(outputfile, 'w', encoding='UTF-8') as out_file:
        out_file.write("\n\n".join(pdf))
# Script entry point: hand the command-line arguments (without the program
# name) to main() only when this file is executed directly.
if __name__ == "__main__":
    main(sys.argv[1:])
|