Skip to content

Commit

Permalink
add fonctions
Browse files Browse the repository at this point in the history
  • Loading branch information
Hermann-web authored Aug 27, 2021
1 parent 076aac7 commit 5eaf5e8
Showing 1 changed file with 97 additions and 23 deletions.
120 changes: 97 additions & 23 deletions backend.py
Original file line number Diff line number Diff line change
@@ -1,31 +1,105 @@
import os
import re
from urllib.parse import unquote

import img2pdf #pip install img2pdf

from pdf_to_txt import pdf_to_txt
from pdf_to_img import pdf_to_img #import a function to convert into image
from img_to_txt import img_to_txt
#------------------------------------------------------------------------------------

def get_dir(path):
    """Build the network folder path encoded inside a SharePoint-style URL.

    The folder is taken from the part of ``path`` that follows the first
    percent-encoded slash (``%2f``): every ``%2f``-separated piece becomes
    one path segment, and the last segment is cut at the first ``&`` so the
    following query parameter is dropped.

    Args:
        path: URL string containing a ``%2f``-encoded folder path.

    Returns:
        The folder path prefixed with ``//inshare.collab.group.safran@SSL/``.

    Raises:
        ValueError: if ``path`` contains no ``%2f``-encoded segment.
    """
    # Percent-escapes are case-insensitive, so accept "%2f" as well as "%2F".
    segments = re.split('%2f', path, flags=re.IGNORECASE)[1:]
    if not segments:
        raise ValueError(
            f"no '%2f'-encoded folder path found in URL: {path!r}"
        )

    # Trim the trailing query parameters *before* decoding, so that an
    # encoded ampersand inside a folder name is not mistaken for a separator.
    segments[-1] = segments[-1].split('&')[0]

    url_part1 = "//inshare.collab.group.safran@SSL/"
    # unquote() decodes "%20" like the previous code did, plus any other
    # percent-escape (accented characters, etc.).
    url_part2 = "/".join(unquote(segment) for segment in segments)
    return url_part1 + url_part2
def pdf_to_txt(pdf_abspath):
    """Extract text from a PDF by converting it to images first.

    ``pdf_abspath`` is handed to ``pdf_to_img``; whatever that returns is
    passed on to ``get_string``, whose result is returned unchanged.
    """
    return get_string(pdf_to_img(pdf_abspath))



def get_string(list_img_path):
    """OCR a sequence of page images and group the pages into invoices.

    Each page is converted to text with ``img_to_txt`` and scanned for a
    vendor with ``get_vendor_name``. A page on which a vendor is found that
    differs from the current one starts a new invoice; every other page is
    appended to the invoice in progress. When an invoice is complete, its
    page images are merged into one PDF through ``merge_and_store``, named
    after the vendor and invoice number detected on its first page.

    Pages that come before any vendor has been recognised are kept as a
    group of their own (empty vendor and invoice number) instead of raising.

    NOTE(review): two consecutive invoices from the same vendor are still
    treated as a single invoice, because a vendor change is the only split
    signal — confirm whether the invoice number should also be compared.

    Args:
        list_img_path: sliceable sequence of image paths, one per page.

    Returns:
        A list with one text string per detected invoice.
    """
    invoice_texts = []        # one concatenated text per invoice
    group_start = 0           # index of the first page of the current invoice
    current_vendor = ''       # vendor of the invoice in progress
    current_invoice_num = ''  # invoice number of the invoice in progress

    print('--getting images from path--',end='\n ')
    for i, img_path in enumerate(list_img_path):
        print(f'page{i}',end = ' ')

        page_text = img_to_txt(img_path)
        vendor = get_vendor_name(page_text)

        if vendor and vendor != current_vendor:
            # A new invoice begins here: store the pages collected so far
            # under the identity of the invoice they belong to. The slice is
            # empty only when this is the very first page.
            if i > group_start:
                merge_and_store(
                    list_img_path[group_start:i],
                    current_vendor,
                    current_invoice_num,
                )
            group_start = i
            current_vendor = vendor
            current_invoice_num = get_num_facture(page_text, current_vendor)
            invoice_texts.append(page_text)
        elif invoice_texts:
            # Continuation page of the invoice in progress.
            invoice_texts[-1] += page_text
        else:
            # First page and no vendor recognised: nothing to append to yet.
            invoice_texts.append(page_text)

    # The last invoice is not followed by a vendor change; flush it here.
    remaining_pages = list_img_path[group_start:]
    if remaining_pages:
        merge_and_store(remaining_pages, current_vendor, current_invoice_num)

    print('\n')
    return invoice_texts

def merge_and_store(list_img_to_merge,fournisseur,numero_de_facture):
    """Merge images into a single PDF written to the working directory.

    Args:
        list_img_to_merge: images handed to ``img2pdf.convert``.
        fournisseur: vendor identifier, used as the first part of the name.
        numero_de_facture: invoice number, used as the second part.

    Returns:
        The path of the written file,
        ``'<fournisseur>&&<numero_de_facture>.pdf'``.
    """
    output_path = f'{fournisseur}&&{numero_de_facture}.pdf'
    # Convert before opening the destination: if the conversion raises, no
    # empty file is created and an existing file is not truncated.
    pdf_bytes = img2pdf.convert(list_img_to_merge)
    with open(output_path, "wb") as pdf_file:
        pdf_file.write(pdf_bytes)
    return output_path

# Keys used inside each vendor entry of JSON.
TEMPLATE_NUM_FACTURE_STR = 'Template'
NOMS_POSSIBLES = 'PossibleNames'

# Symbolic character classes that may appear in an invoice-number template.
DIGIT_STR = 'Number'
CHAR_STR = 'Character'  # e.g. f, g, h
CHAR_UPPER_STR = 'UpperCharacter'
CHAR_LOWER_STR = 'LowerCharacter'

# Regular-expression fragment for each symbolic character class.
DICT_REGEX = {
    DIGIT_STR: r'\d',
    CHAR_STR: '[a-zA-Z]',
    CHAR_UPPER_STR: '[A-Z]',
    CHAR_LOWER_STR: '[a-z]',
}

# Per-vendor configuration: the invoice-number template (a list of
# (expression, count) pairs) and the names under which the vendor may
# appear in a page.
JSON = {
    'SEFI': {
        TEMPLATE_NUM_FACTURE_STR: [(DIGIT_STR, 6)],
        NOMS_POSSIBLES: ['SEFI', 'SEF'],
    },
}


def get_num_facture(textePage,fournisseur):
    """Search a page text for the invoice number of a given vendor.

    The text is split on whitespace and each token is checked against the
    vendor's template with ``ismatch``.

    Args:
        textePage: text of the page.
        fournisseur: key of the vendor in ``JSON``.

    Returns:
        The first token matching the template, or ``False`` if none does.

    Raises:
        KeyError: if ``fournisseur`` is not configured in ``JSON``.
    """
    template = JSON[fournisseur][TEMPLATE_NUM_FACTURE_STR]  # e.g. [(DIGIT_STR, 5)]
    for sequence in textePage.split():
        if ismatch(sequence, template):
            return sequence
    return False


def ismatch(sequence,template):
    """Tell whether ``sequence`` matches ``template`` in its entirety.

    Args:
        sequence: the token to check.
        template: a list of ``(expression, count)`` pairs; a single bare
            pair is accepted too. ``expression`` is either a key of
            ``DICT_REGEX`` or a regular-expression fragment used as is.
            ``count`` is the exact number of repetitions.

    Returns:
        The ``re.Match`` object when the whole token matches, else ``None``.
    """
    # Accept a single (expression, count) pair in place of a list of pairs.
    if (len(template) == 2 and isinstance(template[0], str)
            and isinstance(template[1], int)):
        template = [template]

    parts = []
    for step in template:
        expression, occurence = step[0], step[1]
        fragment = DICT_REGEX.get(expression)
        if fragment is None:
            # Group the raw fragment so the quantifier applies to all of it.
            fragment = f'(?:{expression})'
        parts.append(f'{fragment}{{{occurence}}}')

    # fullmatch: a token longer than the template must not be accepted.
    return re.fullmatch(''.join(parts), sequence)


def get_vendor_name(result):
    """Return the vendor whose name appears in a page text.

    Every vendor configured in ``JSON`` is tried in turn; for each one, its
    possible names (the ``NOMS_POSSIBLES`` entry) are looked up in the text
    with ``foundInpage``.

    Args:
        result: text of the page.

    Returns:
        The ``JSON`` key of the first vendor found, or ``False`` if none is.
    """
    for vendor, config in JSON.items():
        for vendor_name in config[NOMS_POSSIBLES]:
            if foundInpage(vendor_name, result):
                return vendor
    return False


def foundInpage(vendor,textpage):
    """Check whether a vendor name occurs in a page text.

    Both strings are upper-cased before the substring test, so the
    comparison ignores case.
    """
    needle = vendor.upper()
    haystack = textpage.upper()
    return needle in haystack


def get_files_from_folder(folder_url):
    """List the entries of the folder designated by a URL.

    The URL is translated to a folder path with ``get_dir``; each name
    returned by ``os.listdir`` is joined to that folder path.

    Returns:
        A list of joined paths, one per directory entry.
    """
    folder = get_dir(folder_url)
    return [os.path.join(folder, entry) for entry in os.listdir(folder)]

def process(liste_folders):
    """Run ``pdf_to_txt`` on every file of every folder URL given.

    Args:
        liste_folders: iterable of folder URLs accepted by
            ``get_files_from_folder``.

    Returns:
        None. The texts returned by ``pdf_to_txt`` are currently discarded.
    """
    for folder_url in liste_folders:
        for pdf_file_path in get_files_from_folder(folder_url):
            pdf_to_txt(pdf_file_path)


def test():
    """Manual check: print the content of the folder behind a sample URL."""
    import os

    URL = "https://inshare.collab.group.safran/bao/CSPFin/_layouts/15/start.aspx#/Processus%20paiement%20et%20trsorerie/Forms/AllItems.aspx?RootFolder=%2fbao%2fCSPFin%2fProcessus%20paiement%20et%20trsorerie%2fSAFRAN%20MAROC%2fTest%20factures&FolderCTID=0x01200005E92C016C86064F9DE0ACDC48CEC253"
    folder = get_dir(URL)
    entries = os.listdir(folder)
    print(entries)

0 comments on commit 5eaf5e8

Please sign in to comment.