60 lines
1.9 KiB
Python
60 lines
1.9 KiB
Python
import pandas as pd
|
|
from os import listdir
|
|
from os.path import isfile, join
|
|
from docx.api import Document
|
|
from enum import Enum
|
|
from datetime import datetime
|
|
|
|
|
|
class Klinika(Enum):
    """Clinics whose Word exports can be converted.

    Each member's value is the text that is passed to ``convert`` as the
    clinic name.
    """

    KIRURGJI = 'kirurgji'
    OBS = 'OBS-GYN'
    URGJENCA = 'urgjenca'
|
|
|
|
|
|
def _matching_documents(klinika, docx_path):
    """Return the paths of regular files in *docx_path* whose name contains *klinika*."""
    return [
        join(docx_path, name)
        for name in listdir(docx_path)
        if isfile(join(docx_path, name))
        # Names containing "lock" are skipped -- presumably editor lock
        # files left next to an open document (TODO confirm).
        and klinika in name
        and "lock" not in name
    ]


def _drop_table_column(table, index):
    """Remove column *index* (its cells and its grid entry) from an in-memory docx table."""
    # The column is removed by editing the underlying XML through
    # python-docx's private ``_tbl`` / ``_tc`` attributes.
    grid = table._tbl.find("w:tblGrid", table._tbl.nsmap)
    for cell in table.column_cells(index):
        cell._tc.getparent().remove(cell._tc)
    grid.remove(grid[index])


def _read_table(table):
    """Return ``(header_labels, data_rows)`` for a docx table whose first row is the header."""
    header_row = table.rows[0]
    # Header labels are trimmed and flattened onto a single line.
    header = [cell.text.strip().replace('\n', ' ') for cell in header_row.cells]
    # Detach the header row so that only data rows remain in ``table.rows``.
    # The document is never saved, so the file on disk is not modified.
    table._tbl.remove(header_row._tr)
    rows = [[cell.text for cell in row.cells] for row in table.rows]
    return header, rows


def convert(klinika: str, *, docx_path: str = './Word/', xlsx_path: str = './Excel/') -> None:
    """Merge every table of a clinic's Word documents into one Excel sheet.

    All regular files in ``docx_path`` whose name contains ``klinika`` (and
    does not contain ``"lock"``) are opened as Word documents. The first row
    of every table is read as a header; the remaining rows are collected as
    data. The distinct header labels, in order of first appearance, become
    the column names of the sheet written to
    ``<xlsx_path>/raw_<klinika>_2016-2019.xlsx``.

    Progress messages are printed to stdout.

    Args:
        klinika: Text that a file name must contain to be processed; also
            used in the output file name.
        docx_path: Directory holding the Word documents.
        xlsx_path: Directory that receives the Excel file.

    Raises:
        FileNotFoundError: If ``docx_path`` does not exist.
        IndexError: If a table has no rows at all.
        ValueError: If the number of distinct header labels does not match
            the width of the collected data rows.
    """
    print(f"Filloi {klinika} me {datetime.now()}")

    columns = []
    rows = []
    for path in _matching_documents(klinika, docx_path):
        document = Document(path)
        # This one document gets its third column removed from every table
        # before reading -- judging by the log message, the "Pavioni"
        # column (TODO confirm).
        drop_third_column = "urgjenca_2016.docx" in path
        for table in document.tables:
            if drop_third_column:
                _drop_table_column(table, 2)
                print("U FSHI PAVIONI")
            header, body = _read_table(table)
            columns.extend(header)
            rows.extend(body)
        print(f"Mbaroi {path} me {datetime.now()}")

    # De-duplicate header labels while keeping first-seen order.
    filtered_columns = list(dict.fromkeys(columns))
    print(f'{len(filtered_columns)} Columns found')
    print('writing to excel...')

    # Build the frame once from the collected rows: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    df = pd.DataFrame(rows, columns=filtered_columns)
    excel_path = join(xlsx_path, f"raw_{klinika}_2016-2019.xlsx")
    df.to_excel(excel_path, index=False)

    # Reported only after the workbook has actually been written.
    print(f"Perfundoi {klinika} me {datetime.now()}")
|
|
|
|
|
|
# Kick off the conversion for the URGJENCA clinic.
convert(klinika=Klinika.URGJENCA.value)
|