Files
studimi_zheni/convert.py

60 lines
1.9 KiB
Python

import pandas as pd
from os import listdir
from os.path import isfile, join
from docx.api import Document
from enum import Enum
from datetime import datetime
class Klinika(Enum):
    """Clinics that can be selected for conversion.

    Each member's value is the string handed to ``convert``, which uses it
    to pick input file names and to name the output workbook.
    """

    KIRURGJI = 'kirurgji'
    OBS = 'OBS-GYN'
    URGJENCA = 'urgjenca'
def _drop_table_column(table, index):
    """Remove column ``index`` from a python-docx table (cells and grid entry)."""
    grid = table._tbl.find("w:tblGrid", table._tbl.nsmap)
    for cell in table.column_cells(index):
        cell._tc.getparent().remove(cell._tc)
    # Drop the matching grid column as well so the grid agrees with the cells.
    grid.remove(grid[index])


def convert(klinika: str) -> None:
    """Merge the tables of all matching Word files into one Excel workbook.

    Every regular file in ``./Word/`` whose name contains ``klinika`` and does
    not contain ``"lock"`` is opened with python-docx. For each table in each
    document the first row is read as the header and the remaining rows are
    collected as data. The collected rows are written to
    ``./Excel/raw_<klinika>_2016-2019.xlsx`` with the distinct header names
    (in first-seen order) as column labels.

    Args:
        klinika: Substring used to select input files; also embedded in the
            name of the output workbook.

    Raises:
        FileNotFoundError: If ``./Word/`` does not exist (from ``listdir``).
        ValueError: Raised by pandas when the number of distinct header names
            differs from the width of the collected rows.
    """
    print(f"Filloi {klinika} me {datetime.now()}")
    docx_path = './Word/'
    xlsx_path = './Excel/'

    # listdir() returns names in arbitrary order; sort so the row order of the
    # resulting workbook is reproducible between runs.
    list_paths = [
        join(docx_path, name)
        for name in sorted(listdir(docx_path))
        if isfile(join(docx_path, name))
        and klinika in name
        and "lock" not in name
    ]

    columns = []
    rows = []
    for path in list_paths:
        document = Document(path)
        for table in document.tables:
            if "urgjenca_2016.docx" in path:
                # This one file gets its third column (index 2) removed before
                # reading. NOTE(review): presumably an extra column the other
                # files lack -- confirm against the data.
                _drop_table_column(table, 2)
                print("U FSHI PAVIONI")

            header_row = table.rows[0]
            columns.extend(
                cell.text.strip().replace('\n', ' ') for cell in header_row.cells
            )
            # Detach the header row so only data rows remain in the table.
            table._tbl.remove(header_row._tr)

            # Accumulate plain lists and build the DataFrame once at the end:
            # DataFrame.append copied the whole frame on every call and no
            # longer exists in pandas >= 2.0.
            rows.extend([cell.text for cell in row.cells] for row in table.rows)
        print(f"Mbaroi {path} me {datetime.now()}")

    # Order-preserving de-duplication of the header names.
    filtered_columns = list(dict.fromkeys(columns))
    print(f'{len(filtered_columns)} Columns found')
    print('writing to excel...')
    print(f"Perfundoi {klinika} me {datetime.now()}")

    df = pd.DataFrame(rows)
    df.columns = filtered_columns
    excel_path = join(xlsx_path, f"raw_{klinika}_2016-2019.xlsx")
    df.to_excel(excel_path, index=False)
# Script entry: run the conversion for the URGJENCA clinic.
convert(klinika=Klinika.URGJENCA.value)