"""Collect the tables of a clinic's Word reports into a single Excel sheet."""

from datetime import datetime
from enum import Enum
from os import listdir
from os.path import isfile, join

import pandas as pd
from docx.api import Document


class Klinika(Enum):
    """Clinic identifiers; each value is the substring matched against file names."""

    KIRURGJI = 'kirurgji'
    OBS = 'OBS-GYN'
    URGJENCA = 'urgjenca'


def _drop_column(table, index: int) -> None:
    """Remove the column at *index* from *table* in the in-memory document.

    Every cell element of that column is detached from its parent row, and
    the child at the same position of the table's ``w:tblGrid`` element is
    removed as well.
    """
    grid = table._tbl.find("w:tblGrid", table._tbl.nsmap)
    for cell in table.column_cells(index):
        cell._tc.getparent().remove(cell._tc)
    grid.remove(grid[index])


def convert(klinika: str) -> None:
    """Merge all tables from the matching ``.docx`` files into one workbook.

    Files are taken from ``./Word/``; a file is used when its name contains
    *klinika* and does not contain ``"lock"``. In every table of every
    selected document the first row is read as the header and all remaining
    rows as data. The data rows of all tables are stacked in the order they
    are read, the column names are the distinct header texts in first-seen
    order, and the result is written to
    ``./Excel/raw_{klinika}_2016-2019.xlsx``.

    Args:
        klinika: Substring identifying the clinic in the file names
            (the module passes a ``Klinika`` member's ``value``).

    Raises:
        ValueError: Raised by pandas when the number of distinct header
            texts differs from the number of columns in the collected data.
    """
    print(f"Filloi {klinika} me {datetime.now()}")

    docx_path = './Word/'
    xlsx_path = './Excel/'

    # NOTE(review): "lock" presumably filters out editor lock files - confirm.
    list_paths = [
        join(docx_path, name)
        for name in listdir(docx_path)
        if isfile(join(docx_path, name))
        and klinika in name
        and "lock" not in name
    ]

    columns = []  # header texts of every table, duplicates included
    rows = []     # one list of cell texts per data row, across all tables

    for path in list_paths:
        document = Document(path)
        for table in document.tables:
            if "urgjenca_2016.docx" in path:
                # This one file gets its third column (index 2) removed
                # before the table is read.
                _drop_column(table, 2)
                print("U FSHI PAVIONI")

            header_row = table.rows[0]
            columns.extend(
                cell.text.strip().replace('\n', ' ')
                for cell in header_row.cells
            )
            # Detach the header row so that table.rows yields only data rows.
            table._tbl.remove(header_row._tr)

            # Rows are accumulated in a plain list and turned into a frame
            # once at the end: DataFrame.append was removed in pandas 2.0
            # and copied the whole frame on every call.
            rows.extend([cell.text for cell in row.cells] for row in table.rows)
        print(f"Mbaroi {path} me {datetime.now()}")

    # Distinct header texts, keeping the order of first appearance.
    filtered_columns = list(dict.fromkeys(columns))

    print(f'{len(filtered_columns)} Columns found')
    print('writing to excel...')

    df = pd.DataFrame(rows)
    df.columns = filtered_columns
    excel_path = join(xlsx_path, f"raw_{klinika}_2016-2019.xlsx")
    df.to_excel(excel_path, index=False)

    # Reported only after the workbook has actually been written.
    print(f"Perfundoi {klinika} me {datetime.now()}")


# NOTE(review): this runs on import as well as on script execution, exactly
# as before; consider an `if __name__ == "__main__":` guard if the module is
# ever imported from elsewhere.
convert(Klinika.URGJENCA.value)