# studimi_zheni/convert.py

import pandas as pd
from os import listdir
from os.path import isfile, join
from docx.api import Document
from enum import Enum
from datetime import datetime


class Klinika(Enum):
    """Clinics whose documents can be converted.

    Each member's value is the plain string handed to ``convert``.
    """

    KIRURGJI = "kirurgji"
    OBS = "OBS-GYN"
    URGJENCA = "urgjenca"


def _drop_column(table, index: int) -> None:
    """Remove the column at *index* from a python-docx table, in place."""
    grid = table._tbl.find("w:tblGrid", table._tbl.nsmap)
    # Same order as before: delete the column's cells first, then the
    # matching column definition from the table grid.
    for cell in table.column_cells(index):
        cell._tc.getparent().remove(cell._tc)
    grid.remove(grid[index])


def _read_table(table):
    """Return ``(header, rows)`` for one table.

    ``header`` holds the first row's cell texts, stripped and with newlines
    replaced by spaces; ``rows`` holds the raw cell texts of every other row.
    """
    table_rows = list(table.rows)
    if not table_rows:
        return [], []
    header = [
        cell.text.strip().replace('\n', ' ') for cell in table_rows[0].cells
    ]
    # The header row is skipped instead of being removed from the document
    # XML; the document is never saved, so the result is the same.
    body = [[cell.text for cell in row.cells] for row in table_rows[1:]]
    return header, body


def convert(klinika: str, docx_path: str = './Word/',
            xlsx_path: str = './Excel/') -> None:
    """Merge the tables of every matching Word file into one Excel sheet.

    Every regular file in *docx_path* whose name contains *klinika* and does
    not contain ``"lock"`` is opened with python-docx.  The first row of each
    table is read as column headers and the remaining rows as data.  The
    result is written to ``raw_<klinika>_2016-2019.xlsx`` inside *xlsx_path*.

    Args:
        klinika: Substring selecting the input files; also used in the name
            of the output file.
        docx_path: Directory that holds the ``.docx`` inputs.
        xlsx_path: Directory that receives the Excel output.

    Raises:
        ValueError: If the number of distinct header labels differs from the
            width of the collected data rows.
    """
    print(f"Filloi {klinika} me {datetime.now()}")

    # listdir() returns names in arbitrary order; sorting keeps the row order
    # of the output stable between runs.
    list_paths = sorted(
        join(docx_path, name)
        for name in listdir(docx_path)
        if isfile(join(docx_path, name))
        and klinika in name
        and "lock" not in name
    )

    columns = []
    rows = []
    for path in list_paths:
        document = Document(path)
        for table in document.tables:
            if "urgjenca_2016.docx" in path:
                # This one file gets its third column (index 2) removed
                # before the table is read.
                _drop_column(table, 2)
                print("U FSHI PAVIONI")
            header, body = _read_table(table)
            columns.extend(header)
            rows.extend(body)
        print(f"Mbaroi {path} me {datetime.now()}")

    # Distinct header labels in first-seen order.
    filtered_columns = list(dict.fromkeys(columns))
    print(f'{len(filtered_columns)} Columns found')

    if rows:
        # Build the frame once; appending row by row is quadratic and
        # DataFrame.append no longer exists in pandas 2.x.
        df = pd.DataFrame(rows)
        if df.shape[1] != len(filtered_columns):
            raise ValueError(
                f"{len(filtered_columns)} distinct column headers found, "
                f"but the data rows have {df.shape[1]} columns"
            )
        df.columns = filtered_columns
    else:
        # No data rows at all: still emit a sheet carrying the headers.
        df = pd.DataFrame(columns=filtered_columns)

    print('writing to excel...')
    excel_path = join(xlsx_path, f"raw_{klinika}_2016-2019.xlsx")
    df.to_excel(excel_path, index=False)
    # Reported only once the file has actually been written.
    print(f"Perfundoi {klinika} me {datetime.now()}")


# Runs the conversion for the URGJENCA clinic whenever this module is
# executed or imported.
convert(klinika=Klinika.URGJENCA.value)