# DELOS-Aerospace/archive/read_docx.py

import zipfile
import xml.etree.ElementTree as ET
import sys
import os

# Clark-notation prefix for the WordprocessingML main namespace; ElementTree
# exposes every tag and namespaced attribute in the form "{namespace}local".
_W_PREFIX = '{http://schemas.openxmlformats.org/wordprocessingml/2006/main}'
_W_P = _W_PREFIX + 'p'
_W_R = _W_PREFIX + 'r'
_W_T = _W_PREFIX + 't'
_W_TAB = _W_PREFIX + 'tab'
_W_BR = _W_PREFIX + 'br'
_W_CR = _W_PREFIX + 'cr'
_W_TYPE = _W_PREFIX + 'type'


def _collect_text(element, pieces) -> bool:
    """Append the text found below ``element`` to ``pieces``.

    Nested ``w:p`` elements are not entered: the caller visits every
    paragraph in the document separately, so descending into them here
    would emit their text twice.

    Args:
        element: The element whose children are walked.
        pieces: List that receives the text fragments, in document order.

    Returns:
        True if at least one ``w:t`` element was encountered (even an
        empty one), False otherwise.
    """
    saw_text = False
    # Tab and break markers only stand for characters when they sit directly
    # inside a run; ``w:tab`` under ``w:pPr/w:tabs`` is a tab-stop definition.
    in_run = element.tag == _W_R
    for child in element:
        tag = child.tag
        if tag == _W_P:
            continue
        if tag == _W_T:
            saw_text = True
            if child.text:
                pieces.append(child.text)
        elif in_run and tag == _W_TAB:
            pieces.append("\t")
        elif in_run and tag == _W_CR:
            pieces.append("\n")
        elif in_run and tag == _W_BR:
            # Only text-wrapping breaks (the default when ``w:type`` is
            # absent) become a newline; page and column breaks are skipped.
            if child.get(_W_TYPE, 'textWrapping') == 'textWrapping':
                pieces.append("\n")
        elif _collect_text(child, pieces):
            saw_text = True
    return saw_text


def read_docx(file_path: str) -> None:
    """Print the text of a .docx file to stdout, one paragraph per line.

    The text is taken from ``word/document.xml`` inside the archive.
    Paragraphs that contain no ``w:t`` element are skipped. Inside a
    paragraph, run-level tabs are printed as a tab character and line
    breaks as a newline. Text that belongs to a paragraph nested inside
    another paragraph is printed once, as its own line.

    Problems are reported as a message on stdout rather than raised: a
    missing path prints ``Error: <path> not found.`` and any failure while
    reading or parsing prints ``Error reading <path>: <reason>``.

    Args:
        file_path: Path to the .docx file.

    Returns:
        None.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} not found.")
        return

    try:
        # Only the read needs the archive; close it before parsing.
        with zipfile.ZipFile(file_path, 'r') as docx:
            xml_content = docx.read('word/document.xml')
        tree = ET.fromstring(xml_content)

        texts = []
        for paragraph in tree.iter(_W_P):
            pieces = []
            if _collect_text(paragraph, pieces):
                texts.append("".join(pieces))

        print("\n".join(texts))
    except Exception as e:
        # Deliberate catch-all: this is the tool's reporting boundary, so any
        # failure (bad zip, missing part, malformed XML, I/O error) becomes a
        # one-line message instead of a traceback.
        print(f"Error reading {file_path}: {e}")

# Script entry point: the first command-line argument is the .docx path.
if __name__ == "__main__":
    cli_args = sys.argv[1:]
    if not cli_args:
        # No path supplied: show the usage line instead of reading a file.
        print("Usage: python read_docx.py <path_to_docx>")
    else:
        read_docx(cli_args[0])