Files
DELOS-Aerospace/archive/read_docx.py
2026-04-03 18:27:01 -04:00

37 lines
1.2 KiB
Python

import zipfile
import xml.etree.ElementTree as ET
import sys
import os
_W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'


def _paragraph_text(paragraph, parent_of):
    """Return the text of one ``w:p`` element, or None if it holds no ``w:t``.

    Besides ``w:t`` text, in-run tabs become ``"\\t"`` and text-wrapping line
    breaks / carriage returns become ``"\\n"`` so that words separated only by
    those elements are not glued together.

    Args:
        paragraph: The ``w:p`` element to flatten.
        parent_of: Mapping of every element in the document to its parent.
    """
    w = f"{{{_W_NS}}}"
    pieces = []
    has_text_node = False
    for node in paragraph.iter():
        if node.tag == w + "t":
            has_text_node = True
            if node.text:
                pieces.append(node.text)
            continue
        # ``w:tab`` is also used for tab-stop definitions under
        # ``w:pPr/w:tabs``; only a direct child of a run is real content.
        parent = parent_of.get(node)
        if parent is None or parent.tag != w + "r":
            continue
        if node.tag == w + "tab":
            pieces.append("\t")
        elif node.tag == w + "cr":
            pieces.append("\n")
        elif node.tag == w + "br":
            # A missing type means a text-wrapping (soft) line break; page
            # and column breaks carry no text and are left out.
            if node.get(w + "type", "textWrapping") == "textWrapping":
                pieces.append("\n")
    return "".join(pieces) if has_text_node else None


def read_docx(file_path):
    """Print the plain text of a .docx file to stdout, one paragraph per line.

    The text is taken from ``word/document.xml`` inside the archive.
    Paragraphs that contain no ``w:t`` element are skipped. Tabs and soft
    line breaks inside a paragraph are emitted as ``"\\t"`` and ``"\\n"``.

    Args:
        file_path: Path to the .docx file.

    Returns:
        None. The extracted text, or an error message if the file is missing
        or cannot be read, is printed to stdout.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} not found.")
        return
    try:
        # The main text content is in word/document.xml; the archive is only
        # needed long enough to pull that one member out.
        with zipfile.ZipFile(file_path, 'r') as docx:
            xml_content = docx.read('word/document.xml')
        # NOTE: xml.etree is not hardened against maliciously constructed
        # XML; only use this on documents from a trusted source.
        tree = ET.fromstring(xml_content)
        # ElementTree keeps no parent pointers, so build the lookup once.
        parent_of = {
            child: parent for parent in tree.iter() for child in parent
        }
        # Namespaces are important in docx XML
        namespaces = {'w': _W_NS}
        texts = []
        for paragraph in tree.findall('.//w:p', namespaces):
            text = _paragraph_text(paragraph, parent_of)
            if text is not None:
                texts.append(text)
        print("\n".join(texts))
    except Exception as e:
        # Best-effort CLI boundary: report any read/parse failure instead of
        # letting a traceback escape.
        print(f"Error reading {file_path}: {e}")
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the path of the
    # .docx file to dump; with no argument, show how to invoke the script.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python read_docx.py <path_to_docx>")
    else:
        read_docx(cli_args[0])