37 lines
1.2 KiB
Python
37 lines
1.2 KiB
Python
import zipfile
|
|
import xml.etree.ElementTree as ET
|
|
import sys
|
|
import os
|
|
|
|
# WordprocessingML namespace and the fully-qualified tags this reader cares about.
_W_NS = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
_W_RUN = f'{{{_W_NS}}}r'
_W_TEXT = f'{{{_W_NS}}}t'
_W_TAB = f'{{{_W_NS}}}tab'
_W_BREAK = f'{{{_W_NS}}}br'
_W_CR = f'{{{_W_NS}}}cr'
_W_BREAK_TYPE = f'{{{_W_NS}}}type'


def _iter_paragraph_text(node, inside_run=False):
    """Yield the text fragments found beneath *node*, in document order.

    ``w:t`` elements yield their text (an empty string when they have none).
    ``w:tab``, ``w:cr`` and text-wrapping ``w:br`` elements yield a tab or a
    newline, but only when they are direct children of a run (``w:r``): the
    same ``w:tab`` tag is also used for tab-stop *definitions* inside
    paragraph properties, and those must not produce any output.
    """
    for child in node:
        tag = child.tag
        if tag == _W_TEXT:
            yield child.text or ""
        elif inside_run and tag == _W_TAB:
            yield "\t"
        elif inside_run and tag == _W_CR:
            yield "\n"
        elif inside_run and tag == _W_BREAK:
            # A missing type attribute means a plain line break; page and
            # column breaks carry no text and are ignored.
            if child.get(_W_BREAK_TYPE, "textWrapping") == "textWrapping":
                yield "\n"
        else:
            yield from _iter_paragraph_text(child, tag == _W_RUN)


def read_docx(file_path):
    """Print the text of a .docx file to stdout, one paragraph per line.

    The file is opened as a zip archive, ``word/document.xml`` is parsed, and
    the text of every ``w:p`` paragraph is joined into a single output line.
    Tabs and line breaks inside runs are rendered as ``"\\t"`` and ``"\\n"``
    so that the words around them are not fused together. Paragraphs that
    contain no text, tab or line-break element are skipped.

    Args:
        file_path: Path to the .docx file to read.

    Returns:
        None. The extracted text, or an error message if the file is missing
        or cannot be read, is printed to stdout.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} not found.")
        return

    try:
        with zipfile.ZipFile(file_path, 'r') as docx:
            # The main text content is in word/document.xml
            xml_content = docx.read('word/document.xml')
        tree = ET.fromstring(xml_content)

        # Namespaces are important in docx XML
        namespaces = {'w': _W_NS}

        texts = []
        for paragraph in tree.findall('.//w:p', namespaces):
            pieces = list(_iter_paragraph_text(paragraph))
            if pieces:
                texts.append("".join(pieces))

        print("\n".join(texts))
    except Exception as e:
        # Deliberately broad: this is the script's reporting boundary, so a
        # corrupt archive, a missing document part or malformed XML is
        # reported to the user instead of surfacing as a traceback.
        print(f"Error reading {file_path}: {e}")
|
|
|
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the path of the
    # .docx file to print; with no argument, show the usage line instead.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python read_docx.py <path_to_docx>")
    else:
        read_docx(cli_args[0])
|