new
This commit is contained in:
BIN
Delos Aerospace Master Business Plan.md
Normal file
BIN
Delos Aerospace Master Business Plan.md
Normal file
Binary file not shown.
36
read_docx.py
Normal file
36
read_docx.py
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
import zipfile
|
||||||
|
import xml.etree.ElementTree as ET
|
||||||
|
import sys
|
||||||
|
import os
|
||||||
|
|
||||||
|
# WordprocessingML namespace and the Clark-notation tags looked up below.
_W_NS = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
_W_RUN = f"{{{_W_NS}}}r"
_W_TEXT = f"{{{_W_NS}}}t"
_W_TAB = f"{{{_W_NS}}}tab"
_W_BREAKS = (f"{{{_W_NS}}}br", f"{{{_W_NS}}}cr")


def _iter_text_pieces(element, inside_run=False):
    """Yield the text fragments found below *element*, in document order.

    ``w:t`` contributes its text wherever it appears.  ``w:tab`` and
    ``w:br``/``w:cr`` contribute "\\t" and "\\n", but only as direct children
    of a run (``w:r``): the same ``w:tab`` tag is also used for tab-stop
    definitions in paragraph properties, which are not characters.
    """
    for child in element:
        tag = child.tag
        if tag == _W_TEXT:
            if child.text:
                yield child.text
        elif inside_run and tag == _W_TAB:
            yield "\t"
        elif inside_run and tag in _W_BREAKS:
            yield "\n"
        else:
            yield from _iter_text_pieces(child, tag == _W_RUN)


def read_docx(file_path):
    """Print the plain text of a .docx file to stdout, one paragraph per line.

    The archive member ``word/document.xml`` is parsed and every ``w:p``
    element (at any depth, so table cells are included) that contains at
    least one ``w:t`` element becomes one output line.  In-run tabs and line
    breaks are rendered as "\\t" and "\\n" so neighbouring words do not fuse.

    Args:
        file_path: Path to the .docx file to read.

    Returns:
        None.  The text, or an error message when the path is missing or the
        file cannot be read, is printed rather than returned.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} not found.")
        return

    namespaces = {'w': _W_NS}

    # Best-effort CLI helper: any failure (not a zip, missing member,
    # malformed XML, ...) is reported as a message instead of a traceback.
    try:
        with zipfile.ZipFile(file_path, 'r') as docx:
            # The main text content is in word/document.xml
            xml_content = docx.read('word/document.xml')
        tree = ET.fromstring(xml_content)

        texts = [
            "".join(_iter_text_pieces(paragraph))
            for paragraph in tree.findall('.//w:p', namespaces)
            # Paragraphs without any text element are skipped entirely.
            if paragraph.find('.//w:t', namespaces) is not None
        ]

        print("\n".join(texts))
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the document
    # path; any further arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python read_docx.py <path_to_docx>")
    else:
        read_docx(cli_args[0])
|
||||||
Reference in New Issue
Block a user