converted .docx to .md, added subfolders
This commit is contained in:
36
archive/read_docx.py
Normal file
36
archive/read_docx.py
Normal file
@@ -0,0 +1,36 @@
|
||||
import zipfile
|
||||
import xml.etree.ElementTree as ET
|
||||
import sys
|
||||
import os
|
||||
|
||||
def read_docx(file_path):
    """Print the text of a .docx file to stdout, one paragraph per line.

    The main document part (``word/document.xml``) is read from the .docx
    zip archive.  For every ``w:p`` (paragraph) element that contains at
    least one ``w:t`` (text) element, the non-empty texts of those ``w:t``
    elements are concatenated into a single output line.  Paragraphs
    without any ``w:t`` element are skipped.

    Diagnostics are written to stderr rather than stdout, so that an error
    message can never be mistaken for document text when stdout is
    redirected or piped.

    Args:
        file_path: Path to the .docx file to read.

    Returns:
        None.  Failures (missing file, invalid archive, malformed XML, ...)
        are reported on stderr instead of being raised.
    """
    if not os.path.exists(file_path):
        print(f"Error: {file_path} not found.", file=sys.stderr)
        return

    # Namespaces are important in docx XML: every WordprocessingML element
    # lives in this namespace, so lookups must be namespace-qualified.
    namespaces = {
        'w': 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
    }

    try:
        # Only the raw bytes are needed from the archive, so it is closed
        # again before parsing starts.
        with zipfile.ZipFile(file_path, 'r') as docx:
            # The main text content is in word/document.xml
            xml_content = docx.read('word/document.xml')

        tree = ET.fromstring(xml_content)

        texts = []
        for paragraph in tree.findall('.//w:p', namespaces):
            t_elements = paragraph.findall('.//w:t', namespaces)
            if t_elements:
                # An empty <w:t/> has text None; skip it instead of
                # failing in join().
                texts.append("".join(t.text for t in t_elements if t.text))

        print("\n".join(texts))
    except Exception as e:
        # Deliberately broad: this is the script's best-effort boundary, so
        # any failure is reported to the user instead of producing a
        # traceback.
        print(f"Error reading {file_path}: {e}", file=sys.stderr)
|
||||
|
||||
if __name__ == "__main__":
    # The .docx path is taken from the first command-line argument; any
    # further arguments are ignored.
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python read_docx.py <path_to_docx>")
    else:
        read_docx(cli_args[0])
|
||||
Reference in New Issue
Block a user