Python × lxml.etree：從 Word OOXML 讀、查、改、寫的實戰筆記; from lxml import etree

by 儲蓄保險王 · 2025-11-04

sample.docx 內容:

Python × lxml.etree：從 Word OOXML 讀、查、改、寫的實戰筆記; from lxml import etree - 儲蓄保險王

將副檔名改為 .zip 後
document.xml內容:

第二段w:p內容:

概要

以路徑 D:\Temp\sample.zip\word\document.xml 為例
教你用 lxml.etree 完成：讀取、XPath 查找、修改、刪除與回寫 zip/docx
每個段落都附上可直接在 Jupyter 執行的程式碼區塊

準備

檔案結構：D:\Temp\sample.zip 內的 word/document.xml
若你的來源是 .docx，將 zip_path 改成 .docx 亦可（.docx 本質是 OpenXML 壓縮包）

一、安裝與基本讀取

# 如果未安裝 lxml，先執行這行（在 Jupyter 內可直接跑）
# %pip install lxml

from pathlib import Path
import zipfile
from lxml import etree

# Path to the package; a .docx works as well because it is a zip archive.
zip_path = Path(r"D:\Temp\sample.zip")
member = "word/document.xml"

# Read the part as raw bytes. Decoding with errors="replace" and re-encoding
# would silently turn undecodable bytes into U+FFFD and would ignore the
# encoding named in the XML declaration; the parser handles raw bytes itself.
with zipfile.ZipFile(zip_path, "r") as zf:
    xml = zf.read(member)

# recover=True keeps going on malformed markup; huge_tree=True disables
# libxml2's safety limits on tree depth and text size.
parser = etree.XMLParser(remove_blank_text=False, recover=True, huge_tree=True)
root = etree.fromstring(xml, parser=parser)
if root is None:
    # In recover mode the parser may give up without raising.
    raise ValueError(f"could not parse {member} from {zip_path}")

# Commonly used OOXML namespaces. The prefixes are arbitrary; only the URIs
# have to match the document.
ns = {
    "w":   "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "a":   "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r":   "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp":  "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
    "w14": "http://schemas.microsoft.com/office/word/2010/wordml",
}

# Count the paragraphs in the document.
paras = root.xpath(".//w:p", namespaces=ns)
print("段落數：", len(paras))

# Print the first paragraph, if there is one.
if paras:
    print(etree.tostring(paras[0], encoding="unicode", pretty_print=True))

# 如果未安裝 lxml，先執行這行（在 Jupyter 內可直接跑）
# %pip install lxml

from pathlib import Path
import zipfile
from lxml import etree

# Path to the package; a .docx works as well because it is a zip archive.
zip_path = Path(r"D:\Temp\sample.zip")
member = "word/document.xml"

# Read the part as raw bytes. Decoding with errors="replace" and re-encoding
# would silently turn undecodable bytes into U+FFFD and would ignore the
# encoding named in the XML declaration; the parser handles raw bytes itself.
with zipfile.ZipFile(zip_path, "r") as zf:
    xml = zf.read(member)

# recover=True keeps going on malformed markup; huge_tree=True disables
# libxml2's safety limits on tree depth and text size.
parser = etree.XMLParser(remove_blank_text=False, recover=True, huge_tree=True)
root = etree.fromstring(xml, parser=parser)
if root is None:
    # In recover mode the parser may give up without raising.
    raise ValueError(f"could not parse {member} from {zip_path}")

# Commonly used OOXML namespaces. The prefixes are arbitrary; only the URIs
# have to match the document.
ns = {
    "w":   "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "a":   "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r":   "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp":  "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
    "w14": "http://schemas.microsoft.com/office/word/2010/wordml",
}

# Count the paragraphs in the document.
paras = root.xpath(".//w:p", namespaces=ns)
print("段落數：", len(paras))

# Print the first paragraph, if there is one.
if paras:
    print(etree.tostring(paras[0], encoding="unicode", pretty_print=True))

輸出:

parser :

二、用 XPath 精準查找：找出含特定圖片 rId 的段落

# Relationship ID to look for (e.g. rId4).
rid = "rId4"

# 1) Select every w:p that has a descendant a:blip whose r:embed or r:link
#    attribute equals the XPath variable $rid.
xp = ".//w:p[.//a:blip[@r:embed=$rid or @r:link=$rid]]"
hits = root.xpath(xp, rid=rid, namespaces=ns)
print("命中段落數：", len(hits))

# 2) Show the w14:paraId and the full XML of the first matching paragraph.
for p in hits[:1]:
    paraId = p.get("{%s}paraId" % ns["w14"])
    print("命中 paraId：", paraId)
    print(etree.tostring(p, pretty_print=True, encoding="unicode"))

# 3) List every picture relationship ID that appears in the document.
rids = root.xpath("//a:blip/@r:embed | //a:blip/@r:link", namespaces=ns)
print("文檔中出現的 rId 一覽：", sorted({*rids}))

# Relationship ID to look for (e.g. rId4).
rid = "rId4"

# 1) Select every w:p that has a descendant a:blip whose r:embed or r:link
#    attribute equals the XPath variable $rid.
xp = ".//w:p[.//a:blip[@r:embed=$rid or @r:link=$rid]]"
hits = root.xpath(xp, rid=rid, namespaces=ns)
print("命中段落數：", len(hits))

# 2) Show the w14:paraId and the full XML of the first matching paragraph.
for p in hits[:1]:
    paraId = p.get("{%s}paraId" % ns["w14"])
    print("命中 paraId：", paraId)
    print(etree.tostring(p, pretty_print=True, encoding="unicode"))

# 3) List every picture relationship ID that appears in the document.
rids = root.xpath("//a:blip/@r:embed | //a:blip/@r:link", namespaces=ns)
print("文檔中出現的 rId 一覽：", sorted({*rids}))

輸出:

三、只刪除圖片，保留段落與文字

# Remove the pictures that reference `rid`, keeping the paragraph and its text.
# The whole w:drawing is dropped: removing only wp:inline would leave an empty
# <w:drawing/> behind (the schema requires an inline or anchor child), and
# floating pictures use wp:anchor rather than wp:inline.
removed = 0
for blip in root.xpath(".//a:blip[@r:embed=$rid or @r:link=$rid]", namespaces=ns, rid=rid):
    # ancestor:: is a reverse axis, so [1] is the nearest enclosing element.
    # Fall back to wp:inline only if no w:drawing wraps the picture.
    containers = (
        blip.xpath("ancestor::w:drawing[1]", namespaces=ns)
        or blip.xpath("ancestor::wp:inline[1]", namespaces=ns)
    )
    if not containers:
        continue
    node = containers[0]
    parent = node.getparent()
    # parent is None when an earlier iteration already detached this container
    # (several blips can share one drawing), so it is counted only once.
    if parent is not None:
        parent.remove(node)
        removed += 1

print("已移除圖片數：", removed)

# Remove the pictures that reference `rid`, keeping the paragraph and its text.
# The whole w:drawing is dropped: removing only wp:inline would leave an empty
# <w:drawing/> behind (the schema requires an inline or anchor child), and
# floating pictures use wp:anchor rather than wp:inline.
removed = 0
for blip in root.xpath(".//a:blip[@r:embed=$rid or @r:link=$rid]", namespaces=ns, rid=rid):
    # ancestor:: is a reverse axis, so [1] is the nearest enclosing element.
    # Fall back to wp:inline only if no w:drawing wraps the picture.
    containers = (
        blip.xpath("ancestor::w:drawing[1]", namespaces=ns)
        or blip.xpath("ancestor::wp:inline[1]", namespaces=ns)
    )
    if not containers:
        continue
    node = containers[0]
    parent = node.getparent()
    # parent is None when an earlier iteration already detached this container
    # (several blips can share one drawing), so it is counted only once.
    if parent is not None:
        parent.remove(node)
        removed += 1

print("已移除圖片數：", removed)

輸出:

四、刪除整個命中段落

# %pip install lxml
from pathlib import Path
import zipfile
from lxml import etree

zip_path = Path(r"D:\Temp\sample.zip")      # change to your file
member = "word/document.xml"
rid_to_remove = "rId4"                       # relationship ID to delete

# Read and parse. The part is handed to the parser as raw bytes: decoding with
# errors="replace" and re-encoding would silently corrupt undecodable bytes.
with zipfile.ZipFile(zip_path, "r") as zf:
    xml = zf.read(member)
parser = etree.XMLParser(recover=True, huge_tree=True)
root = etree.fromstring(xml, parser=parser)
if root is None:
    # In recover mode the parser may give up without raising.
    raise ValueError(f"could not parse {member} from {zip_path}")

ns = {
    "w":   "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "a":   "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r":   "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp":  "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
    "w14": "http://schemas.microsoft.com/office/word/2010/wordml",
}

# 1) List every picture relationship ID first, to confirm the target exists.
all_rids = root.xpath("//a:blip/@r:embed | //a:blip/@r:link", namespaces=ns)
print("所有 rId：", sorted(set(all_rids)))

# 2) Run two queries. xp1 matches any a:blip below the paragraph; xp2 is the
#    stricter form that also requires the blip to sit inside pic:blipFill, so
#    its hits are always a subset of xp1's (the two are not equivalent).
xp1 = ".//w:p[.//a:blip[@r:embed=$rid or @r:link=$rid]]"
xp2 = ".//w:p[.//pic:blipFill//a:blip[@r:embed=$rid or @r:link=$rid]]"

hits1 = root.xpath(xp1, namespaces=ns, rid=rid_to_remove)
hits2 = root.xpath(xp2, namespaces=ns, rid=rid_to_remove)
print("xp1 命中段落數：", len(hits1))
print("xp2 命中段落數：", len(hits2))

# 3) Remove the matched paragraphs from their parents.
# NOTE(review): deleting the only w:p of a table cell, or a w:p whose w:pPr
# carries w:sectPr, may yield a document Word has to repair - confirm on
# real files before relying on this for such paragraphs.
hits = hits1 or hits2
for p in hits:
    parent = p.getparent()
    if parent is not None:
        parent.remove(p)

print("刪除段落數：", len(hits))

# Write a new package only when something was deleted.
if hits:
    # Serialize to UTF-8 bytes with an XML declaration; encoding="unicode"
    # cannot emit one, and Word-authored parts carry it.
    new_xml = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True)
    out = Path(r"D:\Temp\sample_removed.docx")
    with zipfile.ZipFile(zip_path, "r") as zin, zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout:
        for item in zin.infolist():
            # Copy every entry unchanged except the edited part.
            data = new_xml if item.filename == member else zin.read(item.filename)
            zout.writestr(item, data)
    print("已輸出：", out)

# %pip install lxml
from pathlib import Path
import zipfile
from lxml import etree

zip_path = Path(r"D:\Temp\sample.zip")      # change to your file
member = "word/document.xml"
rid_to_remove = "rId4"                       # relationship ID to delete

# Read and parse. The part is handed to the parser as raw bytes: decoding with
# errors="replace" and re-encoding would silently corrupt undecodable bytes.
with zipfile.ZipFile(zip_path, "r") as zf:
    xml = zf.read(member)
parser = etree.XMLParser(recover=True, huge_tree=True)
root = etree.fromstring(xml, parser=parser)
if root is None:
    # In recover mode the parser may give up without raising.
    raise ValueError(f"could not parse {member} from {zip_path}")

ns = {
    "w":   "http://schemas.openxmlformats.org/wordprocessingml/2006/main",
    "a":   "http://schemas.openxmlformats.org/drawingml/2006/main",
    "r":   "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    "wp":  "http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing",
    "pic": "http://schemas.openxmlformats.org/drawingml/2006/picture",
    "w14": "http://schemas.microsoft.com/office/word/2010/wordml",
}

# 1) List every picture relationship ID first, to confirm the target exists.
all_rids = root.xpath("//a:blip/@r:embed | //a:blip/@r:link", namespaces=ns)
print("所有 rId：", sorted(set(all_rids)))

# 2) Run two queries. xp1 matches any a:blip below the paragraph; xp2 is the
#    stricter form that also requires the blip to sit inside pic:blipFill, so
#    its hits are always a subset of xp1's (the two are not equivalent).
xp1 = ".//w:p[.//a:blip[@r:embed=$rid or @r:link=$rid]]"
xp2 = ".//w:p[.//pic:blipFill//a:blip[@r:embed=$rid or @r:link=$rid]]"

hits1 = root.xpath(xp1, namespaces=ns, rid=rid_to_remove)
hits2 = root.xpath(xp2, namespaces=ns, rid=rid_to_remove)
print("xp1 命中段落數：", len(hits1))
print("xp2 命中段落數：", len(hits2))

# 3) Remove the matched paragraphs from their parents.
# NOTE(review): deleting the only w:p of a table cell, or a w:p whose w:pPr
# carries w:sectPr, may yield a document Word has to repair - confirm on
# real files before relying on this for such paragraphs.
hits = hits1 or hits2
for p in hits:
    parent = p.getparent()
    if parent is not None:
        parent.remove(p)

print("刪除段落數：", len(hits))

# Write a new package only when something was deleted.
if hits:
    # Serialize to UTF-8 bytes with an XML declaration; encoding="unicode"
    # cannot emit one, and Word-authored parts carry it.
    new_xml = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True)
    out = Path(r"D:\Temp\sample_removed.docx")
    with zipfile.ZipFile(zip_path, "r") as zin, zipfile.ZipFile(out, "w", zipfile.ZIP_DEFLATED) as zout:
        for item in zin.infolist():
            # Copy every entry unchanged except the edited part.
            data = new_xml if item.filename == member else zin.read(item.filename)
            zout.writestr(item, data)
    print("已輸出：", out)

常用語法速記

child: a/b/c 直系子節點
descendant: .//a 任意深度後代
parent: ../ 回到父節點
ancestor::w:p 往上找祖先 w:p
predicate: w:p[w:r] 段落中含 w:r
attribute: //@attr 任何節點上的 attr
or/and: [@r:embed=$rid or @r:link=$rid]

在 XPath 裡：

@ 是「屬性」選擇器
- @r:embed 指的是節點上的名為 r:embed 的屬性
- @r:link 指的是節點上的名為 r:link 的屬性
- 例如在 <a:blip r:embed="rId4"/>，@r:embed 的值就是 "rId4"
$ 是「變數」引用（由你的 XPath 執行端提供）
- $rid 代表一個外部傳入的變數 rid
- 在 lxml 中會這樣呼叫：
  root.xpath("//a:blip[@r:embed=$rid or @r:link=$rid]", namespaces=ns, rid="rId4")
  這表示把字串 "rId4" 綁定到 XPath 裡的變數 $rid

所以整句的意思是：選出所有 a:blip 節點，且該節點的屬性 r:embed 或 r:link 的值等於變數 $rid。

輸出:

五、寫回成新的 zip 或 docx

def write_back(orig_zip: Path, out_zip: Path, new_xml_str: str,
               member: str = "word/document.xml") -> None:
    """Copy a zip/docx package to a new file, replacing one XML part.

    Every entry of ``orig_zip`` is copied to ``out_zip`` with its original
    ZipInfo metadata; the entry named ``member`` gets ``new_xml_str`` encoded
    as UTF-8 instead of its original bytes.

    Args:
        orig_zip: Existing package to read from.
        out_zip: Package to create (overwritten if it already exists).
        new_xml_str: Replacement XML text for ``member``.
        member: Archive name of the part to replace.

    Raises:
        ValueError: If ``out_zip`` and ``orig_zip`` are the same file. Opening
            the output in "w" mode would truncate the source before it is read.
    """
    if Path(orig_zip).resolve() == Path(out_zip).resolve():
        raise ValueError("out_zip must differ from orig_zip; writing in place would truncate the source")

    payload = new_xml_str.encode("utf-8")
    with zipfile.ZipFile(orig_zip, "r") as zin, zipfile.ZipFile(out_zip, "w", zipfile.ZIP_DEFLATED) as zout:
        for item in zin.infolist():
            data = payload if item.filename == member else zin.read(item.filename)
            # Passing the ZipInfo keeps the entry's name, timestamp and
            # compression type; size and CRC are recomputed from `data`.
            zout.writestr(item, data)

# Write the modified tree to a new file (a .docx extension is fine).
out_path = Path(r"D:\Temp\sample_modified.docx")
# encoding="unicode" cannot emit an XML declaration, so serialize to UTF-8
# bytes with the declaration and decode; write_back() encodes it as UTF-8
# again, which matches the encoding named in the declaration.
new_xml = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True).decode("utf-8")
write_back(zip_path, out_path, new_xml)
print("已寫出：", out_path)

def write_back(orig_zip: Path, out_zip: Path, new_xml_str: str,
               member: str = "word/document.xml") -> None:
    """Copy a zip/docx package to a new file, replacing one XML part.

    Every entry of ``orig_zip`` is copied to ``out_zip`` with its original
    ZipInfo metadata; the entry named ``member`` gets ``new_xml_str`` encoded
    as UTF-8 instead of its original bytes.

    Args:
        orig_zip: Existing package to read from.
        out_zip: Package to create (overwritten if it already exists).
        new_xml_str: Replacement XML text for ``member``.
        member: Archive name of the part to replace.

    Raises:
        ValueError: If ``out_zip`` and ``orig_zip`` are the same file. Opening
            the output in "w" mode would truncate the source before it is read.
    """
    if Path(orig_zip).resolve() == Path(out_zip).resolve():
        raise ValueError("out_zip must differ from orig_zip; writing in place would truncate the source")

    payload = new_xml_str.encode("utf-8")
    with zipfile.ZipFile(orig_zip, "r") as zin, zipfile.ZipFile(out_zip, "w", zipfile.ZIP_DEFLATED) as zout:
        for item in zin.infolist():
            data = payload if item.filename == member else zin.read(item.filename)
            # Passing the ZipInfo keeps the entry's name, timestamp and
            # compression type; size and CRC are recomputed from `data`.
            zout.writestr(item, data)

# Write the modified tree to a new file (a .docx extension is fine).
out_path = Path(r"D:\Temp\sample_modified.docx")
# encoding="unicode" cannot emit an XML declaration, so serialize to UTF-8
# bytes with the declaration and decode; write_back() encodes it as UTF-8
# again, which matches the encoding named in the declaration.
new_xml = etree.tostring(root, xml_declaration=True, encoding="UTF-8", standalone=True).decode("utf-8")
write_back(zip_path, out_path, new_xml)
print("已寫出：", out_path)

sample_modified.docx

六、快速驗證輸出

# Re-open the file that was just written and check whether the given
# relationship ID is still referenced by any picture.
def verify(zip_file: Path, rid: str) -> None:
    """Print how many a:blip elements in word/document.xml reference ``rid``.

    Args:
        zip_file: The zip/docx package to inspect.
        rid: Relationship ID matched against the r:embed and r:link attributes.

    Raises:
        ValueError: If the part cannot be parsed at all.
    """
    with zipfile.ZipFile(zip_file, "r") as zf:
        # Keep the raw bytes: decoding with errors="replace" and re-encoding
        # would silently alter undecodable content before it is checked.
        raw = zf.read("word/document.xml")
    root_v = etree.fromstring(raw, parser=etree.XMLParser(recover=True, huge_tree=True))
    if root_v is None:
        # In recover mode the parser may give up without raising.
        raise ValueError(f"could not parse word/document.xml from {zip_file}")
    # Only the prefixes used by the query below are needed.
    ns_v = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    }
    # XPath count() returns a float, hence the int() conversion.
    n = root_v.xpath("count(.//a:blip[@r:embed=$rid or @r:link=$rid])", namespaces=ns_v, rid=rid)
    print(f"{zip_file.name} 中 rId={rid} 的圖片數：", int(n))

verify(out_path, "rId4")

# Re-open the file that was just written and check whether the given
# relationship ID is still referenced by any picture.
def verify(zip_file: Path, rid: str) -> None:
    """Print how many a:blip elements in word/document.xml reference ``rid``.

    Args:
        zip_file: The zip/docx package to inspect.
        rid: Relationship ID matched against the r:embed and r:link attributes.

    Raises:
        ValueError: If the part cannot be parsed at all.
    """
    with zipfile.ZipFile(zip_file, "r") as zf:
        # Keep the raw bytes: decoding with errors="replace" and re-encoding
        # would silently alter undecodable content before it is checked.
        raw = zf.read("word/document.xml")
    root_v = etree.fromstring(raw, parser=etree.XMLParser(recover=True, huge_tree=True))
    if root_v is None:
        # In recover mode the parser may give up without raising.
        raise ValueError(f"could not parse word/document.xml from {zip_file}")
    # Only the prefixes used by the query below are needed.
    ns_v = {
        "a": "http://schemas.openxmlformats.org/drawingml/2006/main",
        "r": "http://schemas.openxmlformats.org/officeDocument/2006/relationships",
    }
    # XPath count() returns a float, hence the int() conversion.
    n = root_v.xpath("count(.//a:blip[@r:embed=$rid or @r:link=$rid])", namespaces=ns_v, rid=rid)
    print(f"{zip_file.name} 中 rId={rid} 的圖片數：", int(n))

verify(out_path, "rId4")

輸出:

七、常見錯誤與速解

找不到節點：多半是命名空間 URI 沒對。前綴名稱可自訂，但 URI 必須精準一致。
解析錯誤：對不乾淨的 XML 使用 recover=True；對大型檔案加 huge_tree=True。
Edge/文字編輯器看不到重點：瀏覽器只顯示，不會做 XPath；請用 lxml 驗證。

八、延伸操作範例

取得所有段落的 w14:paraId

# Collect the w14:paraId attribute of every paragraph (None where absent),
# then print only the non-empty ones.
para_ids = [
    p.get("{%s}paraId" % ns["w14"])
    for p in root.xpath(".//w:p", namespaces=ns)
]
print(list(filter(None, para_ids)))

# Collect the w14:paraId attribute of every paragraph (None where absent),
# then print only the non-empty ones.
para_ids = [
    p.get("{%s}paraId" % ns["w14"])
    for p in root.xpath(".//w:p", namespaces=ns)
]
print(list(filter(None, para_ids)))

輸出:

結語

以 lxml.etree 操作 OOXML，核心在「正確宣告命名空間」與「用 XPath 自內而外定位」。
以上每段程式都能直接在 Jupyter 貼上執行，從讀取、查找，到修改與寫回完整打通。

一	二	三	四	五	六	日
					1	2
3	4	5	6	7	8	9
10	11	12	13	14	15	16
17	18	19	20	21	22	23
24	25	26	27	28	29	30

Python × lxml.etree：從 Word OOXML 讀、查、改、寫的實戰筆記; from lxml import etree

You may also like...

發佈留言取消回覆

hahow

近期文章

分類

近期留言

熱門討論

FB粉絲團

瀏覽量

月曆

Python × lxml.etree：從 Word OOXML 讀、查、改、寫的實戰筆記; from lxml import etree

You may also like...

Python: 二進制的負數如何表示？補數

Python: 如何利用dict.pop() 對字典重新排序? dict.pop() #刪除key,回傳value

一文搞懂Python pandas.DataFrame去重：df.drop_duplicates() 與 df[~df.duplicated()] 的等價、差異與最佳實踐

發佈留言 取消回覆

hahow

近期文章

分類

近期留言

熱門討論

FB粉絲團

瀏覽量

月曆

發佈留言取消回覆