import fitz # PyMuPDF
import pandas as pd
from pathlib import Path
import win32com.client as win32
# =========================
# 設定
# =========================
# Input PDF, keyword workbook and output document.
# The string delimiters must be plain ASCII quotes; typographic quotes are a
# syntax error in Python.
PDF_PATH = r"C:\Users\ishid\OneDrive\030_仕事\環境品質\110_Python\分析データ黒マスク\input\340000218438_ZT_SB_OX.pdf"
EXCEL_PATH = r"C:\Users\ishid\OneDrive\030_仕事\環境品質\110_Python\分析データ黒マスク\redaction_keywords.xlsx"  # must contain a "キーワード" column
OUTPUT_DOC = r"C:\Users\ishid\OneDrive\030_仕事\環境品質\110_Python\分析データ黒マスク\output\340000218438_ZT_SB_OX_masked.docx"
DPI = 300  # resolution used when rasterising each PDF page
MARGIN_PT = 2.0  # padding added on every side of a keyword hit, in PDF points
GLOBAL_SHIFT_X_PT = 0.0  # manual horizontal offset applied to every mask, in points
GLOBAL_SHIFT_Y_PT = 0.0  # manual vertical offset applied to every mask, in points
# =========================
# Word 定数(数値)
# =========================
# Numeric values of the Word enumeration members used by this script.
WD_PAGE_BREAK = 7  # WdBreakType.wdPageBreak
WD_WRAP_FRONT = 6  # WdWrapType.wdWrapFront (in front of text)
WD_RELATIVE_PAGE = 1  # wdRelativeHorizontalPositionPage / wdRelativeVerticalPositionPage (both are 1)
# WdCollapseDirection: wdCollapseEnd is 0. The value 1 is wdCollapseStart,
# which would collapse a range to its beginning instead of its end.
WD_COLLAPSE_END = 0  # WdCollapseDirection.wdCollapseEnd
# =========================
# 単位変換・ページ設定
# =========================
def mm_to_pt(mm: float) -> float:
    """Convert a length in millimetres to points (72 pt per 25.4 mm)."""
    return mm * 72.0 / 25.4
# Word page layout, converted from millimetres to points.
TOP_MARGIN_PT = BOTTOM_MARGIN_PT = mm_to_pt(25.4)  # top and bottom margins
SIDE_MARGIN_PT = mm_to_pt(19.05)  # used for both the left and the right margin
HEADER_PT, FOOTER_PT = mm_to_pt(15.0), mm_to_pt(15.0)  # header / footer distance
A4_WIDTH_PT, A4_HEIGHT_PT = mm_to_pt(210), mm_to_pt(297)  # A4 portrait page size
# =========================
# キーワード読み込み
# =========================
def load_keywords(excel_path: str) -> list[str]:
    """Load the redaction keywords from an Excel workbook.

    The sheet returned by ``pandas.read_excel`` must contain a column named
    ``キーワード``. Empty cells are dropped, the remaining values are converted
    to text and stripped of surrounding whitespace, and entries that end up
    empty are discarded so that callers never search for an empty string.

    Args:
        excel_path: Path of the workbook to read.

    Returns:
        The cleaned keywords, in sheet order.

    Raises:
        ValueError: If the workbook has no ``キーワード`` column.
    """
    df = pd.read_excel(excel_path)
    if "キーワード" not in df.columns:
        raise ValueError("Excel に『キーワード』列がありません(列名: キーワード)。")
    cleaned = df["キーワード"].dropna().astype(str).str.strip()
    # Whitespace-only cells become "" after strip(); they are not keywords.
    return [kw for kw in cleaned.tolist() if kw]
# =========================
# 文書末尾Rangeを安全に取得
# =========================
def safe_end_range(doc):
    """Return a collapsed Range located at the end of the document.

    ``doc.Range(doc.Content.End, doc.Content.End)`` fails in some
    environments, so the position is clamped to ``Content.End - 1`` (never
    below 0) before the Range is created.

    Args:
        doc: Word document object exposing ``Content.End`` and ``Range``.

    Returns:
        The object returned by ``doc.Range(pos, pos)``.
    """
    end_pos = doc.Content.End
    # Keep end_pos - 1 from going negative.
    pos = max(0, end_pos - 1)
    return doc.Range(pos, pos)
# =========================
# Shape をページ基準で固定配置(Front)
# =========================
def set_shape_page_based(pic_shape, left_pos: float, top_pos: float, width: float, height: float, doc):
    """Place a floating picture shape at a fixed position, in front of text.

    The wrap type, size and position assignments are mandatory and propagate
    any error. The page-relative positioning, anchor lock, crop reset and
    screen refresh are best-effort: a failure there is ignored so the
    remaining settings are still applied.

    Args:
        pic_shape: Word Shape object to configure.
        left_pos: Value assigned to ``Left``, in points.
        top_pos: Value assigned to ``Top``, in points.
        width: Value assigned to ``Width``, in points.
        height: Value assigned to ``Height``, in points.
        doc: Document owning the shape; only used to refresh the screen.
    """
    pic_shape.WrapFormat.Type = WD_WRAP_FRONT

    # Measure Left/Top from the page rather than from the margin or paragraph.
    try:
        pic_shape.RelativeHorizontalPosition = WD_RELATIVE_PAGE
        pic_shape.RelativeVerticalPosition = WD_RELATIVE_PAGE
    except Exception:
        pass  # best-effort: keep going with the shape's current reference frame

    try:
        pic_shape.LockAnchor = True
    except Exception:
        pass  # best-effort

    # Remove any cropping so that Width/Height describe the whole image.
    try:
        pic_shape.PictureFormat.CropLeft = 0
        pic_shape.PictureFormat.CropRight = 0
        pic_shape.PictureFormat.CropTop = 0
        pic_shape.PictureFormat.CropBottom = 0
    except Exception:
        pass  # best-effort

    # The aspect ratio is locked before both dimensions are assigned, so the
    # caller is expected to pass a width/height pair with the image's ratio.
    pic_shape.LockAspectRatio = True
    pic_shape.Width = float(width)
    pic_shape.Height = float(height)
    pic_shape.Left = float(left_pos)
    pic_shape.Top = float(top_pos)

    try:
        doc.Application.ScreenRefresh()
    except Exception:
        pass  # best-effort: the refresh is cosmetic
# =========================
# メイン
# =========================
def main():
    """Build a Word document of page images with black masks over keywords.

    Steps:
      1. Load the keywords from ``EXCEL_PATH`` and print them.
      2. Open ``PDF_PATH`` and start a new Word document with the configured
         page size and margins.
      3. For every PDF page: render it to a PNG at ``DPI``, insert it as a
         floating picture scaled to fit inside the margins, then add one
         black rectangle named ``AUTO_MASK`` per keyword hit.
      4. Save the document to ``OUTPUT_DOC``.

    NOTE(review): the masks are separate shapes placed on top of a picture
    rendered from the unredacted page, so the picture still contains the
    masked text and the shapes can be moved or deleted in Word. Flatten the
    result (for example, export to an image-only format) before distributing
    it as a redacted document.
    """
    pdf_path = Path(PDF_PATH)
    keywords = load_keywords(EXCEL_PATH)
    print("読み込んだキーワード:")
    for k in keywords:
        print(" -", k)

    doc_pdf = fitz.open(str(pdf_path))
    try:
        word = win32.gencache.EnsureDispatch("Word.Application")
        word.Visible = True
        doc = word.Documents.Add()

        ps = doc.PageSetup
        ps.PageWidth = A4_WIDTH_PT
        ps.PageHeight = A4_HEIGHT_PT
        ps.TopMargin = TOP_MARGIN_PT
        ps.BottomMargin = BOTTOM_MARGIN_PT
        ps.LeftMargin = SIDE_MARGIN_PT
        ps.RightMargin = SIDE_MARGIN_PT
        ps.HeaderDistance = HEADER_PT
        ps.FooterDistance = FOOTER_PT
        usable_width = ps.PageWidth - ps.LeftMargin - ps.RightMargin
        usable_height = ps.PageHeight - ps.TopMargin - ps.BottomMargin

        # The render matrix is the same for every page.
        zoom = DPI / 72.0
        mat = fitz.Matrix(zoom, zoom)

        for page_index, page_pdf in enumerate(doc_pdf):
            pdf_rect = page_pdf.rect
            print(f"\n=== Page {page_index + 1} ===")
            print("PDF page size:", pdf_rect.width, "x", pdf_rect.height)

            # The page break goes in before moving on to the next page.
            if page_index > 0:
                end_rng = safe_end_range(doc)
                end_rng.Collapse(WD_COLLAPSE_END)
                end_rng.InsertBreak(WD_PAGE_BREAK)

            # Anchor range for the picture, taken safely from the document end.
            anchor = safe_end_range(doc)
            anchor.Collapse(WD_COLLAPSE_END)
            anchor.InsertAfter("\r")
            anchor.Collapse(WD_COLLAPSE_END)

            # PDF page -> PNG -> embedded Word picture.
            tmp_img = pdf_path.parent / f"_tmp_page_{page_index + 1}.png"
            try:
                pix = page_pdf.get_pixmap(matrix=mat)
                pix.save(str(tmp_img))
                inline = doc.InlineShapes.AddPicture(
                    FileName=str(tmp_img),
                    LinkToFile=False,
                    SaveWithDocument=True,
                    Range=anchor,
                )
            finally:
                # The picture is embedded (not linked), so the PNG is no longer
                # needed. A leftover temp file must not abort the run.
                try:
                    tmp_img.unlink()
                except OSError:
                    pass
            pic_shape = inline.ConvertToShape()

            # Largest size that fits inside the margins, centred.
            scale_x = usable_width / pdf_rect.width
            scale_y = usable_height / pdf_rect.height
            scale = min(scale_x, scale_y)
            img_width = pdf_rect.width * scale
            img_height = pdf_rect.height * scale
            left_pos = ps.LeftMargin + (usable_width - img_width) / 2
            top_pos = ps.TopMargin + (usable_height - img_height) / 2
            set_shape_page_based(pic_shape, left_pos, top_pos, img_width, img_height, doc)

            # Use the geometry Word actually applied, not the requested one.
            left_pos = float(pic_shape.Left)
            top_pos = float(pic_shape.Top)
            img_width = float(pic_shape.Width)
            img_height = float(pic_shape.Height)
            print(f"画像配置(確定): Left={left_pos:.1f}, Top={top_pos:.1f}, W={img_width:.1f}, H={img_height:.1f}")
            sx = img_width / pdf_rect.width
            sy = img_height / pdf_rect.height

            # Black masks: one rectangle per keyword hit on this page.
            for kw in keywords:
                if not kw:
                    continue  # nothing to search for
                for rect in page_pdf.search_for(kw):
                    # Pad the hit rectangle in PDF space, then map to Word points.
                    rx0 = rect.x0 - MARGIN_PT
                    ry0 = rect.y0 - MARGIN_PT
                    rw = rect.width + 2 * MARGIN_PT
                    rh = rect.height + 2 * MARGIN_PT
                    wx0 = left_pos + rx0 * sx + GLOBAL_SHIFT_X_PT
                    wy0 = top_pos + ry0 * sy + GLOBAL_SHIFT_Y_PT
                    ww = rw * sx
                    wh = rh * sy
                    if ww <= 0 or wh <= 0:
                        continue
                    shape = doc.Shapes.AddShape(
                        1, float(wx0), float(wy0), float(ww), float(wh),
                        pic_shape.Anchor
                    )
                    shape.Fill.ForeColor.RGB = 0x000000
                    shape.Line.Visible = False
                    shape.Name = "AUTO_MASK"
                    try:
                        shape.WrapFormat.Type = WD_WRAP_FRONT
                    except Exception:
                        pass  # best-effort
                    # wx0/wy0 are derived from the picture's page-relative
                    # Left/Top, so pin the mask to the same reference frame and
                    # re-apply the coordinates.
                    # NOTE(review): assumes a new shape may default to a
                    # different reference frame; harmless if it is already
                    # page-relative.
                    try:
                        shape.RelativeHorizontalPosition = WD_RELATIVE_PAGE
                        shape.RelativeVerticalPosition = WD_RELATIVE_PAGE
                        shape.Left = float(wx0)
                        shape.Top = float(wy0)
                    except Exception:
                        pass  # best-effort: keep the position given to AddShape

        # SaveAs fails when the target folder is missing.
        Path(OUTPUT_DOC).parent.mkdir(parents=True, exist_ok=True)
        doc.SaveAs(OUTPUT_DOC)
    finally:
        doc_pdf.close()
    print("\n完成:", OUTPUT_DOC)
# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()



# コメント ("Comment") - stray text from the page this script was copied from; not part of the program.