# テスト記事3  (blog post title, "Test Article 3" - web page residue, not part of the script)

import fitz  # PyMuPDF

import pandas as pd

from pathlib import Path

import win32com.client as win32

# =========================

# 設定

# =========================

# Absolute Windows paths of the input PDF, the keyword workbook and the output
# document.  The string delimiters must be plain ASCII quotes; the typographic
# quotes this file was pasted with are not valid Python.
PDF_PATH = r"C:\Users\ishid\OneDrive\030_仕事\環境品質\110_Python\分析データ黒マスク\input\340000218438_ZT_SB_OX.pdf"
EXCEL_PATH = r"C:\Users\ishid\OneDrive\030_仕事\環境品質\110_Python\分析データ黒マスク\redaction_keywords.xlsx"  # needs a "キーワード" column
OUTPUT_DOC = r"C:\Users\ishid\OneDrive\030_仕事\環境品質\110_Python\分析データ黒マスク\output\340000218438_ZT_SB_OX_masked.docx"

DPI = 300  # page render resolution; main() uses DPI / 72 as the zoom factor
MARGIN_PT = 2.0  # padding added on every side of a keyword hit (PDF units)
GLOBAL_SHIFT_X_PT = 0.0  # constant offset added to every mask's left position
GLOBAL_SHIFT_Y_PT = 0.0  # constant offset added to every mask's top position

# =========================

# Word 定数(数値)

# =========================

WD_PAGE_BREAK = 7  # WdBreakType.wdPageBreak
WD_WRAP_FRONT = 6  # WdWrapType.wdWrapFront (in front of text)
WD_RELATIVE_PAGE = 1  # wdRelativeHorizontalPositionPage / wdRelativeVerticalPositionPage
# WdCollapseDirection: wdCollapseEnd is 0.  The value 1 used previously is
# wdCollapseStart, which collapses a range to its beginning instead of its end.
WD_COLLAPSE_END = 0

# =========================

# 単位変換・ページ設定

# =========================

def mm_to_pt(mm: float) -> float:
    """Convert a length in millimetres to points (72 pt per 25.4 mm)."""
    points_per_inch = 72.0
    mm_per_inch = 25.4
    # Multiply first, then divide, so results match the established rounding.
    return mm * points_per_inch / mm_per_inch

# Page margins, converted from millimetres to points.
TOP_MARGIN_PT = BOTTOM_MARGIN_PT = mm_to_pt(25.4)
SIDE_MARGIN_PT = mm_to_pt(19.05)  # used for both the left and right margin

# Header / footer distances.
HEADER_PT = FOOTER_PT = mm_to_pt(15.0)

# A4 sheet: 210 mm x 297 mm.
A4_WIDTH_PT, A4_HEIGHT_PT = mm_to_pt(210), mm_to_pt(297)

# =========================

# キーワード読み込み

# =========================

def load_keywords(excel_path: str) -> list[str]:
    """Read the redaction keywords from an Excel workbook.

    Args:
        excel_path: Path of a workbook readable by ``pandas.read_excel``.
            It must contain a column named ``キーワード``.

    Returns:
        The non-empty cells of that column as whitespace-stripped strings,
        in sheet order.

    Raises:
        ValueError: If the workbook has no ``キーワード`` column.
    """
    df = pd.read_excel(excel_path)
    if "キーワード" not in df.columns:
        raise ValueError("Excel に『キーワード』列がありません(列名: キーワード)。")

    cleaned = df["キーワード"].dropna().astype(str).str.strip()
    # A whitespace-only cell becomes "" after stripping; an empty string is
    # not a usable search term, so it is dropped here.
    return cleaned[cleaned != ""].tolist()

# =========================

# 文書末尾Rangeを安全に取得

# =========================

def safe_end_range(doc):
    """Return a collapsed Range one position before the end of the document.

    ``doc.Range(doc.Content.End, doc.Content.End)`` fails in some
    environments, so the position is clamped into the valid range instead.

    Args:
        doc: Word document object exposing ``Content.End`` and ``Range``.

    Returns:
        ``doc.Range(pos, pos)`` with ``pos = max(0, doc.Content.End - 1)``.
    """
    end_pos = doc.Content.End
    # Keep end_pos - 1 from going negative.
    pos = max(0, end_pos - 1)
    return doc.Range(pos, pos)

# =========================

# Shape をページ基準で固定配置(Front)

# =========================

def set_shape_page_based(pic_shape, left_pos: float, top_pos: float, width: float, height: float, doc):
    """Place a floating picture at a fixed page position, in front of text.

    Args:
        pic_shape: Word Shape to configure.
        left_pos: Value assigned to ``Left`` (converted with ``float``).
        top_pos: Value assigned to ``Top`` (converted with ``float``).
        width: Value assigned to ``Width`` (converted with ``float``).
        height: Value assigned to ``Height`` (converted with ``float``).
        doc: Word document; used only to ask its Application for a screen
            refresh at the end.

    Setting the page-relative positioning, the anchor lock, the crop reset
    and the screen refresh are best-effort: any exception they raise is
    ignored.  The wrap type and the geometry assignments are not guarded.
    """
    pic_shape.WrapFormat.Type = WD_WRAP_FRONT

    # Measure Left/Top from the page. Both axes share one guard, so a failure
    # on the horizontal axis also skips the vertical one.
    try:
        for axis in ("RelativeHorizontalPosition", "RelativeVerticalPosition"):
            setattr(pic_shape, axis, WD_RELATIVE_PAGE)
    except Exception:
        pass

    try:
        pic_shape.LockAnchor = True
    except Exception:
        pass

    # Clear any cropping on all four sides.
    try:
        for side in ("CropLeft", "CropRight", "CropTop", "CropBottom"):
            setattr(pic_shape.PictureFormat, side, 0)
    except Exception:
        pass

    # Aspect ratio is locked before sizing, then size and position are set in
    # this fixed order: Width, Height, Left, Top.
    # NOTE(review): with the ratio locked Word may adjust the other dimension;
    # main() re-reads the shape's geometry after this call - confirm.
    pic_shape.LockAspectRatio = True
    geometry = (
        ("Width", width),
        ("Height", height),
        ("Left", left_pos),
        ("Top", top_pos),
    )
    for prop, value in geometry:
        setattr(pic_shape, prop, float(value))

    try:
        doc.Application.ScreenRefresh()
    except Exception:
        pass

# =========================

# メイン

# =========================

def main():
    """Build a Word document of PDF page images with keyword hits blacked out.

    Steps:
      1. Load the keywords from ``EXCEL_PATH`` and print them.
      2. Open ``PDF_PATH`` and start a visible Word instance with a new
         A4 document using the module's margin constants.
      3. For every PDF page: insert a page break (except before the first
         page), render the page to a PNG at ``DPI``, insert it as a picture,
         convert it to a floating shape scaled to fit inside the margins and
         centred, then add one black rectangle per ``search_for`` hit of
         each keyword, anchored to the picture's anchor.
      4. Save the document as ``OUTPUT_DOC``.

    NOTE(review): the masks are separate shapes drawn over the page picture,
    so the covered pixels are presumably still present in the saved .docx -
    confirm this is acceptable before distributing the Word file itself.
    """
    pdf_path = Path(PDF_PATH)
    keywords = load_keywords(EXCEL_PATH)

    print("読み込んだキーワード:")
    for k in keywords:
        print("  -", k)

    # The render matrix is identical for every page, so build it once.
    zoom = DPI / 72.0
    mat = fitz.Matrix(zoom, zoom)

    mso_shape_rectangle = 1  # MsoAutoShapeType.msoShapeRectangle

    doc_pdf = fitz.open(str(pdf_path))
    try:
        word = win32.gencache.EnsureDispatch("Word.Application")
        word.Visible = True
        doc = word.Documents.Add()

        ps = doc.PageSetup
        ps.PageWidth = A4_WIDTH_PT
        ps.PageHeight = A4_HEIGHT_PT
        ps.TopMargin = TOP_MARGIN_PT
        ps.BottomMargin = BOTTOM_MARGIN_PT
        ps.LeftMargin = SIDE_MARGIN_PT
        ps.RightMargin = SIDE_MARGIN_PT
        ps.HeaderDistance = HEADER_PT
        ps.FooterDistance = FOOTER_PT

        # Area available for the page image, read back from Word's settings.
        usable_width = ps.PageWidth - ps.LeftMargin - ps.RightMargin
        usable_height = ps.PageHeight - ps.TopMargin - ps.BottomMargin

        for page_index, page_pdf in enumerate(doc_pdf):
            pdf_rect = page_pdf.rect

            print(f"\n=== Page {page_index + 1} ===")
            print("PDF page size:", pdf_rect.width, "x", pdf_rect.height)

            # The page break goes in before moving on to the next page.
            if page_index > 0:
                end_rng = safe_end_range(doc)
                end_rng.Collapse(WD_COLLAPSE_END)
                end_rng.InsertBreak(WD_PAGE_BREAK)

            # Anchor range for the picture, taken safely from the document end.
            anchor = safe_end_range(doc)
            anchor.Collapse(WD_COLLAPSE_END)
            anchor.InsertAfter("\r")
            anchor.Collapse(WD_COLLAPSE_END)

            # PDF page -> temporary PNG next to the input PDF.
            pix = page_pdf.get_pixmap(matrix=mat)
            tmp_img = pdf_path.parent / f"_tmp_page_{page_index + 1}.png"
            pix.save(str(tmp_img))

            # Insert the PNG into Word.
            try:
                inline = doc.InlineShapes.AddPicture(
                    FileName=str(tmp_img),
                    LinkToFile=False,
                    SaveWithDocument=True,
                    Range=anchor,
                )
            finally:
                # The picture is embedded (not linked), so the temporary file
                # is no longer needed; cleanup is best-effort.
                try:
                    tmp_img.unlink()
                except OSError:
                    pass

            pic_shape = inline.ConvertToShape()

            # Largest uniform scale that fits inside the margins, centred.
            scale_x = usable_width / pdf_rect.width
            scale_y = usable_height / pdf_rect.height
            scale = min(scale_x, scale_y)

            img_width = pdf_rect.width * scale
            img_height = pdf_rect.height * scale

            left_pos = ps.LeftMargin + (usable_width - img_width) / 2
            top_pos = ps.TopMargin + (usable_height - img_height) / 2

            set_shape_page_based(pic_shape, left_pos, top_pos, img_width, img_height, doc)

            # Use the geometry Word actually applied, not the requested one.
            left_pos = float(pic_shape.Left)
            top_pos = float(pic_shape.Top)
            img_width = float(pic_shape.Width)
            img_height = float(pic_shape.Height)

            print(f"画像配置(確定): Left={left_pos:.1f}, Top={top_pos:.1f}, W={img_width:.1f}, H={img_height:.1f}")

            # PDF units -> Word points, per axis.
            sx = img_width / pdf_rect.width
            sy = img_height / pdf_rect.height

            # Black masks.
            for kw in keywords:
                if not kw:
                    # An empty search string cannot identify any text.
                    continue

                for rect in page_pdf.search_for(kw):
                    # Grow the hit rectangle by MARGIN_PT on every side.
                    rx0 = rect.x0 - MARGIN_PT
                    ry0 = rect.y0 - MARGIN_PT
                    rw = rect.width + 2 * MARGIN_PT
                    rh = rect.height + 2 * MARGIN_PT

                    wx0 = left_pos + rx0 * sx + GLOBAL_SHIFT_X_PT
                    wy0 = top_pos + ry0 * sy + GLOBAL_SHIFT_Y_PT
                    ww = rw * sx
                    wh = rh * sy

                    if ww <= 0 or wh <= 0:
                        continue

                    shape = doc.Shapes.AddShape(
                        mso_shape_rectangle,
                        float(wx0),
                        float(wy0),
                        float(ww),
                        float(wh),
                        pic_shape.Anchor,
                    )
                    shape.Fill.ForeColor.RGB = 0x000000
                    shape.Line.Visible = False
                    shape.Name = "AUTO_MASK"

                    # Best-effort: keep the mask in front of the text layer.
                    try:
                        shape.WrapFormat.Type = WD_WRAP_FRONT
                    except Exception:
                        pass

        # Make sure the output folder exists before asking Word to save.
        Path(OUTPUT_DOC).parent.mkdir(parents=True, exist_ok=True)
        doc.SaveAs(OUTPUT_DOC)
    finally:
        # Release the PDF file handle even if Word automation fails.
        doc_pdf.close()

    print("\n完成:", OUTPUT_DOC)

# Run the conversion only when the file is executed as a script.
if __name__ == "__main__":
    main()

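# コメント  (blog "Comments" section label - web page residue, not part of the script)

# タイトルとURLをコピーしました  (blog "Title and URL copied" message - web page residue)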