-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathextract_pdf_images.py
More file actions
179 lines (136 loc) · 6.91 KB
/
extract_pdf_images.py
File metadata and controls
179 lines (136 loc) · 6.91 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# Original idea from Hitcon 2022 web2pdf challenge: https://blog.splitline.tw/hitcon-ctf-2022/#%F0%9F%93%83-web2pdf-web
# Example usage: python extract_pdf_images.py Ticket-831767.pdf -v
import base64
import binascii
import re
import zlib
from argparse import ArgumentParser

import fitz  # PyMuPDF
from PIL import Image
# Command-line interface: one positional PDF path plus an optional verbosity
# flag. The parsed values are exposed as module-level constants that the rest
# of the script reads.
parser = ArgumentParser(
    description=(
        "Extract file contents embedded in bitmap images in PDF file. "
        "File content may be plaintext or base64/zlib compressed. "
        "Extracted bitmap images and file contents are written to disk"
    )
)
parser.add_argument('file', help='PDF file path')
parser.add_argument(
    '-v',
    '--verbose',
    action='store_true',
    help='Verbose output',
)
args = parser.parse_args()
PDF_FILE_PATH, VERBOSE = args.file, args.verbose
def decompress(data: bytes, chunk_size: int = 1024) -> bytes:
    """Best-effort decompression of a raw DEFLATE stream.

    The input is fed to a ``zlib`` decompressor created with ``wbits=-15``
    (raw deflate, i.e. no zlib/gzip header or trailer) in slices of
    ``chunk_size`` bytes, so that output produced before a corrupt chunk is
    kept instead of being lost.

    Args:
        data: Bytes that may contain a raw DEFLATE stream.
        chunk_size: Number of input bytes handed to the decompressor per step.

    Returns:
        The decompressed bytes. When a ``zlib.error`` is raised, the output
        collected before the failing chunk is returned (possibly empty).
    """
    decompressor = zlib.decompressobj(wbits=-15)
    # Collect pieces and join once; repeated ``bytes +=`` is quadratic.
    pieces = []
    for offset in range(0, len(data), chunk_size):
        try:
            pieces.append(decompressor.decompress(data[offset:offset + chunk_size]))
        except zlib.error as e:
            if VERBOSE:
                print(f"Zlib error encountered at chunk {offset}: {e}. Stopping decompression.")
            # Keep what was successfully decompressed up to this point.
            break
        if decompressor.eof:
            # End of the compressed stream: anything after it is trailing
            # data that would yield no further output.
            break
    pieces.append(decompressor.flush())
    return b"".join(pieces)
def decodeb64(encoded_data: bytes, min_b64_output_bytes: int = 12) -> bytes:
    """Best-effort Base64 decode with a plain-text fallback.

    The stripped input is decoded strictly (``validate=True``) four
    characters at a time, stopping at the first block that is not valid
    Base64.

    Args:
        encoded_data: Bytes that may be Base64 text.
        min_b64_output_bytes: Minimum number of decoded bytes required for
            the result to be treated as Base64.

    Returns:
        The decoded bytes (complete, or the partial prefix decoded before the
        first invalid block) when at least ``min_b64_output_bytes`` bytes
        were produced. Otherwise the whole stripped input passed through
        ``_clean_unprintable_bytes``.
    """
    encoded_data = encoded_data.strip()
    block_size = 4  # one Base64 quantum: 4 characters decode to up to 3 bytes

    # Collect decoded blocks and join once; repeated ``bytes +=`` is quadratic.
    pieces = []
    failed = False
    for offset in range(0, len(encoded_data), block_size):
        try:
            pieces.append(
                base64.b64decode(encoded_data[offset:offset + block_size], validate=True)
            )
        except binascii.Error:
            failed = True
            break
    decoded_output = b"".join(pieces)

    if len(decoded_output) < min_b64_output_bytes:
        # Too little decoded output to be convincing Base64: treat the entire
        # original input as plain text instead.
        if VERBOSE:
            if failed:
                print(f"B64 decode failed after only {len(decoded_output)} bytes. Falling back to plain text.")
            else:
                print(f"Full B64 decode resulted in only {len(decoded_output)} bytes (< {min_b64_output_bytes}). Falling back to plain text.")
        return _clean_unprintable_bytes(encoded_data)

    if failed and VERBOSE:
        print(f"B64 decode failed after {len(decoded_output)} bytes (>= {min_b64_output_bytes}). Returning partial output.")
    return decoded_output
# --- Helper Function for Plain Text Cleaning ---
def _clean_unprintable_bytes(data: bytes) -> bytes:
    """Return *data* reduced to printable ASCII plus newline, CR and tab.

    Every byte outside 0x20-0x7E is removed, except for line feed, carriage
    return and horizontal tab, which are kept. The result is bytes.
    """
    allowed = set(range(0x20, 0x7F)) | {0x09, 0x0A, 0x0D}
    # Build the complement of the allowed set and delete it in a single pass.
    unwanted = bytes(value for value in range(256) if value not in allowed)
    return data.translate(None, unwanted)
def extract_data(filename):
    """Recover the payload stored after the marker in *filename* and save it.

    The file is read as bytes; everything after the first occurrence of the
    marker ``b'\\x1b$)C'`` is kept and NUL bytes are removed. That payload is
    passed through ``decodeb64`` and the result through ``decompress``. The
    first non-empty result, in the order decompressed, decoded, raw payload,
    is written to ``<filename>.extracted``. Nothing is written when all three
    are empty.

    Any exception is caught and reported on stdout; the function returns
    ``None`` in every case.
    """
    try:
        with open(filename, 'rb') as source:
            raw = source.read()

        # Discard everything up to and including the marker, then drop NULs.
        _, _, tail = raw.partition(b'\x1b$)C')
        payload = tail.replace(b'\x00', b'')

        decoded = decodeb64(payload)
        inflated = decompress(decoded)
        # Prefer the most processed form that actually produced output.
        payload = inflated or decoded or payload

        if VERBOSE:
            print(payload)
        if not payload:
            return

        target = filename + '.extracted'
        with open(target, 'wb') as sink:
            sink.write(payload)
            print(f'Wrote extracted data to: {target}')
    except Exception as e:
        print(f'Unexpected error extracting data: {e}')
# Script body: save every image referenced by each page of the PDF as a BMP
# file in the current directory, then try to recover embedded data from it.
try:
    # The context manager closes the document even if processing fails.
    with fitz.open(PDF_FILE_PATH) as pdf_file:
        for page_number, page in enumerate(pdf_file, start=1):
            image_list = page.get_images(full=True)
            if not image_list:
                continue
            if VERBOSE:
                print(f"Found {len(image_list)} images on page {page_number}")
            for image_index, img in enumerate(image_list, start=1):
                xref = img[0]  # first item of each image entry is its xref
                try:
                    # Use PyMuPDF's pixmap to get the raw image data.
                    pix = fitz.Pixmap(pdf_file, xref)
                    # Image.frombytes("RGB", ...) below needs exactly three
                    # bytes per pixel. Convert non-RGB colorspaces first ...
                    if pix.n - pix.alpha != 3:
                        pix = fitz.Pixmap(fitz.csRGB, pix)
                    # ... then drop the alpha channel; a colorspace
                    # conversion alone keeps it.
                    if pix.alpha:
                        pix = fitz.Pixmap(pix, 0)
                    # Build a Pillow image from the raw pixel data.
                    pil_image = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
                    # Save in BMP format; extract_data reads this file back.
                    image_filename = f"page{page_number}_img{image_index}.bmp"
                    pil_image.save(image_filename, "BMP")
                    if VERBOSE:
                        print(f"Saved original image as BMP: {image_filename}")
                    extract_data(image_filename)
                except Exception as e:
                    # A failure on one image must not stop the others.
                    print(f"Error processing image {image_index} on page {page_number}: {e}")
                print()
except Exception as e:
    print(f"An error occurred: {e}")