""" A script that extracts shellcode from PDF files The script uses very basic shellcode extraction algorithm Copyright (c) 1990-2021 Hex-Rays ALL RIGHTS RESERVED. Revision history ========================= v1.0 - initial version Possible enhancements: ========================= 1. From Didier: ----------------- FYI: the regex you use to match /JavaScript /JS will fail to match name obfuscation. Name obuscation use a feature of the PDF language that allows a character in a name (like /JavaScript) to be replaced with its hexcode. Example: /#4AavaScript http://blog.didierstevens.com/2008/04/29/pdf-let-me-count-the-ways/ It's something that's used in-the-wild. I've updated your regex to support name obfuscation. The JavaScript itself is now captured in group 13. \/S\s*\/(J|#4A|#4a)(a|#61)(v|#76)(a|#61)(S|#53)(c|#63)(r|#72)(i|#69)(p|#70)(t|#74)\s*\/(J|#4A|#4a)(S|#53) \((.+?)>> 2. --------------- """ import sys import re import zlib SAMPLE1 = 'malware1.pdf.vir' SAMPLE2 = 'heapspray-simpler-calc.pdf.vir' try: import idaapi import idc import ida_idp ida = True except: ida = False # ----------------------------------------------------------------------- # Tries to find shellcode inside JavaScript statements # The seach algorithm is simple: it searchs for anything between unescape() # if it encounters %u or %x it correctly decodes them to characters def extract_shellcode(lines): p = 0 shellcode = [] # accumulate shellcode while True: p = lines.find(b'unescape("', p) if p == -1: break e = lines.find(b')', p) if e == -1: break expr = lines[p+9:e] data = [] def put_byte(b): if sys.version_info.major >= 3: data.append(b) else: data.append(chr(b)) for i in range(0, len(expr)): if expr[i:i+2] == b"%u": i += 2 put_byte(int(expr[i+2:i+4], 16)) put_byte(int(expr[i:i+2], 16)) i += 4 elif expr[i] == b"%": i += 1 put_byte(int(expr[i:i+2], 16)) i += 2 # advance the match pos p += 8 if sys.version_info.major >= 3: shellcode.append(bytes(data)) else: shellcode.append("".join(data)) # That's it return shellcode # ----------------------------------------------------------------------- # Given a PDF object id and version, we return the object declaration def find_obj(buf, id, ver): stream = re.search(b'%d %d obj(.*?)endobj' % (id, ver), buf, re.MULTILINE | re.DOTALL) if not stream: return None return buf[stream.start(1):stream.end(1)] # ----------------------------------------------------------------------- # Find JavaScript objects and extract the referenced script object id/ver def find_js_ref_streams(buf): o = [] js_ref_streams = re.finditer(r'\/S\s*\/JavaScript\/JS (\d+) (\d+) R'.encode("UTF-8"), buf) for g in js_ref_streams: id = int(g.group(1)) ver = int(g.group(2)) o.append([id, ver]) return o # ----------------------------------------------------------------------- # Find JavaScript objects and extract the embedded script def find_embedded_js(buf): r = re.finditer(r'\/S\s*\/JavaScript\s*\/JS \((.+?)>>'.encode("UTF-8"), buf, re.MULTILINE | re.DOTALL) if not r: return None ret = [] for js in r: p = buf.rfind(b'obj', 0, js.start(1)) if p == -1: return None scs = extract_shellcode(js.group(1)) if not scs: return None t = buf[p - min(20, len(buf)): p + 3] obj = re.search('(\d+) (\d+) obj'.encode("UTF-8"), t) if not obj: id, ver = 0 else: id = int(obj.group(1)) ver = int(obj.group(2)) n = 0 for sc in scs: n += 1 ret.append([id, ver, n, sc]) return ret # ----------------------------------------------------------------------- # Given a gzipped stream object, it returns the decompressed contents def decompress_stream(buf): if buf.find(b'Filter[/FlateDecode]') == -1: return None m = re.search(b'stream\s*(.+?)\s*endstream', buf, re.DOTALL | re.MULTILINE) if not m: return None # Decompress and return return zlib.decompress(m.group(1)) # ----------------------------------------------------------------------- def read_whole_file(li): li.seek(0) return li.read(li.size()) # ----------------------------------------------------------------------- def extract_pdf_shellcode(buf): ret = [] # find all JS stream references r = find_js_ref_streams(buf) for id, ver in r: # extract the JS stream object obj = find_obj(buf, id, ver) # decode the stream stream = decompress_stream(obj) # extract shell code scs = extract_shellcode(stream) i = 0 for sc in scs: i += 1 ret.append([id, ver, i, sc]) # find all embedded JS r = find_embedded_js(buf) if r: ret.extend(r) return ret # ----------------------------------------------------------------------- def accept_file(li, filename): """ Check if the file is of supported format @param li: a file-like object which can be used to access the input data @param filename: name of the file, if it is an archive member name then the actual file doesn't exist @return: 0 - no more supported formats string "name" - format name to display in the chooser dialog dictionary { 'format': "name", 'options': integer } options: should be 1, possibly ORed with ACCEPT_FIRST (0x8000) to indicate preferred format """ # we support only one format per file li.seek(0) if li.read(5) != b'%PDF-': return 0 buf = read_whole_file(li) r = extract_pdf_shellcode(buf) if not r: return 0 return {'format': 'PDF with shellcode', 'processor': 'metapc'} # ----------------------------------------------------------------------- def load_file(li, neflags, format): """ Load the file into database @param li: a file-like object which can be used to access the input data @param neflags: options selected by the user, see loader.hpp @return: 0-failure, 1-ok """ # Select the PC processor module idaapi.set_processor_type("metapc", ida_idp.SETPROC_LOADER) buf = read_whole_file(li) r = extract_pdf_shellcode(buf) if not r: return 0 # Load all shellcode into different segments start = 0x10000 seg = idaapi.segment_t() for id, ver, n, sc in r: size = len(sc) end = start + size # Create the segment seg.start_ea = start seg.end_ea = end seg.bitness = 1 # 32-bit idaapi.add_segm_ex(seg, "obj_%d_%d_%d" % (id, ver, n), "CODE", 0) # Copy the bytes idaapi.mem2base(sc, start, end) # Mark for analysis idc.AutoMark(start, idc.AU_CODE) # Compute next loading address start = ((end // 0x1000) + 1) * 0x1000 # Select the bochs debugger idc.load_debugger("bochs", 0) return 1 # ----------------------------------------------------------------------- def test1(sample = SAMPLE1): # open the file f = file(sample, 'rb') buf = f.read() f.close() # find all JS stream references r = find_js_ref_streams(buf) if not r: return for id, ver in r: obj = find_obj(buf, id, ver) # extract the JS stream object f = file('obj_%d_%d.bin' % (id, ver), 'wb') f.write(obj) f.close() # decode the stream stream = decompress_stream(obj) f = file('dec_%d_%d.bin' % (id, ver), 'wb') f.write(stream) f.close() # extract shell code scs = extract_shellcode(stream) i = 0 for sc in scs: i += 1 f = file('sh_%d_%d_%d.bin' % (id, ver, i), 'wb') f.write(sc) f.close() # ----------------------------------------------------------------------- def test2(sample = SAMPLE1): # open the file f = file(sample, 'rb') buf = f.read() f.close() r = extract_pdf_shellcode(buf) for id, ver, n, sc in r: print("sc %d.%d[%d]=%d" % (id, ver, n, len(sc))) # ----------------------------------------------------------------------- def test3(sample = SAMPLE2): # open the file f = file(sample, 'rb') buf = f.read() f.close() t = find_embedded_js(buf) print(t) # ----------------------------------------------------------------------- def main(): test1(SAMPLE1) if not ida: main()