update to ida 7.6, add builds

2021-10-31 21:20:46 +02:00
parent e0e0f2be99
commit b1809fe2d9
1408 changed files with 279193 additions and 302468 deletions
--- a/idasdk76/ldr/script_ldrs/pdfldr.py
+++ b/idasdk76/ldr/script_ldrs/pdfldr.py
@@ -0,0 +1,319 @@
+"""
+A script that extracts shellcode from PDF files
+
+The script uses very basic shellcode extraction algorithm
+
+Copyright (c) 1990-2021 Hex-Rays
+ALL RIGHTS RESERVED.
+
+Revision history
+=========================
+v1.0 - initial version
+
+
+Possible enhancements:
+=========================
+1. From Didier:
+-----------------
+FYI: the regex you use to match /JavaScript /JS will fail to match
+name obfuscation. Name obuscation use a feature of the PDF language
+that allows a character in a name (like /JavaScript) to be replaced
+with its hexcode. Example: /#4AavaScript
+http://blog.didierstevens.com/2008/04/29/pdf-let-me-count-the-ways/
+
+It's something that's used in-the-wild.
+
+I've updated your regex to support name obfuscation. The JavaScript
+itself is now captured in group 13.
+
+\/S\s*\/(J|#4A|#4a)(a|#61)(v|#76)(a|#61)(S|#53)(c|#63)(r|#72)(i|#69)(p|#70)(t|#74)\s*\/(J|#4A|#4a)(S|#53)
+\((.+?)>>
+
+2.
+---------------
+
+"""
+
+import sys
+import re
+import zlib
+
+SAMPLE1 = 'malware1.pdf.vir'
+SAMPLE2 = 'heapspray-simpler-calc.pdf.vir'
+
+try:
+    import idaapi
+    import idc
+    import ida_idp
+    ida = True
+except:
+    ida = False
+
+# -----------------------------------------------------------------------
+# Tries to find shellcode inside JavaScript statements
+# The seach algorithm is simple: it searchs for anything between unescape()
+# if it encounters %u or %x it correctly decodes them to characters
+def extract_shellcode(lines):
+    p = 0
+    shellcode = [] # accumulate shellcode
+    while True:
+        p = lines.find(b'unescape("', p)
+        if p == -1:
+            break
+        e = lines.find(b')', p)
+        if e == -1:
+            break
+        expr = lines[p+9:e]
+        data = []
+        def put_byte(b):
+            if sys.version_info.major >= 3:
+                data.append(b)
+            else:
+                data.append(chr(b))
+
+        for i in range(0, len(expr)):
+            if expr[i:i+2] == b"%u":
+                i += 2
+                put_byte(int(expr[i+2:i+4], 16))
+                put_byte(int(expr[i:i+2], 16))
+                i += 4
+            elif expr[i] == b"%":
+                i += 1
+                put_byte(int(expr[i:i+2], 16))
+                i += 2
+        # advance the match pos
+        p += 8
+        if sys.version_info.major >= 3:
+            shellcode.append(bytes(data))
+        else:
+            shellcode.append("".join(data))
+
+    # That's it
+    return shellcode
+
+# -----------------------------------------------------------------------
+# Given a PDF object id and version, we return the object declaration
+def find_obj(buf, id, ver):
+    stream = re.search(b'%d %d obj(.*?)endobj' % (id, ver), buf, re.MULTILINE | re.DOTALL)
+    if not stream:
+        return None
+    return buf[stream.start(1):stream.end(1)]
+
+# -----------------------------------------------------------------------
+# Find JavaScript objects and extract the referenced script object id/ver
+def find_js_ref_streams(buf):
+    o = []
+    js_ref_streams = re.finditer(r'\/S\s*\/JavaScript\/JS (\d+) (\d+) R'.encode("UTF-8"), buf)
+    for g in js_ref_streams:
+        id = int(g.group(1))
+        ver = int(g.group(2))
+        o.append([id, ver])
+    return o
+
+# -----------------------------------------------------------------------
+# Find JavaScript objects and extract the embedded script
+def find_embedded_js(buf):
+    r = re.finditer(r'\/S\s*\/JavaScript\s*\/JS \((.+?)>>'.encode("UTF-8"), buf, re.MULTILINE | re.DOTALL)
+    if not r:
+        return None
+
+    ret = []
+    for js in r:
+        p = buf.rfind(b'obj', 0, js.start(1))
+        if p == -1:
+            return None
+
+        scs = extract_shellcode(js.group(1))
+        if not scs:
+            return None
+
+        t = buf[p - min(20, len(buf)): p + 3]
+        obj = re.search('(\d+) (\d+) obj'.encode("UTF-8"), t)
+        if not obj:
+            id, ver = 0
+        else:
+            id = int(obj.group(1))
+            ver = int(obj.group(2))
+        n = 0
+        for sc in scs:
+            n += 1
+            ret.append([id, ver, n, sc])
+    return ret
+
+# -----------------------------------------------------------------------
+# Given a gzipped stream object, it returns the decompressed contents
+def decompress_stream(buf):
+    if buf.find(b'Filter[/FlateDecode]') == -1:
+        return None
+    m = re.search(b'stream\s*(.+?)\s*endstream', buf, re.DOTALL | re.MULTILINE)
+    if not m:
+        return None
+    # Decompress and return
+    return zlib.decompress(m.group(1))
+
+
+# -----------------------------------------------------------------------
+def read_whole_file(li):
+    li.seek(0)
+    return li.read(li.size())
+
+# -----------------------------------------------------------------------
+def extract_pdf_shellcode(buf):
+    ret = []
+
+    # find all JS stream references
+    r = find_js_ref_streams(buf)
+    for id, ver in r:
+        # extract the JS stream object
+        obj = find_obj(buf, id, ver)
+
+        # decode the stream
+        stream = decompress_stream(obj)
+
+        # extract shell code
+        scs = extract_shellcode(stream)
+        i = 0
+        for sc in scs:
+            i += 1
+            ret.append([id, ver, i, sc])
+
+    # find all embedded JS
+    r = find_embedded_js(buf)
+    if r:
+        ret.extend(r)
+
+    return ret
+
+# -----------------------------------------------------------------------
+def accept_file(li, filename):
+    """
+    Check if the file is of supported format
+
+    @param li: a file-like object which can be used to access the input data
+    @param filename: name of the file, if it is an archive member name then the actual file doesn't exist
+    @return: 0 - no more supported formats
+             string "name" - format name to display in the chooser dialog
+             dictionary { 'format': "name", 'options': integer }
+               options: should be 1, possibly ORed with ACCEPT_FIRST (0x8000)
+               to indicate preferred format
+    """
+
+    # we support only one format per file
+    li.seek(0)
+    if li.read(5) != b'%PDF-':
+        return 0
+
+    buf = read_whole_file(li)
+    r = extract_pdf_shellcode(buf)
+    if not r:
+        return 0
+
+    return {'format': 'PDF with shellcode', 'processor': 'metapc'}
+
+# -----------------------------------------------------------------------
+def load_file(li, neflags, format):
+
+    """
+    Load the file into database
+
+    @param li: a file-like object which can be used to access the input data
+    @param neflags: options selected by the user, see loader.hpp
+    @return: 0-failure, 1-ok
+    """
+
+    # Select the PC processor module
+    idaapi.set_processor_type("metapc", ida_idp.SETPROC_LOADER)
+
+    buf = read_whole_file(li)
+    r = extract_pdf_shellcode(buf)
+    if not r:
+        return 0
+
+    # Load all shellcode into different segments
+    start = 0x10000
+    seg = idaapi.segment_t()
+    for id, ver, n, sc in r:
+        size = len(sc)
+        end  = start + size
+
+        # Create the segment
+        seg.start_ea = start
+        seg.end_ea   = end
+        seg.bitness  = 1 # 32-bit
+        idaapi.add_segm_ex(seg, "obj_%d_%d_%d" % (id, ver, n), "CODE", 0)
+
+        # Copy the bytes
+        idaapi.mem2base(sc, start, end)
+
+        # Mark for analysis
+        idc.AutoMark(start, idc.AU_CODE)
+
+        # Compute next loading address
+        start = ((end // 0x1000) + 1) * 0x1000
+
+    # Select the bochs debugger
+    idc.load_debugger("bochs", 0)
+
+    return 1
+
+# -----------------------------------------------------------------------
+def test1(sample = SAMPLE1):
+    # open the file
+    f = file(sample, 'rb')
+    buf = f.read()
+    f.close()
+
+    # find all JS stream references
+    r = find_js_ref_streams(buf)
+    if not r:
+        return
+
+    for id, ver in r:
+        obj = find_obj(buf, id, ver)
+
+        # extract the JS stream object
+        f = file('obj_%d_%d.bin' % (id, ver), 'wb')
+        f.write(obj)
+        f.close()
+
+        # decode the stream
+        stream = decompress_stream(obj)
+        f = file('dec_%d_%d.bin' % (id, ver), 'wb')
+        f.write(stream)
+        f.close()
+
+        # extract shell code
+        scs = extract_shellcode(stream)
+        i = 0
+        for sc in scs:
+            i += 1
+            f = file('sh_%d_%d_%d.bin' % (id, ver, i), 'wb')
+            f.write(sc)
+            f.close()
+
+# -----------------------------------------------------------------------
+def test2(sample = SAMPLE1):
+    # open the file
+    f = file(sample, 'rb')
+    buf = f.read()
+    f.close()
+
+    r = extract_pdf_shellcode(buf)
+    for id, ver, n, sc in r:
+        print("sc %d.%d[%d]=%d" % (id, ver, n, len(sc)))
+
+# -----------------------------------------------------------------------
+def test3(sample = SAMPLE2):
+    # open the file
+    f = file(sample, 'rb')
+    buf = f.read()
+    f.close()
+    t = find_embedded_js(buf)
+    print(t)
+
+# -----------------------------------------------------------------------
+def main():
+    test1(SAMPLE1)
+
+if not ida:
+    main()