new parser
This commit is contained in:
168
parse.py
168
parse.py
@@ -1,9 +1,10 @@
|
||||
import enum
|
||||
from pathlib import Path
|
||||
from datetime import datetime
|
||||
import re
|
||||
import json
|
||||
|
||||
entry_re = re.compile(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})', re.MULTILINE)
|
||||
entry_re = re.compile(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ?', re.MULTILINE)
|
||||
|
||||
curr_day = ''
|
||||
|
||||
@@ -143,25 +144,137 @@ entry_modules = {
|
||||
'notify': create_entry_module_parser('notify', parse_notify),
|
||||
}
|
||||
|
||||
from functools import reduce, partial
|
||||
|
||||
def split_keep(delims, string):
    """Split *string* on any single character found in *delims*, keeping each
    delimiter as its own element of the result (empty pieces are dropped)."""
    pieces = []
    pending = []
    for ch in string:
        if ch not in delims:
            pending.append(ch)
            continue
        # flush the accumulated run, then emit the delimiter itself
        if pending:
            pieces.append(''.join(pending))
            pending = []
        pieces.append(ch)
    if pending:
        pieces.append(''.join(pending))
    return pieces


assert split_keep(['@', '\n'], 'hello @world\n\nabout') == ['hello ', '@', 'world', '\n', '\n', 'about']
|
||||
|
||||
def merge_chars(chars, l):
    """Collapse adjacent occurrences of any character in *chars* into a single
    run-string (e.g. '\\n', '\\n' -> '\\n\\n'); other items pass through."""
    out = []
    for item in l:
        # extend the previous element only when it is already a pure run of
        # this same character
        extend = item in chars and bool(out) and all(run_ch == item for run_ch in out[-1])
        if extend:
            out[-1] += item
        else:
            out.append(item)
    return out


assert merge_chars('\n', ['\n', '\n', 'hello', 'world', '\n', '\n']) == ['\n\n', 'hello', 'world', '\n\n']
|
||||
|
||||
def attach_to_next(c, l):
    """Return a copy of *l* in which every standalone element equal to *c* is
    removed and prefixed onto the element that follows it.

    A trailing *c* with nothing after it is left in place (the original code
    silently stopped there as well).
    """
    res = l.copy()
    try:
        while True:
            i = res.index(c)          # ValueError once no standalone c remains
            res[i + 1] = c + res[i + 1]
            res.pop(i)
    except ValueError:
        pass                          # normal termination: no more matches
    except IndexError:
        pass                          # c was the last element; keep it as-is
    # NOTE: was a bare `except:`, which also swallowed KeyboardInterrupt /
    # SystemExit; narrowed to the two exceptions the loop can actually raise.

    return res


assert attach_to_next('@', ['aoeu', '@', 'oeu']) == ['aoeu', '@oeu']
|
||||
|
||||
def attach_to_prev_if(pred, l):
    """Concatenate each element onto the accumulated previous result whenever
    pred(prev, curr) holds.

    Note: *prev* passed to pred is the original neighbour from *l*, not the
    merged accumulator, and a falsy prev (e.g. '') never merges.
    """
    merged = []
    for idx, curr in enumerate(l):
        before = l[idx - 1] if idx > 0 else None
        if before and pred(before, curr):
            merged[-1] += curr
        else:
            merged.append(curr)
    return merged


assert attach_to_prev_if(lambda p, c: p[-1] != '\n' and c[0] == '@', ['aoeu', '@oeu']) == ['aoeu@oeu']
|
||||
|
||||
def merge_notes_block(l):
    """Fold a '@notes' run — tag, newline, source, newline, title — into one
    newline-joined element; every other element is copied through unchanged.

    NOTE(review): assumes a full 5-element run follows every '@notes'; a
    truncated run would raise IndexError — confirm upstream guarantees this.
    """
    folded = []
    pos = 0
    total = len(l)
    while pos < total:
        if l[pos] != '@notes':
            folded.append(l[pos])
            pos += 1
            continue
        # join tag, source and title, skipping the newline separators
        folded.append('\n'.join([l[pos], l[pos + 2], l[pos + 4]]))
        pos += 5
    return folded
|
||||
|
||||
def merge_wrapped_lines(l, width=80):
    """Undo hard line-wrapping: drop a lone '\\n' between two blocks when the
    previous block plus the following block's first word would have overflowed
    *width* columns — i.e. when the break looks forced by line length.

    The block at index 0 is rendered after a timestamp prefix, so its
    effective length is padded by that prefix's width when the separator is
    at index 1.

    width defaults to 80, matching the original hard-coded limit.
    """
    # width of '2020-02-02 02:02:02 ' — the timestamp prefix on the first line
    TIMESTAMP_PAD = len('2020-02-02 02:02:02 ')

    res = []
    i = 0
    while i < len(l):
        curr = l[i]
        prev_item = l[i - 1] if i > 0 else None
        # `next` shadowed the builtin in the original; renamed
        next_item = l[i + 1] if i + 1 < len(l) else None

        if prev_item and next_item and curr == '\n':
            prev_len = len(prev_item)
            if i == 1:
                prev_len += TIMESTAMP_PAD
            if not next_item[0].isspace():
                first_word = next_item.split()[0]
                if prev_len + len(first_word) >= width:
                    # forced wrap: glue the next block onto the previous one
                    res[-1] += ' ' + next_item
                    i += 2
                    continue

        res.append(curr)
        i += 1

    return res
|
||||
|
||||
def apply(f, x):
    """Apply *f* to *x* — the unary application combinator used with reduce."""
    result = f(x)
    return result
|
||||
|
||||
def flip(f):
    """Return a two-argument function like *f* but with its arguments swapped."""
    def flipped(a1, a2):
        return f(a2, a1)
    return flipped
|
||||
|
||||
def parse_entry(entry):
|
||||
result = {}
|
||||
|
||||
def split_into_blocks(text):
|
||||
result = []
|
||||
|
||||
for block in re.split(r'\n{2,}', text):
|
||||
block = block.strip()
|
||||
if not block:
|
||||
continue
|
||||
|
||||
for i, module in enumerate(block.replace(' @', '\n@').split('\n@')):
|
||||
#module = module.strip().replace('\n', ' ')
|
||||
if i == 0:
|
||||
result.append(module)
|
||||
else:
|
||||
result.append('@'+module)
|
||||
|
||||
return result
|
||||
r = reduce(flip(apply), [
|
||||
# split the text into sections by newline and tag symbol, keeping the separators
|
||||
partial(split_keep, ('\n', '@')),
|
||||
# merge sequential newlines together into a single whitespace block
|
||||
partial(merge_chars, '\n'),
|
||||
# attach escaped tag symbols
|
||||
partial(attach_to_prev_if, lambda p, c: c == '@' and p[-1] == '\\'),
|
||||
# attach tag symbols
|
||||
partial(attach_to_next, '@'),
|
||||
# ???
|
||||
partial(attach_to_prev_if, lambda p, c: p[-1] != '\n' and not (p[0] == '@' and p[-1] == ' ') and c[0] == '@'),
|
||||
# yes
|
||||
merge_notes_block,
|
||||
# strip all non-whitespace blocks
|
||||
partial(map, lambda s: s if s.isspace() else s.rstrip()), list,
|
||||
# yes
|
||||
merge_wrapped_lines,
|
||||
# remove trailing whitespace block
|
||||
lambda b: b if b and not all(c == '\n' for c in b[-1]) else b[:-1],
|
||||
], text)
|
||||
return r
|
||||
|
||||
timestamp, content = entry
|
||||
|
||||
@@ -169,7 +282,7 @@ def parse_entry(entry):
|
||||
result['blocks'] = []
|
||||
|
||||
for b in split_into_blocks(content):
|
||||
if b[0] == '@':
|
||||
if b.startswith('@'):
|
||||
tag = b.split()[0][1:]
|
||||
result['blocks'].append(entry_modules[tag](b))
|
||||
else:
|
||||
@@ -177,20 +290,23 @@ def parse_entry(entry):
|
||||
|
||||
return result
|
||||
|
||||
result = {}
|
||||
|
||||
for fpath in list(sorted((Path.home() / 'workspace' / 'journal').glob('*.md'))):
|
||||
curr_day = fpath.stem
|
||||
|
||||
header, *tmp = entry_re.split(fpath.read_text())
|
||||
def parse_page(text):
|
||||
header, *tmp = entry_re.split(text)
|
||||
entries = list(zip(tmp[::2], tmp[1::2]))
|
||||
|
||||
result[fpath.stem] = {
|
||||
return {
|
||||
'header': parse_header(header),
|
||||
'entries': [parse_entry(e) for e in entries],
|
||||
}
|
||||
|
||||
script_path = Path(__file__).parent
|
||||
if __name__ == '__main__':
|
||||
result = {}
|
||||
|
||||
with open(script_path / 'journal.json', 'w') as fp:
|
||||
for fpath in list(sorted((Path.home() / 'workspace' / 'journal').glob('*.md'))):
|
||||
day = parse_page(fpath.read_text())
|
||||
result[fpath.stem] = day
|
||||
|
||||
script_path = Path(__file__).parent
|
||||
|
||||
with open(script_path / 'journal.json', 'w') as fp:
|
||||
json.dump(result, fp, indent=4, ensure_ascii=False)
|
||||
|
||||
Reference in New Issue
Block a user