# journal/parse.py

import enum
from pathlib import Path
from datetime import datetime
import re
import json

# Matches a 'YYYY-MM-DD HH:MM:SS' timestamp at the start of any line, together
# with one optional trailing space. The timestamp itself is a capturing group,
# so re.split() with this pattern keeps each timestamp in its result list.
entry_re = re.compile(r'^(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}) ?', re.MULTILINE)

# NOTE(review): not referenced anywhere else in the visible source.
curr_day = ''


def parse_godword(godword):
    """Return the whitespace-separated words of the godword module body."""
    words = godword.split()
    return words

def parse_habits(habits):
    """Map each habit name to whether it is marked as done.

    Every line is split once on whitespace into a marker and a name; the
    habit counts as done when the marker's second character is ``x``.
    """
    split_lines = (line.split(maxsplit=1) for line in habits.splitlines())
    return {label.strip(): marker[1] == 'x' for marker, label in split_lines}

def parse_notifications(notifications):
    """Turn each notification line into a ``source``/``message`` dict.

    The first two whitespace-separated tokens form the source (with any
    leading/trailing square brackets removed); the remaining tokens form
    the message.
    """
    def parse_line(line):
        tokens = line.split()
        source_tokens, message_tokens = tokens[:2], tokens[2:]
        return {
            'source': ' '.join(source_tokens).strip('[]'),
            'message': ' '.join(message_tokens),
        }

    return [parse_line(line) for line in notifications.splitlines()]

def parse_tasks(tasks):
    """Map each task name to whether it is marked as done.

    Every line is split once on whitespace into a marker and a name; the
    task counts as done when the marker's second character is ``x``.
    """
    def pairs():
        for line in tasks.splitlines():
            marker, title = line.split(maxsplit=1)
            yield title.strip(), marker[1] == 'x'

    return dict(pairs())

# Dispatch table: lower-cased header module name -> parser for that module's body.
header_modules = dict(
    godword=parse_godword,
    habits=parse_habits,
    notifications=parse_notifications,
    tasks=parse_tasks,
)

def parse_header(header):
    """Parse the part of a page that precedes its first entry.

    The header is split into blocks on runs of blank lines. The first block
    is treated as the page title and skipped; every following block is a
    module whose first line is its name (case-insensitive, optional trailing
    ``:``) and whose remaining lines are handed to the matching parser in
    ``header_modules``.

    Returns a dict mapping module name to the parsed module value. An empty
    header, or one that only contains a title, yields an empty dict.

    Raises KeyError when a module name has no parser in ``header_modules``.
    """
    result = {}

    def split_into_blocks(text):
        return [b.strip() for b in re.split(r'\n{2,}', text) if b.strip() != '']

    # Slicing (rather than unpacking ``title, *modules``) tolerates a header
    # with no blocks at all, e.g. a page that starts directly with an entry.
    modules = split_into_blocks(header)[1:]

    for module in modules:
        # partition() yields an empty body for a module that consists of its
        # heading only, instead of failing to unpack.
        name, _, value = module.partition('\n')
        name = name.lower().removesuffix(':')
        result[name] = header_modules[name](value)

    return result

def parse_timestamp(timestamp):
    """Convert a ``YYYY-MM-DD HH:MM:SS`` string into a naive ``datetime``.

    Raises ValueError when the string does not match that layout.
    """
    layout = '%Y-%m-%d %H:%M:%S'
    return datetime.strptime(timestamp, layout)

def parse_post(block):
    """Parse an ``@post`` block into its optional timestamp and content.

    After the ``@post `` prefix is removed, the first 19 characters are tried
    as a ``YYYY-MM-DD HH:MM:SS`` timestamp. When they parse, they are consumed
    and stored as integer epoch seconds under ``timestamp``; otherwise the
    whole remainder is content. Non-empty stripped content is stored under
    ``content``.

    Returns a dict holding whichever of ``content`` / ``timestamp`` are present.
    """
    if block == '@post':
        # parse_entry right-strips blocks, so a post without a body arrives as
        # the bare tag; it must not be reported as its own content.
        block = ''
    else:
        block = block.removeprefix('@post ')

    try:
        timestamp = int(parse_timestamp(block[:19]).timestamp())
    except (ValueError, OverflowError, OSError):
        # ValueError: the prefix is not a timestamp. OverflowError/OSError:
        # the date parsed but cannot be converted to epoch seconds on this
        # platform. Either way the text is kept as content.
        timestamp = None
    else:
        block = block[19:]

    content = block.strip()

    result = {}
    if content:
        result['content'] = content
    # Compare against None so a timestamp of exactly 0 is not dropped.
    if timestamp is not None:
        result['timestamp'] = timestamp
    return result

def parse_notes(block):
    """Parse a three-line ``@notes`` block: tag line, source line, title line.

    Raises ValueError when the block does not have exactly three lines.
    """
    _tag, source, title = block.splitlines()
    parsed = {}
    parsed['source'] = source
    parsed['title'] = title
    return parsed

def parse_diet(block):
    """Parse a ``@diet <amount>g <food>`` block.

    The amount is returned as an int with an optional ``g`` suffix removed.
    Everything after the amount is the food name, so names made of several
    words (``@diet 150g chicken breast``) are accepted.

    Raises ValueError when the amount or the food is missing, or when the
    amount is not an integer.
    """
    # maxsplit=2 keeps a multi-word food name together in the last field.
    _tag, amount, food = block.split(maxsplit=2)
    amount = int(amount.removesuffix('g'))
    # With maxsplit the last field keeps its trailing whitespace; drop it.
    return {'amount': amount, 'food': food.strip()}

def parse_timer(block):
    """Parse a timer block such as ``@start``, ``@stop`` or ``@done``.

    Accepted argument shapes after the tag:

    * nothing                      -> ``{}``
    * ``<name>``                   -> ``{'name': ...}``
    * ``<date> <time>``            -> ``{'timestamp': ...}``
    * ``<name> <date> <time>``     -> both keys

    The timestamp is returned as integer epoch seconds.

    Raises ValueError when the tokens expected to form the timestamp do not
    match ``YYYY-MM-DD HH:MM:SS``.
    """
    _tag, *rest = block.split()

    name = None
    timestamp = None
    # A timestamp always occupies two tokens (date and time). A single token,
    # or a token in front of the timestamp, is therefore the timer's name.
    if len(rest) == 1 or len(rest) > 2:
        name, *rest = rest
    if len(rest) > 1:
        timestamp = int(parse_timestamp(' '.join(rest)).timestamp())

    result = {}
    if name is not None:
        result['name'] = name
    # Compare against None so a timestamp of exactly 0 is not dropped.
    if timestamp is not None:
        result['timestamp'] = timestamp
    return result

def parse_exercise(block):
    """Parse an ``@exercise`` block.

    Only the ``walk`` kind is understood:
    ``@exercise walk <N>min <D>km <S>steps`` becomes a dict with ``kind``,
    integer ``minutes``, float ``distance`` and integer ``steps``.
    Any other kind, or a block with no kind at all, yields
    ``{'kind': 'INVALID'}``.

    Raises ValueError when a ``walk`` block does not have exactly three
    measurements or one of them is not numeric.
    """
    _tag, *parts = block.split()

    # Guard against a bare tag: there is no kind to look at.
    if parts and parts[0] == 'walk':
        kind, minutes, distance, steps = parts
        return {
            'kind': kind,
            'minutes': int(minutes.removesuffix('min')),
            'distance': float(distance.removesuffix('km')),
            'steps': int(steps.removesuffix('steps')),
        }

    return {'kind': 'INVALID'}

def parse_notify(block):
    """Parse ``@notify <day> <message...>`` into a ``day``/``message`` dict.

    Raises ValueError when the day token is missing.
    """
    tokens = block.split()
    _tag, day, *words = tokens
    message = ' '.join(words)
    return {'day': day.strip(), 'message': message}

def create_entry_module_parser(name, handler=None):
    """Build the parser for entry blocks tagged ``@<name>``.

    The returned callable takes the block text and returns ``{'type': name}``
    merged with the dict produced by ``handler`` (handler keys win on
    conflict). Without a handler, the block text minus its ``@<name> `` prefix
    is stored under ``value``.
    """
    def default_handler(block):
        return {'value': block.removeprefix(f'@{name} ')}

    parse_body = handler or default_handler

    def parser(block):
        return {'type': name} | parse_body(block)

    return parser

# Dispatch table: entry tag (without the leading '@') -> parser for that block.
# A handler of None selects create_entry_module_parser's default handler.
entry_modules = {
    tag: create_entry_module_parser(tag, handler)
    for tag, handler in (
        ('hide', lambda _: {}),
        ('post', parse_post),
        ('info', None),
        ('notes', parse_notes),
        ('behavior', None),
        ('diet', parse_diet),
        ('task', None),
        ('start', parse_timer),
        ('stop', parse_timer),
        ('done', parse_timer),
        ('exercise', parse_exercise),
        ('notify', parse_notify),
    )
}

from functools import reduce, partial

def split_keep(delims, string):
    """Split ``string`` on single-character delimiters, keeping the delimiters.

    Each delimiter becomes its own element of the result; the non-empty runs
    of text between delimiters are the other elements.
    """
    pieces = []
    start = 0
    for pos, ch in enumerate(string):
        if ch not in delims:
            continue
        # Flush the text collected since the previous delimiter, if any.
        if pos > start:
            pieces.append(string[start:pos])
        pieces.append(ch)
        start = pos + 1
    if start < len(string):
        pieces.append(string[start:])
    return pieces

assert split_keep(['@', '\n'], 'hello @world\n\nabout') == ['hello ', '@', 'world', '\n', '\n', 'about']

def merge_chars(chars, l):
    """Collapse consecutive elements that are the same mergeable character.

    An element found in ``chars`` is appended onto the previous output element
    when that element consists solely of the same character; otherwise it
    starts a new output element.
    """
    merged = []
    for item in l:
        extends_run = (
            item in chars
            and merged
            and not any(ch != item for ch in merged[-1])
        )
        if extends_run:
            merged[-1] += item
        else:
            merged.append(item)
    return merged

assert merge_chars('\n', ['\n', '\n', 'hello', 'world', '\n', '\n']) == ['\n\n', 'hello', 'world', '\n\n']

def attach_to_next(c, l):
    """Glue every standalone ``c`` element onto the element that follows it.

    Returns a new list; ``l`` is not modified. An occurrence of ``c`` that is
    the last element has nothing to attach to and is kept as is.
    """
    res = []
    remaining = iter(l)
    for item in remaining:
        if item != c:
            res.append(item)
            continue
        # Consume the following element so it is not visited a second time.
        # None is a safe sentinel here because the elements are strings.
        following = next(remaining, None)
        res.append(item if following is None else c + following)
    return res

assert attach_to_next('@', ['aoeu', '@', 'oeu']) == ['aoeu', '@oeu']

def attach_to_prev_if(pred, l):
    """Append an element onto the previous output element when ``pred`` agrees.

    ``pred(previous, current)`` is called with the element that precedes
    ``current`` in the input list (not the merged output). It is not called
    for the first element or when the preceding element is empty.
    """
    merged = []
    previous = None
    for current in l:
        if previous and pred(previous, current):
            merged[-1] += current
        else:
            merged.append(current)
        previous = current
    return merged

assert attach_to_prev_if(lambda p, c: p[-1] != '\n' and c[0] == '@', ['aoeu', '@oeu']) == ['aoeu@oeu']

def merge_notes_block(l):
    """Fold each ``@notes`` element and its two following lines into one block.

    A ``@notes`` element is expected to be followed by: separator, source,
    separator, title. Those five elements are replaced by the single string
    ``'@notes\\n<source>\\n<title>'``. All other elements pass through.

    Raises IndexError when a ``@notes`` element has fewer than four elements
    after it.
    """
    merged = []
    pos = 0
    total = len(l)
    while pos < total:
        item = l[pos]
        if item != '@notes':
            merged.append(item)
            pos += 1
            continue
        # Offsets +1 and +3 hold the separators and are skipped.
        source, title = l[pos + 2], l[pos + 4]
        merged.append('\n'.join((item, source, title)))
        pos += 5
    return merged

def merge_wrapped_lines(l, *, width=80):
    """Re-join lines that look like they were hard-wrapped at ``width`` columns.

    ``l`` is a list of text blocks with single-newline blocks (``'\\n'``)
    between them. A newline is treated as a wrap point, and the following
    block is appended to the previous output block with a single space, when
    the following block does not start with whitespace and its first word
    would not have fit on the preceding line (preceding line length plus
    word length >= ``width``).

    Args:
        l: the block list.
        width: wrap column used for the "would not have fit" test.

    Returns a new list; ``l`` is not modified.
    """
    res = []

    i = 0
    while i < len(l):
        curr = l[i]
        prev = l[i-1] if i > 0 else None
        # Named ``following`` so the builtin ``next`` is not shadowed.
        following = l[i+1] if i+1 < len(l) else None

        if prev and following and curr == '\n':
            len_prev = len(prev)
            if i == 1:
                # ``prev`` is the very first block here; count the width of a
                # timestamp and a space as well. Presumably this accounts for
                # the timestamp preceding the first line in the journal file.
                len_prev += len('2020-02-02 02:02:02 ')

            if not following[0].isspace():
                next_word = following.split()[0]
                if len_prev + len(next_word) >= width:
                    res[-1] += ' ' + following
                    # Skip both the newline and the block that was just merged.
                    i += 2
                    continue

        res.append(curr)
        i += 1

    return res

def apply(f, x):
    """Call ``f`` with the single argument ``x`` and return its result."""
    outcome = f(x)
    return outcome

def flip(f):
    """Return a two-argument wrapper that calls ``f`` with its arguments swapped."""
    def flipped(a1, a2):
        return f(a2, a1)

    return flipped

def parse_entry(entry):
    """Parse one ``(timestamp, content)`` pair into an entry dict.

    The result holds ``timestamp`` (integer epoch seconds) and ``blocks``: a
    list in which plain text blocks stay strings and blocks starting with
    ``@`` are replaced by the dict produced by the matching parser in
    ``entry_modules``.

    Raises KeyError when a block's tag has no parser in ``entry_modules``.
    """
    def split_into_blocks(text):
        # Split on newlines and tag symbols, keeping the separators.
        blocks = split_keep(('\n', '@'), text)
        # Merge consecutive newlines into a single whitespace block.
        blocks = merge_chars('\n', blocks)
        # Glue a tag symbol back onto text that ends in a backslash.
        blocks = attach_to_prev_if(lambda p, c: c == '@' and p[-1] == '\\', blocks)
        # Glue every remaining tag symbol onto the text that follows it.
        blocks = attach_to_next('@', blocks)
        # Glue an '@...' block onto the block before it unless that block ends
        # in a newline or is itself an '@...' block ending in a space.
        blocks = attach_to_prev_if(
            lambda p, c: p[-1] != '\n' and not (p[0] == '@' and p[-1] == ' ') and c[0] == '@',
            blocks,
        )
        # Fold '@notes' together with its source and title lines.
        blocks = merge_notes_block(blocks)
        # Right-strip every block that is not pure whitespace.
        blocks = [s if s.isspace() else s.rstrip() for s in blocks]
        # Re-join lines that were hard-wrapped.
        blocks = merge_wrapped_lines(blocks)
        # Drop a trailing newline-only block.
        if blocks and not all(c == '\n' for c in blocks[-1]):
            return blocks
        return blocks[:-1]

    raw_timestamp, content = entry
    timestamp = int(parse_timestamp(raw_timestamp.strip()).timestamp())

    parsed_blocks = []
    for b in split_into_blocks(content):
        if not b.startswith('@'):
            parsed_blocks.append(b)
            continue
        # The tag is the first whitespace-delimited token without its '@'.
        tag = b.split()[0][1:]
        parsed_blocks.append(entry_modules[tag](b))

    return {'timestamp': timestamp, 'blocks': parsed_blocks}

def parse_page(text):
    """Parse a whole journal page into its header and list of entries.

    The text is split on entry timestamps: everything before the first
    timestamp is the header, and each timestamp is paired with the text that
    follows it to form one entry.
    """
    header, *pieces = entry_re.split(text)
    # After the header, pieces alternate: timestamp, body, timestamp, body, ...
    timestamps = pieces[::2]
    bodies = pieces[1::2]

    page = {'header': parse_header(header)}
    page['entries'] = [parse_entry(pair) for pair in zip(timestamps, bodies)]
    return page

if __name__ == '__main__':
    # Parse every ~/workspace/journal/*.md page and dump the result, keyed by
    # file stem, to journal.json next to this script.
    result = {}

    # sorted() already returns a list; pages are processed in path order.
    for fpath in sorted((Path.home() / 'workspace' / 'journal').glob('*.md')):
        # NOTE(review): assumes the journal files are UTF-8 - confirm.
        day = parse_page(fpath.read_text(encoding='utf-8'))
        result[fpath.stem] = day

    script_path = Path(__file__).parent

    # ensure_ascii=False writes non-ASCII characters verbatim, so the output
    # encoding must be pinned instead of depending on the platform locale.
    with open(script_path / 'journal.json', 'w', encoding='utf-8') as fp:
        json.dump(result, fp, indent=4, ensure_ascii=False)