# hugo/static/wasm/pico/compress.py

#!/usr/bin/env python2

import argparse
import heapq
import collections
import struct
import sys

# Tunables; main() overwrites the first three from the command line.
# Shortest pattern length (in runs) that lz77() tries to match.
min_match = 10
# Longest run main() accumulates before it starts a new one.
max_run = 15
# lz77() only keeps a match whose backwards distance is smaller than this.
max_dist = 512
# When True, lz77() builds a separate huffman table for match lengths;
# otherwise lengths are counted in the same table as offsets and literals.
use_len_huffman = False

def groupinto(l, n):
  """Split sequence `l` into consecutive tuples of `n` items.

  The last tuple is shorter when len(l) is not a multiple of `n`.
  """
  return [tuple(l[i:i+n]) for i in range(0, len(l), n)]

def combine(l, shift):
  """Pack the items of `l` into one integer, first item in the lowest bits.

  Each item is OR-ed in after shifting the accumulated value left by
  `shift` bits. Written as a plain loop so it does not depend on the
  `reduce` builtin, which only exists on Python 2.
  """
  value = 0
  for item in l[::-1]:
    value = (value << shift) | item
  return value

def wasm(b):
  """Format byte value `b` as a backslash followed by two hex digits."""
  return '\\%02x' % (b)

def padz(l, s):
  """Left-pad string `s` with '0' up to length `l` (never truncates)."""
  return '0' * (l - len(s)) + s

def padbin(l, n):
  """Return `n` in binary, zero-padded on the left to at least `l` digits."""
  return padz(l, bin(n)[2:])

def tobits(l, n):
  """Return the bits of `n` as a list of ints, least significant bit first.

  The list is zero-padded at the end to `l` entries. If `n` needs more than
  `l` bits nothing is dropped: the result is simply longer than `l`.
  """
  # Build a real list so this also works where map() returns an iterator.
  bits = [int(digit) for digit in bin(n)[2:]][::-1]
  return bits + [0] * (l - len(bits))

def tobytes(data):
  """Pack a list of bits into integers, 8 bits each, first bit lowest."""
  return [combine(x, 1) for x in groupinto(data, 8)]

def read_file(f):
  """Read an ASCII PPM image and return its pixels as palette indices.

  `f` is the path of the file to read. The return value is a list with one
  integer per pixel: the index of that pixel's color in the image's palette,
  where the palette is the set of distinct colors sorted in descending order.

  Raises AssertionError if the image uses more than four distinct colors.
  """
  # Use a context manager so the file handle is closed promptly.
  with open(f) as ppm:
    lines = ppm.readlines()

  # Not gonna be super robust here...
  # Assume the first four lines are:
  #    P3
  #    # Comment from GIMP
  #    <width> <height>
  #    <max color>
  #    <actual data, one byte per line>
  # The size is not used below; unpacking it still rejects a malformed header.
  width, height = map(int, lines[2].split())

  # Materialize a list: groupinto() needs len() and slicing, which the lazy
  # map() object of Python 3 does not support.
  values = [int(line) for line in lines[4:]]

  # Convert colors into tuples of (R, G, B)
  pixels = groupinto(values, 3)

  # Extract colors into a palette, sorted so white is first.
  colors = sorted(set(pixels), reverse=True)
  assert len(colors) <= 4, 'expected at most 4 colors, got %d' % len(colors)

  # Map pixel data to palette index
  index_of = dict((color, i) for i, color in enumerate(colors))
  return [index_of[pixel] for pixel in pixels]


class Node(object):
  """One node of a huffman tree.

  make_huffman() creates leaves with only `key` set, and internal nodes with
  key=None and both children `l` and `r` set.
  """

  def __init__(self, key, l=None, r=None):
    self.key, self.l, self.r = key, l, r

def make_huffman(dic):
  """Build a canonical huffman code for the symbols in `dic`.

  `dic` maps each symbol to its occurrence count. Symbols must not be None,
  because None marks internal tree nodes. Returns a dict mapping each symbol
  to its code as a string of '0' and '1' characters; an empty `dic` gives an
  empty dict.
  """
  if not dic:
    return {}

  # Heap entries are (count, sequence number, node). The sequence number
  # breaks ties between equal counts so Node objects are never compared:
  # that comparison is address-dependent on Python 2 and a TypeError on
  # Python 3.
  heap = []
  for order, (key, count) in enumerate(dic.items()):
    heapq.heappush(heap, (count, order, Node(key)))
  next_order = len(heap)

  # Repeatedly merge the two least frequent subtrees.
  while len(heap) >= 2:
    count1, _, node1 = heapq.heappop(heap)
    count2, _, node2 = heapq.heappop(heap)
    merged = Node(None, node1, node2)
    heapq.heappush(heap, (count1 + count2, next_order, merged))
    next_order += 1

  # Decode huffman into encoding dict
  result = {}
  def Build(node, cur):
    if node.key is not None:
      result[node.key] = cur
    else:
      assert(node.l is not None and node.r is not None)
      Build(node.l, cur + '0')
      Build(node.r, cur + '1')

  Build(heap[0][2], '')

  # Canonicalize huffman encoding: only the code lengths from the tree are
  # kept, then codes are handed out in increasing order, shortest first.
  max_len = max(len(code) for code in result.values()) + 1
  lens = [[] for _ in range(max_len)]
  for key, code in result.items():
    lens[len(code)].append(key)

  result = {}
  code = 0
  for l, keys in enumerate(lens):
    for key in keys:
      result[key] = padbin(l, code)
      code += 1
    # Moving to the next code length appends one bit to the running code.
    code <<= 1

  return result

def align_byte(n):
  """Round `n` up to the next multiple of 8."""
  # Dropping the low three bits of n + 7 rounds up to a whole byte.
  whole_bytes = (n + 7) >> 3
  return whole_bytes << 3

def huffman_cost(huff):
  """Return a size figure for the huffman table `huff`.

  `huff` maps integer symbols to code strings, as returned by make_huffman().
  An empty table costs 0.

  NOTE(review): presumably this is the number of bits needed to store the
  table; the `+ 7` and `* 2` factors are kept exactly as they were and their
  rationale is not visible here.
  """
  if not huff:
    # make_huffman() returns {} for empty input; max() below would raise.
    return 0

  # lens[n] is the number of codes that are n bits long.
  max_len = max(len(code) for code in huff.values()) + 1
  lens = [0] * max_len
  for code in huff.values():
    lens[len(code)] += 1

  # Running totals of the per-length counts.
  sums = []
  running = 0
  for l in lens:
    running += l
    sums.append(running)

  # int.bit_length() equals this file's log2() for non-negative integers.
  keybits = max(abs(key) for key in huff).bit_length() + 1
  sumbits = max(sums).bit_length() + 7

  # keybits = align_byte(keybits)
  # sumbits = align_byte(sumbits)
  return len(huff) * keybits + len(sums) * (sumbits * 2)


def _lz77_find_matches(runs):
  """Return a copy of `runs` where matched positions hold (dist, length).

  A position is replaced when the `length` runs starting there also occur
  earlier, less than max_dist positions back; `dist` is the (negative)
  offset to the most recent earlier occurrence. Longer lengths are tried
  last, so the longest acceptable match at a position wins.
  """
  last_seen = {}
  result = runs[:]
  for l in range(min_match, len(runs)):
    found = False
    for i in range(len(runs) - l):
      pattern = tuple(runs[i:i+l])
      if pattern in last_seen:
        dist = last_seen[pattern] - i
        if dist > -max_dist:
          found = True
          result[i] = (dist, l)
      last_seen[pattern] = i

    # No match of this length means there is none of a greater length.
    if not found:
      break
  return result


def _lz77_choose_matches(runs, result):
  """Pick the literal/match sequence that covers the most runs with matches.

  Returns a list whose items are either a literal run (int) or a
  (dist, length) tuple, and which covers every position of `runs` once.
  """
  n = len(runs)
  # score[p] is the best number of match-covered runs for the first p runs;
  # back[p] is (previous position, item emitted to get from there to p).
  score = [-1] * (n + 1)
  back = [None] * (n + 1)
  score[0] = 0
  for i in range(n):
    value = result[i]
    if isinstance(value, tuple):
      l = value[1]
      if score[i] + l > score[i + l]:
        score[i + l] = score[i] + l
        back[i + l] = (i, value)
    # A position can always be coded as a literal, even when a match starts
    # there. Without this step, positions inside a skipped match had no
    # predecessor and overlapping matches could both reach the output.
    if score[i] > score[i + 1]:
      score[i + 1] = score[i]
      back[i + 1] = (i, runs[i])

  # Walk the back pointers from the end, then restore forward order.
  final = []
  pos = n
  while pos > 0:
    pos, item = back[pos]
    final.append(item)
  final.reverse()
  return final


def lz77(runs):
  """Compress `runs` with an LZ77-style match search and print the result.

  `runs` is a list of non-negative integers. Nothing is returned; the
  function prints the chosen literal/match sequence, the huffman tables
  built from it (reported only), size statistics, the fixed-width encoding
  as text, and finally the encoded bytes as quoted strings of `\\xx` escapes.

  The emitted stream alternates between a literal count (fixed_offset bits),
  that many literals (fixed_lit bits each) and one match, written as the
  negated distance (fixed_offset bits) followed by the length (fixed_len
  bits). Each field is written least significant bit first.

  Raises ValueError when `runs` contains no usable repeated pattern, since
  the field widths are derived from the matches.

  NOTE(review): a literal count that needs more than fixed_offset bits is
  not detected; tobits() then emits extra bits for that field.
  """
  result = _lz77_find_matches(runs)
  new_final = _lz77_choose_matches(runs, result)

  print('original (%d items) %s\n' % (len(runs), runs))
  print('compressed (%d items) %s\n' % (len(new_final), new_final))

  offsets = collections.defaultdict(int)
  lens = collections.defaultdict(int)

  offset_set = set()
  len_set = set()
  lit_set = set()

  for t in new_final:
    if isinstance(t, tuple):
      offsets[t[0]] += 1
      offset_set.add(t[0])
      len_set.add(t[1])
      if use_len_huffman:
        lens[t[1]] += 1
      else:
        offsets[t[1]] += 1
    else:
      offsets[t] += 1
      lit_set.add(t)

  if not offset_set:
    # Previously this surfaced as "max() arg is an empty sequence".
    raise ValueError(
        'no repeated pattern of at least %d runs found; nothing to encode'
        % min_match)

  # Calculate fixed-size cost
  fixed_offset = log2(max(-x for x in offset_set))
  fixed_len = log2(max(len_set))
  fixed_lit = log2(max(lit_set))
  print('offset=%d len=%d lit=%d\n' % (fixed_offset, fixed_len, fixed_lit))

  # Create huffman trees. They are only reported; the stream below uses the
  # fixed-width fields.
  offset_huffman = make_huffman(offsets)
  print('huffman (%d items) %s\n' % (len(offset_huffman), offset_huffman))

  if use_len_huffman:
    len_huffman = make_huffman(lens)
    print('len huffman (%d items) %s\n' % (len(len_huffman), len_huffman))

  data = []
  pieces = []
  total = 0
  was_tuple = True
  for i, t in enumerate(new_final):
    if was_tuple:
      # At the start and after every match, announce how many literals
      # follow before the next match (possibly zero).
      total += fixed_offset
      j = i
      while j < len(new_final) and not isinstance(new_final[j], tuple):
        j += 1
      count = j - i
      pieces.append(padbin(fixed_offset, count) + ' ')
      data.extend(tobits(fixed_offset, count))

    if isinstance(t, tuple):
      was_tuple = True
      total += fixed_offset + fixed_len
      pieces.append('%s|%s ' % (padbin(fixed_offset, -t[0]),
                                padbin(fixed_len, t[1])))
      data.extend(tobits(fixed_offset, -t[0]))
      data.extend(tobits(fixed_len, t[1]))
    else:
      was_tuple = False
      total += fixed_lit
      pieces.append(padbin(fixed_lit, t) + ' ')
      data.extend(tobits(fixed_lit, t))
  bits = ''.join(pieces)

  # Floor division keeps these whole-byte figures identical on Python 2 and 3.
  print('(1bits/pixel) => %d bytes' % (sum(runs) // 8))
  print('  (4bits/run) => %d bytes' % (len(runs) // 2))

  print('savings = %d' % (total // 8 - sum(runs) // 8))
  print(' (no huffman) => %d bytes (+%d bits)' % (total // 8, total % 8))
  print('savings = %d\n' % (total // 8 - sum(runs) // 8))

  print('encoded (no huffman): %s\n' % bits)

  # Emit the packed bytes, 24 per quoted line.
  data = tobytes(data)
  data = ''.join(('"%s"\n' % ''.join(wasm(j) for j in data[i:i+24]))
                  for i in range(0, len(data), 24))
  print(data)


def log2(n):
  """Return the number of binary digits of `n`, or 0 when `n` is not positive.

  Despite the name this is a bit count, not a logarithm: log2(1) is 1 and
  log2(8) is 4.
  """
  if n > 0:
    return n.bit_length()
  return 0


def lzw(runs):
  """Run an LZW pass over `runs` and print the resulting codes and sizes.

  `runs` is a list of non-negative integers. Nothing is returned. The
  function prints every multi-symbol dictionary entry as it is used, then
  one summary line for the input (at 4 bits per run) and one for the output.
  """
  # Seed the dictionary with every single symbol. At least 0..15 as before,
  # and more when the input holds larger values, so the first lookup of a
  # symbol cannot fail.
  alphabet = max([15] + list(runs)) + 1
  dic = dict(((x,), x) for x in range(alphabet))
  result = []
  seq = []
  total = 0
  for run in runs:
    new_seq = seq + [run]
    tup_new_seq = tuple(new_seq)
    if tup_new_seq in dic:
      seq = new_seq
      continue

    code = dic[tuple(seq)]
    if code >= alphabet:
      print('%d => %s' % (code, tuple(seq)))
    result.append(code)
    dic[tup_new_seq] = len(dic)
    # Bits are charged per emitted code, at the width of the current
    # dictionary size (same value as this file's log2(len(dic))).
    total += len(dic).bit_length()
    seq = [run]

  # Flush the sequence still pending when the input ends.
  if seq:
    code = dic[tuple(seq)]
    if code >= alphabet:
      print('%d => %s' % (code, tuple(seq)))
    result.append(code)
    total += len(dic).bit_length()

  # Each line is formatted as one string so Python 2 and 3 print the same.
  print('%d bits = %d bytes %d %s'
        % (4 * len(runs), len(runs) // 2, len(runs), runs))
  print('%d bits = %d bytes %d %s'
        % (total, total // 8, len(result), result))


def main(args):
  """Command-line entry point: run-length encode the given images, then lz77.

  `args` is the argument list without the program name. The pixel data of
  all files is concatenated, converted to run lengths capped at --max-run,
  and passed to lz77(). Finally the offset (in pixels) at which each file
  starts in the concatenated data is printed. Returns None.
  """
  global min_match, max_run, max_dist

  parser = argparse.ArgumentParser()
  parser.add_argument('-m', '--min-match', type=int, default=10,
                      help='shortest pattern (in runs) to match')
  parser.add_argument('-M', '--max-run', type=int, default=15,
                      help='longest run length before a run is split')
  parser.add_argument('-d', '--max-dist', type=int, default=512,
                      help='matches must be closer than this many runs')
  # At least one file is required: with no input lz77() has nothing to
  # compress and the run used to end in a traceback.
  parser.add_argument('files', nargs='+')
  args = parser.parse_args(args)

  min_match = args.min_match
  max_run = args.max_run
  max_dist = args.max_dist

  offsets = []
  data = []
  for f in args.files:
    offsets.append((f, len(data)))
    data.extend(read_file(f))

  if False:
    # Disabled debug hook: dump the packed pixel data to a file.
    with open('combined.dat', 'wb') as out:
      out.write(bytearray(tobytes(data)))

  # Find run lengths
  last = 0
  runs = [0]
  for bit in data:
    if bit == last and runs[-1] < max_run:
      runs[-1] += 1
    else:
      if bit == last:
        # The current run is full but the value has not changed: insert a
        # zero-length run before starting the next one.
        runs.append(0)
      last = bit
      runs.append(1)

  lz77(runs)
  print('\n'.join(';; %s => %d' % t for t in offsets))


# When run as a script, hand the command-line arguments (without the program
# name) to main() and use its return value as the process exit status.
if __name__ == '__main__':
  sys.exit(main(sys.argv[1:]))