#!/usr/bin/python -u
#
# Created by Bjarni R. Einarsson, placed in the public domain. Go wild!
#
"""Compact a dirty.db file by keeping only the latest line for each key.

The input is read line by line; every line must be a JSON object with a
'key' member.  For each key only the last line seen is kept, and the
surviving lines are written to '<input>.new'.  The input file itself is
never modified.

Usage: script /path/to/dirty.db
"""
import json
import os
import sys


def print_usage(program):
    """Print the usage/help text for *program* to stdout."""
    # Single-argument print(...) calls behave identically under the
    # Python 2 print statement and the Python 3 print function.
    print('')
    print('Usage: %s /path/to/dirty.db' % program)
    print('')
    print('Note: Will create a file named dirty.db.new in the same folder,')
    print(' please make sure permissions are OK and a file by that')
    print(' name does not exist already. This script works by omitting')
    print(' duplicate lines from the dirty.db file, keeping only the')
    print(' last (latest) instance. No revision data should be lost,')
    print(' but be careful, make backups. If it breaks you get to keep')
    print(' both pieces!')
    print('')


def read_latest_records(input_path):
    """Read *input_path* and return ``(records, line_count)``.

    ``records`` maps each record's 'key' value to the raw bytes of the
    last line carrying that key; ``line_count`` is the number of lines
    read.  A '.' is written to stderr every 10000 lines as progress.

    Raises:
        ValueError: if a line is not valid JSON or has no 'key' member.
            The message names the offending line number.
    """
    records = {}
    line_count = 0
    # Binary mode keeps every surviving record byte-for-byte identical to
    # the input (no newline translation, no text decoding on the way out).
    with open(input_path, 'rb') as fd:
        for raw in fd:
            line_count += 1
            try:
                key = json.loads(raw)['key']
            except (ValueError, KeyError, TypeError) as err:
                raise ValueError('line %d of %s is not a valid record: %s'
                                 % (line_count, input_path, err))
            # Only the final line of a file can lack a newline.  Terminate
            # it so it cannot fuse with whichever record is written after
            # it (output order is not guaranteed to match input order).
            if not raw.endswith(b'\n'):
                raw += b'\n'
            records[key] = raw
            if line_count % 10000 == 0:
                sys.stderr.write('.')
    return records, line_count


def write_records(records, output_path):
    """Write every raw line in *records* (a key -> bytes dict) to *output_path*."""
    with open(output_path, 'wb') as fd:
        fd.writelines(records.values())


def main(argv=None):
    """Run the compaction for the command line *argv* (default: sys.argv).

    Returns the process exit status: 0 on success, 1 if the arguments are
    unusable (missing path, input missing, or output already present) or
    the input contains a line that cannot be parsed.
    """
    if argv is None:
        argv = sys.argv
    program = argv[0] if argv else sys.argv[0]

    # Explicit checks instead of assert/bare-except: asserts vanish under
    # -O and a bare except would also swallow KeyboardInterrupt.
    if len(argv) < 2:
        print_usage(program)
        return 1
    dirtydb_input = argv[1]
    dirtydb_output = '%s.new' % dirtydb_input
    if not os.path.exists(dirtydb_input) or os.path.exists(dirtydb_output):
        print_usage(program)
        return 1

    print('Reading %s' % dirtydb_input)
    try:
        # Everything is parsed before the output file is created, so a bad
        # input never leaves a partial .new file behind.
        dirtydb, lines = read_latest_records(dirtydb_input)
    except ValueError as err:
        sys.stderr.write('\nError: %s\n' % err)
        return 1
    print('')
    print('OK, found %d unique keys in %d lines' % (len(dirtydb), lines))

    write_records(dirtydb, dirtydb_output)
    print('Wrote data to %s. All done!' % dirtydb_output)
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv))