basic author/timestamp/msg deduping

author: Brian Harring <ferringb@google.com> 2012-10-16 00:51:19 -0700
committer: Brian Harring <ferringb@google.com> 2012-10-16 13:28:49 -0700
commit: dbd12bae8f49b2e7c0f3fded7a69d5c2d580ca1e (patch)
tree: 8d8e20bd43a3c3a18f8fb209c794e0938d8d1aa4
parent: refactoring; bypass the commit creation in each repo and linearization by git... (diff)
download: git-conversion-tools-dbd12bae8f49b2e7c0f3fded7a69d5c2d580ca1e.tar.gz
git-conversion-tools-dbd12bae8f49b2e7c0f3fded7a69d5c2d580ca1e.tar.bz2
git-conversion-tools-dbd12bae8f49b2e7c0f3fded7a69d5c2d580ca1e.zip
1 files changed, 42 insertions, 5 deletions
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index f657a8e..d1ecf01 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -1,5 +1,6 @@
 #!/usr/bin/python
 import functools
+import itertools
 import operator
 import os
 import re
@@ -15,7 +16,16 @@ mangler.append(functools.partial(
     r"Package-Manager: portage-\1"))
 
 fields = ('author', 'committer', 'msg', 'files', 'timestamp')
-record = namedtuple('record', fields)
+fields_map = dict((attr, idx) for idx, attr in enumerate(fields))
+file_idx = fields_map['files']
+class record(namedtuple('record', fields)):
+  def safe_combine(self, other, file_idx=fields_map['files']):
+    files = self.files.copy()
+    assert not set(files).intersection(other.files), (files, other.files)
+    files.update(other.files)
+    items = list(self)
+    items[file_idx] = files
+    return self.__class__(*items)
 
 def deserialize_records(source, blob_idx):
   line = source.readline()
@@ -117,6 +127,24 @@ def deserialize_blob_map(source):
   source = (x.strip().split() for x in source)
   return dict((int(x[0].lstrip(':')), x[1]) for x in source)
 
+def simple_dedup(records):
+  # dedup via timestamp/author/msg
+  dupes = {}
+  for idx, record in enumerate(records):
+    dupes.setdefault((record.timestamp, record.author, record.msg), []).append((idx, record))
+  mangled = []
+  for key, value in dupes.iteritems():
+    if len(value) == 1:
+      continue
+    value.sort(key=operator.itemgetter(0))
+    combined = value[0][1]
+    for idx, item in value[1:]:
+      combined = combined.safe_combine(item)
+    value[:] = [(value[0][0], combined)]
+    mangled.append((key, value))
+  l = itertools.imap(operator.itemgetter(0), dupes.itervalues())
+  return itertools.imap(operator.itemgetter(1), sorted(l, key=operator.itemgetter(0)))
+
 def main(argv):
   records = []
   source = argv if argv else sys.stdin
@@ -127,10 +155,19 @@ def main(argv):
     if not os.path.exists(commits):
       sys.stderr.write("skipping %s; no commit data\n" % directory)
       continue
-    blob_index = deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx')))
-    records.extend(deserialize_records(open(commits, 'r'), blob_index))
-  records.sort(key=operator.attrgetter('timestamp'))
-  #records = list(deserialize_records(source))
+    records.extend(
+      deserialize_records(
+        open(commits, 'r'),
+        deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx')))
+      )
+    )
+  sorter = operator.attrgetter('timestamp')
+  # Sort by timestamp first.  This relies on python's sort being stable:
+  # commits from the same repo that share a timestamp keep their original
+  # relative order (the chunk as a whole just moves), so the histories can
+  # be combined without losing each repo's own ordering.
+  records.sort(key=sorter)
+  records[:] = simple_dedup(records)
   serialize_records(records, sys.stdout)
   return 0
author	Brian Harring <ferringb@google.com>	2012-10-16 00:51:19 -0700
committer	Brian Harring <ferringb@google.com>	2012-10-16 13:28:49 -0700
commit	dbd12bae8f49b2e7c0f3fded7a69d5c2d580ca1e (patch)
tree	8d8e20bd43a3c3a18f8fb209c794e0938d8d1aa4
parent	refactoring; bypass the commit creation in each repo and linearization by git... (diff)
download	git-conversion-tools-dbd12bae8f49b2e7c0f3fded7a69d5c2d580ca1e.tar.gz git-conversion-tools-dbd12bae8f49b2e7c0f3fded7a69d5c2d580ca1e.tar.bz2 git-conversion-tools-dbd12bae8f49b2e7c0f3fded7a69d5c2d580ca1e.zip