changeset 101:856945a6d936

simplify revision data size handling - just store it into metadata Back when the storage api was implemented, size was implemented as a property. I can't remember why this was done, but I assume it was to do backend-specific optimization to quickly access revision data size without accessing metadata. As we'll have an index for most important metadata and we likely have to open metadata anyway, SIZE is now just a metadata entry like all the others, not specialcased any more. Like the revision data hash, SIZE is also automatically stored into the revision's metadata when committing it. This simplifies the code quite a bit and makes it more generic.
author Thomas Waldmann <tw AT waldmann-edv DOT de>
date Fri, 11 Mar 2011 03:52:14 +0100
parents 1857720a1372
children c084c1369078
files MoinMoin/apps/admin/views.py MoinMoin/items/__init__.py MoinMoin/storage/__init__.py MoinMoin/storage/_tests/test_backends.py MoinMoin/storage/_tests/test_backends_hg.py MoinMoin/storage/_tests/test_serialization.py MoinMoin/storage/backends/acl.py MoinMoin/storage/backends/fileserver.py MoinMoin/storage/backends/flatfile.py MoinMoin/storage/backends/fs.py MoinMoin/storage/backends/fs19.py MoinMoin/storage/backends/fs2.py MoinMoin/storage/backends/hg.py MoinMoin/storage/backends/memory.py MoinMoin/storage/backends/router.py MoinMoin/storage/backends/sqla.py
diffstat 16 files changed, 36 insertions(+), 96 deletions(-) [+]
line wrap: on
line diff
--- a/MoinMoin/apps/admin/views.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/apps/admin/views.py	Fri Mar 11 03:52:14 2011 +0100
@@ -22,7 +22,7 @@
 from MoinMoin.apps.admin import admin
 from MoinMoin import user
 from MoinMoin.storage.error import NoSuchRevisionError
-
+from MoinMoin.items import SIZE
 
 @admin.route('/')
 def index():
@@ -239,7 +239,7 @@
             # XXX we currently also get user items, they have no revisions -
             # but in the end, they should not be readable by the user anyways
             continue
-        rows.append((rev.size, item.name))
+        rows.append((rev[SIZE], item.name))
     rows = sorted(rows, reverse=True)
     return render_template('admin/itemsize.html',
                            item_name="+admin/itemsize",
--- a/MoinMoin/items/__init__.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/items/__init__.py	Fri Mar 11 03:52:14 2011 +0100
@@ -695,7 +695,7 @@
             mt = wikiutil.MimeType(mimestr=mimestr)
             content_disposition = mt.content_disposition(app.cfg)
             content_type = mt.content_type()
-            content_length = rev.size
+            content_length = rev[SIZE]
             file_to_send = rev
 
         # TODO: handle content_disposition is not None
--- a/MoinMoin/storage/__init__.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/__init__.py	Fri Mar 11 03:52:14 2011 +0100
@@ -51,6 +51,10 @@
                                    BackendError, NoSuchItemError, \
                                    RevisionAlreadyExistsError, ItemAlreadyExistsError
 
+#XXX doesn't work, cyclic imports:
+#from MoinMoin.items import SIZE
+SIZE = "size"
+
 # we need a specific hash algorithm to store hashes of revision data into meta
 # data. meta[HASH_ALGORITHM] = hash(rev_data, HASH_ALGORITHM)
 # some backends may use this also for other purposes.
@@ -417,18 +421,6 @@
         """
         raise NotImplementedError()
 
-    def _get_revision_size(self, revision):
-        """
-        Lazily access the revision's data size. This needs not be implemented
-        if all StoredRevision objects are instantiated with the size= keyword
-        parameter.
-
-        :type revision: Object of a subclass of Revision.
-        :param revision: The revision on which we want to operate.
-        :returns: int
-        """
-        raise NotImplementedError()
-
     def _seek_revision_data(self, revision, position, mode):
         """
         Set the revision's cursor on the revision's data.
@@ -467,8 +459,6 @@
             for k, v in rev1.iteritems():
                 if rev2[k] != v:
                     return False
-            if rev1.size != rev2.size:
-                return False
             return True
 
         if name is None:
@@ -741,6 +731,7 @@
         rev = self._uncommitted_revision
         assert rev is not None
         rev[HASH_ALGORITHM] = unicode(rev._rev_hash.hexdigest())
+        rev[SIZE] = rev._size
         self._backend._commit_item(rev)
         self._uncommitted_revision = None
 
@@ -862,12 +853,11 @@
     Do NOT create instances of this class directly, but use item.get_revision or
     one of the other methods intended for getting stored revisions.
     """
-    def __init__(self, item, revno, timestamp=None, size=None):
+    def __init__(self, item, revno, timestamp=None):
         """
         Initialize the StoredRevision
         """
         Revision.__init__(self, item, revno, timestamp)
-        self._size = size
 
     def _get_ts(self):
         if self._timestamp is None:
@@ -876,15 +866,6 @@
 
     timestamp = property(_get_ts, doc="This property returns the creation timestamp of the revision")
 
-    def _get_size(self):
-        if self._size is None:
-            self._size = self._backend._get_revision_size(self)
-            assert self._size is not None
-
-        return self._size
-
-    size = property(_get_size, doc="Size of revision's data")
-
     def __setitem__(self, key, value):
         """
         Revision metadata cannot be altered, thus, we raise an Exception.
@@ -934,6 +915,8 @@
         """
         Revision.__init__(self, item, revno, None)
         self._metadata = {}
+        # these values need to be kept uptodate to that item.commit() can
+        # use them to update the metadata of the rev before committing it:
         self._size = 0
         self._rev_hash = hashlib.new(HASH_ALGORITHM)
 
@@ -946,11 +929,6 @@
 
     timestamp = property(_get_ts, _set_ts, doc="This property accesses the creation timestamp of the revision")
 
-    def _get_size(self):
-        return self._size
-
-    size = property(_get_size, doc="Size of data written so far")
-
     def __setitem__(self, key, value):
         """
         Internal method used for dict-like access to the NewRevisions metadata-dict.
--- a/MoinMoin/storage/_tests/test_backends.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/_tests/test_backends.py	Fri Mar 11 03:52:14 2011 +0100
@@ -24,6 +24,7 @@
 from MoinMoin.storage.backends import memory
 from MoinMoin.storage.error import NoSuchItemError, ItemAlreadyExistsError, NoSuchRevisionError, RevisionAlreadyExistsError
 from MoinMoin.storage import terms
+from MoinMoin.items import SIZE
 
 item_names = (u"quite_normal",
               u"äöüßłóąćółąńśćżź",
@@ -549,32 +550,26 @@
         item = self.backend.create_item(u'size1')
         rev = item.create_revision(0)
         rev.write('asdf')
-        assert rev.size == 4
         rev.write('asdf')
-        assert rev.size == 8
         item.commit()
         rev = item.get_revision(0)
-        assert rev.size == 8
+        assert rev[SIZE] == 8
 
         for nrev in self.backend.history():
-            assert nrev.size == 8
+            assert nrev[SIZE] == 8
 
     def test_size_2(self):
         item = self.backend.create_item(u'size2')
         rev0 = item.create_revision(0)
         data0 = 'asdf'
         rev0.write(data0)
-        assert rev0.size == len(data0)
         item.commit()
         rev1 = item.create_revision(1)
-        rev1["size"] = "should be 0" # invalid
-        assert rev1.size == 0
-        data1 = '' # we never write this to the rev1!
         item.commit()
         rev1 = item.get_revision(1)
-        assert rev1.size == len(data1)
+        assert rev1[SIZE] == 0
         rev0 = item.get_revision(0)
-        assert rev0.size == len(data0)
+        assert rev0[SIZE] == len(data0)
 
     def test_various_revision_metadata_values(self):
         def test_value(value, no):
--- a/MoinMoin/storage/_tests/test_backends_hg.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/_tests/test_backends_hg.py	Fri Mar 11 03:52:14 2011 +0100
@@ -82,7 +82,7 @@
         item.commit()
         item = self.backend.get_item('existing')
         rev = item.get_revision(-1)
-        assert len(dict(rev)) == 10000 + 1 # 'sha1' key is added automatically on commit
+        assert len(dict(rev)) == 10000 + 2 # 'sha1' and 'size' key is added automatically on commit
         for num in xrange(10000):
             revval = "revision metadata value for key %d" % num
             assert rev["%s" % num] == revval * 10
--- a/MoinMoin/storage/_tests/test_serialization.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/_tests/test_serialization.py	Fri Mar 11 03:52:14 2011 +0100
@@ -39,6 +39,7 @@
                     '<entry key="sha1"><str>763675d6a1d8d0a3a28deca62bb68abd8baf86f3</str>\n</entry>\n'
                     '<entry key="m1"><str>m1</str>\n</entry>\n'
                     '<entry key="name"><str>foo1</str>\n</entry>\n'
+                    '<entry key="size"><int>4</int>\n</entry>\n'
                     '<entry key="uuid"><str>foo1</str>\n</entry>\n'
                     '</meta>\n'
                     '<data coding="base64"><chunk>YmFyMQ==</chunk>\n</data>\n'
@@ -68,6 +69,7 @@
                     '<entry key="sha1"><str>033c4846b506a4a48e32cdf54515c91d3499adb3</str>\n</entry>\n'
                     '<entry key="m1"><str>m1r0</str>\n</entry>\n'
                     '<entry key="name"><str>foo2</str>\n</entry>\n'
+                    '<entry key="size"><int>4</int>\n</entry>\n'
                     '<entry key="uuid"><str>foo2</str>\n</entry>\n'
                     '</meta>\n'
                     '<data coding="base64"><chunk>YmFyMg==</chunk>\n</data>\n'
@@ -78,6 +80,7 @@
                     '<entry key="sha1"><str>f91d8fc20a5de853e62105cc1ee0bf47fd7ded0f</str>\n</entry>\n'
                     '<entry key="m1"><str>m1r1</str>\n</entry>\n'
                     '<entry key="name"><str>foo2</str>\n</entry>\n'
+                    '<entry key="size"><int>4</int>\n</entry>\n'
                     '<entry key="uuid"><str>foo2</str>\n</entry>\n'
                     '</meta>\n'
                     '<data coding="base64"><chunk>YmF6Mg==</chunk>\n</data>\n'
--- a/MoinMoin/storage/backends/acl.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/acl.py	Fri Mar 11 03:52:14 2011 +0100
@@ -452,13 +452,6 @@
 
     timestamp = property(_get_ts, _set_ts, doc="This property accesses the creation timestamp of the revision")
 
-    @property
-    def size(self):
-        """
-        @see: NewRevision.size.__doc__
-        """
-        return self._revision.size
-
     def __setitem__(self, key, value):
         """
         In order to change an ACL on an item you must have the ADMIN privilege.
--- a/MoinMoin/storage/backends/fileserver.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/fileserver.py	Fri Mar 11 03:52:14 2011 +0100
@@ -23,7 +23,7 @@
 from MoinMoin.storage import Backend, Item, StoredRevision
 from MoinMoin.storage.error import NoSuchItemError, NoSuchRevisionError
 
-from MoinMoin.items import ACL, MIMETYPE, ACTION, COMMENT
+from MoinMoin.items import ACL, MIMETYPE, ACTION, COMMENT, SIZE
 MTIME = '__timestamp' # does not exist in storage any more
 
 class FSError(Exception):
@@ -113,9 +113,6 @@
     def _get_revision_timestamp(self, rev):
         return rev._fs_meta[MTIME]
 
-    def _get_revision_size(self, rev):
-        return rev._fs_meta['__size']
-
 
 # Specialized Items/Revisions
 
@@ -161,7 +158,7 @@
         meta = { # make something up
             MTIME: st.st_mtime,
             ACTION: 'SAVE',
-            '__size': st.st_size,
+            SIZE: st.st_size,
         }
         self._fs_meta = meta
         self._fs_data_fname = filepath
--- a/MoinMoin/storage/backends/flatfile.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/flatfile.py	Fri Mar 11 03:52:14 2011 +0100
@@ -101,8 +101,8 @@
         data = open(revpath, 'rb').read()
         rev._metadata, data = split_body(data)
         rev._metadata[ACTION] = 'SAVE'
+        rev._metadata[SIZE] = len(data)
         rev._data = StringIO(data)
-        rev._data_size = len(data)
         return rev
 
     def _list_revisions(self, item):
@@ -176,9 +176,6 @@
         revpath = self._rev_path(rev.item.name)
         return os.stat(revpath).st_ctime
 
-    def _get_revision_size(self, rev):
-        return rev._data_size
-
     def _seek_revision_data(self, rev, position, mode):
         rev._data.seek(position, mode)
 
--- a/MoinMoin/storage/backends/fs.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/fs.py	Fri Mar 11 03:52:14 2011 +0100
@@ -465,11 +465,6 @@
             self._get_revision_metadata(rev)
         return rev._fs_metadata['__timestamp']
 
-    def _get_revision_size(self, rev):
-        if rev._fs_file is None:
-            self._get_revision_metadata(rev)
-        return os.stat(rev._fs_revpath).st_size - rev._datastart
-
     def _seek_revision_data(self, rev, position, mode):
         if rev._fs_file is None:
             self._get_revision_metadata(rev)
--- a/MoinMoin/storage/backends/fs19.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/fs19.py	Fri Mar 11 03:52:14 2011 +0100
@@ -36,7 +36,7 @@
 from MoinMoin.items import ACL, MIMETYPE, UUID, NAME, NAME_OLD, REVERTED_TO, \
                            ACTION, ADDRESS, HOSTNAME, USERID, EXTRA, COMMENT, \
                            IS_SYSITEM, SYSITEM_VERSION, \
-                           TAGS
+                           TAGS, SIZE
 from MoinMoin.storage.backends._fsutils import quoteWikinameFS, unquoteWikiname
 from MoinMoin.storage.backends._flatutils import split_body
 
@@ -272,9 +272,6 @@
     def _get_revision_timestamp(self, rev):
         return rev._fs_meta[MTIME]
 
-    def _get_revision_size(self, rev):
-        return rev._fs_meta['__size']
-
 
 # Specialized Items/Revisions
 
@@ -383,9 +380,9 @@
             meta[SYSITEM_VERSION] = item._syspages
         data = self._process_data(meta, data)
         data = data.encode(config.charset)
-        meta['__size'] = len(data) # needed for converter checks
-        hash_name, hash_digest = hash_hexdigest(data)
+        size, hash_name, hash_digest = hash_hexdigest(data)
         meta[hash_name] = hash_digest
+        meta[SIZE] = size
         self._fs_meta = {}
         for k, v in meta.iteritems():
             if isinstance(v, list):
@@ -486,13 +483,13 @@
                 ACTION: u'SAVE',
             }
         meta = editlog_data
-        meta['__size'] = 0 # not needed for converter
         # attachments in moin 1.9 were protected by their "parent" page's acl
         if item._fs_parent_acl is not None:
             meta[ACL] = item._fs_parent_acl # XXX not needed for acl_hierarchic
         meta[MIMETYPE] = unicode(wikiutil.MimeType(filename=item._fs_attachname).mime_type())
-        hash_name, hash_digest = hash_hexdigest(open(attpath, 'rb'))
+        size, hash_name, hash_digest = hash_hexdigest(open(attpath, 'rb'))
         meta[hash_name] = hash_digest
+        meta[SIZE] = size
         if item._syspages:
             meta[IS_SYSITEM] = True
             meta[SYSITEM_VERSION] = item._syspages
@@ -790,16 +787,19 @@
     return dict(items)
 
 def hash_hexdigest(content, bufsize=4096):
+    size = 0
     hash = hashlib.new(HASH_ALGORITHM)
     if hasattr(content, "read"):
         while True:
             buf = content.read(bufsize)
             hash.update(buf)
+            size += len(buf)
             if not buf:
                 break
     elif isinstance(content, str):
         hash.update(content)
+        size = len(content)
     else:
         raise ValueError("unsupported content object: %r" % content)
-    return HASH_ALGORITHM, unicode(hash.hexdigest())
+    return size, HASH_ALGORITHM, unicode(hash.hexdigest())
 
--- a/MoinMoin/storage/backends/fs2.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/fs2.py	Fri Mar 11 03:52:14 2011 +0100
@@ -420,9 +420,6 @@
     def _get_revision_timestamp(self, rev):
         return rev._fs_metadata['__timestamp']
 
-    def _get_revision_size(self, rev):
-        return os.stat(rev._fs_path_data).st_size
-
     def _open_revision_data(self, rev, mode='rb'):
         if rev._fs_file_data is None:
             rev._fs_file_data = open(rev._fs_path_data, mode) # XXX keeps file open as long as rev exists
--- a/MoinMoin/storage/backends/hg.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/hg.py	Fri Mar 11 03:52:14 2011 +0100
@@ -435,10 +435,6 @@
         """Return given Revision timestamp"""
         return long(self._get_filectx(revision).date()[0])
 
-    def _get_revision_size(self, revision):
-        """Return size of given Revision in bytes."""
-        return self._get_filectx(revision).size()
-
     def _seek_revision_data(self, revision, position, mode):
         """Set the Revisions cursor on the Revisions data."""
         self._open_revision_data(revision)
@@ -652,8 +648,8 @@
 
 class MercurialStoredRevision(StoredRevision):
 
-    def __init__(self, item, revno, timestamp=None, size=None):
-        StoredRevision.__init__(self, item, revno, timestamp, size)
+    def __init__(self, item, revno, timestamp=None):
+        StoredRevision.__init__(self, item, revno, timestamp)
         self._data = None
 
     def get_parents(self):
--- a/MoinMoin/storage/backends/memory.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/memory.py	Fri Mar 11 03:52:14 2011 +0100
@@ -166,7 +166,7 @@
         except KeyError:
             raise NoSuchRevisionError("No Revision #%d on Item %s - Available revisions: %r" % (revno, item.name, revisions))
         else:
-            revision = self.StoredRevision(item, revno, timestamp=metadata['__timestamp'], size=len(data))
+            revision = self.StoredRevision(item, revno, timestamp=metadata['__timestamp'])
             revision._data = StringIO.StringIO(data)
             revision._metadata = metadata
             return revision
--- a/MoinMoin/storage/backends/router.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/router.py	Fri Mar 11 03:52:14 2011 +0100
@@ -347,10 +347,6 @@
 
     timestamp = property(_get_ts, _set_ts, doc="This property accesses the creation timestamp of the revision")
 
-    @property
-    def size(self):
-        return self._revision.size
-
     def __setitem__(self, key, value):
         """
         We only need to redirect this manually here because python doesn't do that
@@ -422,10 +418,6 @@
     def timestamp(self):
         return self._revision.timestamp
 
-    @property
-    def size(self):
-        return self._revision.size
-
     def __getitem__(self, key):
         return self._revision.__getitem__(key)
 
--- a/MoinMoin/storage/backends/sqla.py	Tue Mar 08 19:34:42 2011 +0100
+++ b/MoinMoin/storage/backends/sqla.py	Fri Mar 11 03:52:14 2011 +0100
@@ -622,6 +622,7 @@
         Write the given amount of data.
         """
         self._data.write(data)
+        self._size = self._data.size
 
     def read(self, amount=None):
         """
@@ -650,10 +651,6 @@
     def __setitem__(self, key, value):
         NewRevision.__setitem__(self, key, value)
 
-    @property
-    def size(self):
-        return self._data.size
-
     def destroy(self):
         """
         @see: Backend.Revision.destroy.__doc__