Browse Source

Initial Commit

Billy Barrow 3 năm trước cách đây
commit
15df931a9c
13 tập tin đã thay đổi với 586 bổ sung và 0 xóa
  1. 1 0
      .gitignore
  2. 10 0
      README.md
  3. 19 0
      asset.py
  4. 36 0
      asset_index.py
  5. 33 0
      metadata.py
  6. 75 0
      ppix.py
  7. 187 0
      ppix_builder.py
  8. 54 0
      ppub.py
  9. 46 0
      ppub_builder.py
  10. 60 0
      spec.md
  11. BIN
      spec.ppub
  12. 47 0
      test.py
  13. 18 0
      word_tree.py

+ 1 - 0
.gitignore

@@ -0,0 +1 @@
+*.pyc

+ 10 - 0
README.md

@@ -0,0 +1,10 @@
+# PyPPUB
+*Portable PUBlications for Python*
+
+Contained in this repo is the spec for a format I made up one day for some reason called PPUB, or Portable PUBlication. I made it because I thought I wanted to blog but I was having one of those "hating the state of the modern web" weeks. PPUB is basically an archive with publication metadata baked in. It allows markdown files along with any embedded content to be wrapped up into a single compressed file.
+
+PPIX is an indexing format which allows for searching through a library of PPUB files quickly and easily. A companion project called php-ppub can be configured to read PPIX files and expose a search functionality to visitors of your php-ppub blog site.
+
+The idea was to make posts easily archivable, uploadable, and distributable. Maybe one day this format will still be useful when we abandon the concept that is the "modern web". Or life will go on and this will be abandoned. But hey, I'm sitting here right now publishing this to GitHub and actually putting in the effort to write the words you are reading right now, so who knows. I might even actually start blogging.
+
+A [UI is available](https://github.com/Tilo15/PPublisher) in GTK3 which makes use of this library. Everything needs to be cleaned up and packaged at some point though.

+ 19 - 0
asset.py

@@ -0,0 +1,19 @@
+
+class Asset:
+
+    def __init__(self, path, mimetype, start_location, end_location, flags):
+        self.path = path
+        self.mimetype = mimetype
+        self.start_location = start_location
+        self.end_location = end_location
+        self.flags = flags
+
+    @staticmethod
+    def from_string(string):
+        path, data = string.split(": ", 1)
+        data = str.split(data.rstrip(), " ")
+        asset = Asset(path, data[0], int(data[1]), int(data[2]), data[3:])
+        return asset
+
+    def __str__(self) -> str:
+        return str.format("{}: {} {} {} {}", self.path, self.mimetype, self.start_location, self.end_location,  str.join(" ", self.flags))

+ 36 - 0
asset_index.py

@@ -0,0 +1,36 @@
+from PyPPUB import asset
+
+class AssetIndex:
+
+    def __init__(self):
+        self.__assets = {}
+        self.__assets_ordered = []
+    
+    def has_asset(self, path):
+        return path in self.__assets
+
+    def get_asset(self, path):
+        return self.__assets[path]
+
+    def add_asset(self, asset):
+        self.__assets_ordered.append(asset)
+        self.__assets[asset.path] = asset
+
+    def get_nth_asset(self, n):
+        return self.__assets_ordered[n]
+
+    def asset_count(self):
+        return len(self.__assets_ordered)
+
+    @staticmethod
+    def from_string(string):
+        lines = string.split("\n")[:-1]
+        assets_ordered = [asset.Asset.from_string(x) for x in lines]
+        assets = {x.path: x for x in assets_ordered}
+        index = AssetIndex()
+        index.__assets_ordered = assets_ordered
+        index.__assets = assets
+        return index
+
+    def __str__(self) -> str:
+        return str.join("\n", (str(x) for x in self.__assets_ordered)) + "\n"

+ 33 - 0
metadata.py

@@ -0,0 +1,33 @@
+
+
+from os import stat
+
+
+class Metadata:
+
+    def __init__(self):
+        self.__data = {}
+
+    def get_value(self, field_name):
+        if(field_name in self.__data):
+            return self.__data[field_name]
+        else:
+            return None
+
+    def set_value(self, field_name, value):
+        self.__data[field_name] = value
+
+    @staticmethod
+    def from_string(string):
+        entries = string.split("\n")[:-1]
+        data = {x[0]: x[1] for x in (y.split(": ") for y in entries)}
+        metadata = Metadata()
+        metadata.__data = data
+        return metadata
+
+    def __str__(self) -> str:
+        data = ""
+        print(self.__data)
+        for key, value in self.__data.items():
+            data += "{}: {}\n".format(key, value)
+        return data

+ 75 - 0
ppix.py

@@ -0,0 +1,75 @@
+import struct
+import typing
+
+class Ppix:
+
+    def __init__(self, stream):
+        self.__stream = stream
+        stream.seek(0)
+        if(stream.read(5) != b"PPIX\x00"):
+            raise Exception("Stream does not begin with PPIX magic number")
+
+        self.__publication_index_location, self.__collection_index_location, self.__tag_index_location, self.__word_tree_root_location = struct.unpack("<IIII", stream.read(16))
+
+    def get_publications_count(self) -> int:
+        self.__stream.seek(self.__publication_index_location)
+        return struct.unpack("<I", self.__stream.read(4))[0]
+
+    def get_publication_by_id(self, id) -> str:
+        position = self.__publication_index_location + 4 + (id * 6)
+        self.__stream.seek(position)
+        string_location, string_length = struct.unpack("<IH", self.__stream.read(6))
+        self.__stream.seek(string_location)
+        return self.__stream.read(string_length).decode("utf-8")
+
+    def get_collection_by_id(self, id) -> typing.List[int]:
+        position = self.__collection_index_location + (id * 6)
+        self.__stream.seek(position)
+        collection_location, collection_item_count = struct.unpack("<IH", self.__stream.read(6))
+        self.__stream.seek(collection_location)
+        return struct.unpack("<{}".format("I"*collection_item_count), self.__stream.read(collection_item_count * 4))
+
+    def get_tags_count(self) -> int:
+        self.__stream.seek(self.__tag_index_location)
+        return struct.unpack("<H", self.__stream.read(2))[0]
+
+    def get_tags(self):
+        count = self.get_tags_count()
+        for i in range(count):
+            tag_string_length, collection_id = struct.unpack("<BI", self.__stream.read(5))
+            yield (self.__stream.read(tag_string_length).decode('utf-8'), collection_id)
+
+    def find_word_matches(self, word):
+        node = self.__get_word_node_from_string(word)
+        return node[2] if node != None else None
+        
+    def __get_word_node_from_string(self, word):
+        bin_string = self.__string_to_bin(word)
+        node = self.__read_tree_node(self.__word_tree_root_location)
+
+        for bit in bin_string:
+            if(bit == "0" and node[0] != 0):
+                node = self.__read_tree_node(node[0])
+            elif(bit == "1" and node[1] != 0):
+                node = self.__read_tree_node(node[1])
+            else:
+                return None
+    
+        return node
+
+    def __read_tree_node(self, position):
+        self.__stream.seek(position)
+        c0, has_col, col, c1 = struct.unpack("<IBII", self.__stream.read(13))
+        if(has_col != 255):
+            col == None
+
+        return (c0, c1, col)
+
+    def __string_to_bin(self, string):
+        data = string.encode("utf-8")
+        array = []
+        for byte in data:
+            for i in [1,2,4,8,16,32,64,128]:
+                array.append(byte & i == i)
+
+        return "".join(("1" if x else "0" for x in array))

+ 187 - 0
ppix_builder.py

@@ -0,0 +1,187 @@
+from PyPPUB import ppub
+from PyPPUB import word_tree
+import struct
+
+class PpixBuilder:
+
+    def __init__(self):
+        self.words = {}
+        self.collections = []
+        self.tags = {}
+        self.pubs = []
+
+        self.word_tree = word_tree.WordBit()
+
+    def add_publication(self, path, pub: ppub.Ppub):
+        index = len(self.pubs)
+        self.pubs.append((path, pub))
+
+        pub_tags = pub.metadata.get_value("tags")
+        if (pub_tags != None):
+            for tag in pub_tags.split(" "):
+                if(tag in self.tags):
+                    self.collections[self.tags[tag]].add(index)
+                else:
+                    collection_index = len(self.collections)
+                    self.collections.append(set([index,]))
+                    self.tags[tag] = collection_index
+        
+        words = set()
+        def add_to_set(word_soup):
+            if(word_soup == None):
+                return
+
+            stripped = "".join((x if x.isalnum() else " ") for x in word_soup)
+            for word in stripped.split(" "):
+                words.add(word.lower())
+
+        add_to_set(pub.metadata.get_value("title"))
+        add_to_set(pub.metadata.get_value("description"))
+        add_to_set(pub.metadata.get_value("author"))
+        add_to_set(pub.get_asset_data(pub.default_asset).decode('utf-8').replace("\n", " "))
+        words.remove("")
+        print(words)
+
+        for word in words:
+            if(word in self.words):
+                    self.collections[self.words[word]].add(index)
+            else:
+                collection_index = len(self.collections)
+                self.collections.append(set([index,]))
+                self.words[word] = collection_index
+
+
+    def write_out(self, stream):
+        # Magic number
+        stream.write(b"PPIX\x00")
+        start = 21
+
+        publication_index_start = start
+        publication_index = self.serialise_publication_index(start)
+        start += len(publication_index)
+
+        collection_index_start = start
+        collection_index = self.serialise_collections(start)
+        start += len(collection_index)
+
+        tag_index_start = start
+        tag_index = self.serialise_tags()
+        start += len(tag_index)
+
+        stream.write(struct.pack("<IIII", publication_index_start, collection_index_start, tag_index_start, start))
+        stream.write(publication_index)
+        stream.write(collection_index)
+        stream.write(tag_index)
+        
+        self.serialise_word_tree(start, stream)
+        stream.flush()
+        stream.close()
+
+
+    def serialise_publication_index(self, start_position):
+        data = struct.pack("<I", len(self.pubs))
+        string_data_start = start_position + 4 + (len(self.pubs) * 6)
+        string_data = b""
+
+        for pub in self.pubs:
+            encoded = pub[0].encode('utf-8')
+            data += struct.pack("<IH", string_data_start + len(string_data), len(encoded))
+            string_data += encoded
+        
+        return data + string_data
+
+    def serialise_collections(self, start_position):
+        index_data = b""
+        collection_data_start = start_position + (len(self.collections) * 6)
+        collection_data = b""
+
+        for col in self.collections:
+            index_data += struct.pack("<IH", collection_data_start + len(collection_data), len(col))
+            for pub_id in col:
+                collection_data += struct.pack("<I", pub_id)
+
+        
+        return index_data + collection_data
+
+    def serialise_tags(self):
+        data = struct.pack("<H", len(self.tags))
+
+        for key, value in self.tags.items():
+            encoded = key.encode("utf-8")
+            data += struct.pack("<BI", len(encoded), value)
+            data += encoded
+
+        return data
+
+    def serialise_word_tree(self, start_position, stream):
+        words = sorted(((self.string_to_bool_array(k), v) for k, v in self.words.items()), key=lambda x: x[0][0])
+        root = word_tree.WordBit()
+        nodes = {"": root}
+
+        for word in words:
+            last_bit = None
+            for i in range(len(word[0][0])):
+                key = word[0][0][:i+1]
+                if(key in nodes):
+                    last_bit = key
+                    continue
+
+                last_bit = word_tree.WordBit()
+                past_key = word[0][0][:i]
+                if(word[0][1][i]):
+                    nodes[past_key].next_1 = last_bit
+                else:
+                    nodes[past_key].next_0 = last_bit
+                nodes[key] = last_bit
+        
+            last_bit.collection = word[1]
+
+        root.position = start_position
+        node_array = [root,]
+        del nodes[""]
+
+        counter = root.position + word_tree.WordBit.SIZE
+        for node in nodes.values():
+            node.position = counter
+            node_array.append(node)
+            counter += word_tree.WordBit.SIZE
+
+        for node in node_array:
+            stream.write(node.serialise())
+        
+
+    def string_to_bool_array(self, string):
+        data = string.encode("utf-8")
+        array = []
+        for byte in data:
+            for i in [1,2,4,8,16,32,64,128]:
+                array.append(byte & i == i)
+
+        return ("".join(("1" if x else "0" for x in array)), array)
+
+
+
+if(__name__ == "__main__"):
+    a = PpixBuilder()
+    import glob
+    paths = glob.glob("ppubs/*.ppub")
+    for path in paths:
+        a.add_publication(path.split("/")[-1], ppub.Ppub.from_stream(open(path, 'rb')))
+
+    f = open("lib.ppix", 'wb')
+    a.write_out(f)
+
+    import ppix
+    f = open("lib.ppix", 'rb')
+    ix = ppix.Ppix(f)
+    print("{} publication(s)".format(ix.get_publications_count()))
+    print("{} tag(s)".format(ix.get_tags_count()))
+    for tag in ix.get_tags():
+        print(tag);
+    word = "ethics"
+    col = ix.find_word_matches(word)
+    if(col != None):
+        print("The following publications contain the word '{}'".format(word))
+        for pub_id in ix.get_collection_by_id(col):
+            print(ix.get_publication_by_id(pub_id))
+    

+ 54 - 0
ppub.py

@@ -0,0 +1,54 @@
+from PyPPUB.asset_index import AssetIndex
+from PyPPUB.metadata import Metadata
+import gzip
+
+class Ppub:
+
+    def __init__(self):
+        self.metadata = None
+        self.asset_index = None
+        self.default_asset = None
+        self.__stream = None
+        self.__blob_start = 0
+        self.__flag_handlers = {
+            "gzip": lambda x: gzip.decompress(x)
+        }
+
+    @staticmethod
+    def from_stream(stream):
+        if (stream.read(5) != b"ppub\n"):
+            raise Exception("Stream did not start with magic number")
+
+        index_length_bytes = b""
+        next_byte = b""
+        while(next_byte != b"\n"):
+            index_length_bytes += next_byte
+            next_byte = stream.read(1)
+
+        index_length = int(index_length_bytes)
+        index_bytes = stream.read(index_length)
+        blob_start = len(index_length_bytes) + index_length + 6
+
+        index = AssetIndex.from_string(index_bytes.decode('utf-8'))
+
+        obj = Ppub()
+        obj.__stream = stream
+        obj.__blob_start = blob_start
+        obj.asset_index = index
+        obj.metadata = Metadata.from_string(obj.get_asset_data(index.get_asset("metadata")).decode('utf-8'))
+        obj.default_asset = index.get_nth_asset(1)
+        return obj
+
+    def get_asset_data(self, asset):
+        start_location = asset.start_location + self.__blob_start
+        length = asset.end_location - asset.start_location
+        self.__stream.seek(start_location)
+        data = self.__stream.read(length)
+        for flag in asset.flags:
+            if (flag not in self.__flag_handlers):
+                raise Exception("Flag '%s' not understood" % flag)
+            data = self.__flag_handlers[flag](data)
+        return data
+
+    
+

+ 46 - 0
ppub_builder.py

@@ -0,0 +1,46 @@
+from PyPPUB.asset import Asset
+from PyPPUB.asset_index import AssetIndex
+from PyPPUB.metadata import Metadata
+
+import gzip
+
+class PpubBuilder:
+
+    def __init__(self):
+        self.assets = []
+        self.metadata = Metadata()
+
+
+    def add_asset(self, path, mimetype, data, flags = None):
+        self.assets.append(BuilderAsset(path, mimetype, data, flags))
+
+    def write_to_stream(self, stream):
+        stream.write(b"ppub\n")
+        data_blob = b""
+        index = AssetIndex()
+        assets = [BuilderAsset("metadata", "application/x-ppub-metadata", str(self.metadata).encode('utf-8')),] + self.assets
+        for builder_asset in assets:
+            start_location = len(data_blob)
+            asset_data = builder_asset.data
+            asset_data_gzip = gzip.compress(asset_data, 9)
+            if(len(asset_data) > len(asset_data_gzip)):
+                asset_data = asset_data_gzip
+                builder_asset.flags.append("gzip")
+            data_blob += asset_data
+            end_location = len(data_blob)
+            asset = Asset(builder_asset.path, builder_asset.mimetype, start_location, end_location, builder_asset.flags)
+            index.add_asset(asset)
+        
+        index_data = str(index)
+        stream.write(str.format("{}\n", len(index_data)).encode('utf-8'))
+        stream.write(index_data.encode('utf-8'))
+        stream.write(data_blob)
+
+class BuilderAsset:
+    def __init__(self, path, mimetype, data, flags = None):
+        self.path = path
+        self.mimetype = mimetype
+        self.data = data
+        self.flags = flags
+        if(self.flags == None):
+            self.flags = []

+ 60 - 0
spec.md

@@ -0,0 +1,60 @@
+# PPUB - Portable PUBlication
+
+## Magic Number
+
+A PPUB file must start with the magic number `"ppub\n"`
+
+## Asset Index
+
+The asset index must follow the magic number. The first line of the asset index is the length in bytes (excluding the first line) of the asset index.
+
+Entries in the index are separated by newline `'\n'` characters.
+
+Each asset has a filename which may contain any character except newline and colon (`:`). Filenames are delimited by a colon and space `": "` character sequence.
+
+Each value following the `: ` sequence is terminated either by a space or the end of the entry (`'\n'`). The first values are:
+
+1. The mimetype of the asset
+2. The start of the asset's data (bytes relative to the end of the asset index)
+3. The end of the asset's data (bytes relative to the end of the asset index)
+
+Optional flags may be added after the 3rd value, such as `"gzip"` to indicate that the asset has been compressed with GZip.
+
+Entries containing optional flags not understood by the application should be ignored.
+
+Non-official application specific flags should be prefixed with `"x-"`.
+
+The officially supported flags are:
+
+* `gzip` specifies that the asset is compressed with GZip.
+* `licence` specifies that the asset is the licence that the publication is under.
+
+The first entry in the asset index **must** be the metadata object. The second entry in the asset index **must** be the initial markdown file to show (i.e. the "cover" of the publication).
+
+An example of an asset index could look like this:
+
+```
+121
+metadata: application/x-ppub-metadata 0 64
+PPUB Specification: text/markdown 64 548 gzip
+ppub-logo.png: image/png 548 720
+```
+
+## Metadata
+
+An asset named `"metadata"` **must** exist as the first asset in a PPUB file with the mimetype `"application/x-ppub-metadata"`.
+
+The metadata object is a collection of field-values. The field name is delimited by a space character, and the field value is delimited by a newline character.
+
+Non-official metadata fields should be prefixed with `"x-"`. However it is encouraged to add application specific metadata by adding assets with application specific flags instead.
+
+All metadata fields are optional.
+
+The official metadata fields are:
+
+* `title` publication title, string.
+* `author` publication author, string with optional email address enclosed in angle brackets (e.g. `"John Doe <john@doe.com>"`)
+* `date` publication date, ISO 8601 format.
+* `description` a text blurb of the publication, string.
+* `tags` comma separated tags for the publication used for indexing, string.
+* `copyright` copyright information, string.

BIN
spec.ppub


+ 47 - 0
test.py

@@ -0,0 +1,47 @@
+import sys
+from PyPPUB import ppub_builder
+from PyPPUB import mimetypes
+import datetime
+
+builder = ppub_builder.PpubBuilder()
+
+def set_if_not_empty(field, question):
+    answer = input(question)
+    if(answer == ""):
+        return
+    builder.metadata.set_value(field, answer)
+
+set_if_not_empty("title", "[Metadata] Title? ")
+set_if_not_empty("tags", "[Metadata] Tags? ")
+set_if_not_empty("description", "[Metadata] Description? ")
+set_if_not_empty("author", "[Metadata] Author? ")
+set_if_not_empty("copyright", "[Metadata] Copyright? ")
+set_if_not_empty("licence", "[Metadata] Licence? ")
+builder.metadata.set_value("date", datetime.datetime.now().astimezone().isoformat())
+
+for arg in sys.argv[1:]:
+    print("Adding %s" % arg)
+    f = open(arg, 'rb')
+    builder.add_asset(arg, mimetypes.guess_type(arg)[0], f.read())
+    f.close()
+
+print("Writing output")
+f = open("output.ppub", 'wb')
+builder.write_to_stream(f)
+f.close()
+print("Complete")
+
+import ppub
+
+print("Reading")
+
+f = open("output.ppub", 'rb')
+pub = ppub.Ppub.from_stream(f)
+for i in range(pub.asset_index.asset_count()):
+    asset = pub.asset_index.get_nth_asset(i)
+    print("Extracting asset %s" % asset.path)
+    of = open("asset_%i" % i, 'wb')
+    of.write(pub.get_asset_data(asset))
+    of.close()
+
+print("Done")

+ 18 - 0
word_tree.py

@@ -0,0 +1,18 @@
+import struct
+
+class WordBit:
+
+    SIZE = 13
+
+    def __init__(self):
+        self.next_0 = None
+        self.next_1 = None
+        self.collection = None
+        self.position = 0
+
+    def serialise(self):
+        n0 = self.next_0.position if self.next_0 != None else 0
+        n1 = self.next_1.position if self.next_1 != None else 0
+        col = self.collection if self.collection != None else 0
+        has_col = 255 if self.collection != None else 0
+        return struct.pack("<IBII", n0, has_col, col, n1)