Explorar o código

Add extended collection metadata and alternatives to index

Billy Barrow hai 2 anos
pai
achega
20191bcbc4
Modificáronse 2 ficheiros con 48 adicións e 1 borrados
  1. 25 0
      ppix.py
  2. 23 1
      ppix_builder.py

+ 25 - 0
ppix.py

@@ -11,6 +11,15 @@ class Ppix:
 
         self.__publication_index_location, self.__collection_index_location, self.__tag_index_location, self.__word_tree_root_location = struct.unpack("<IIII", stream.read(16))
 
+        # Check for extended collection metadata
+        self.__collection_alternatives_location = 0
+        self.__extended_collection_data_location_count = 0
+        if(stream.read(8) == b"ECMDATA\x00"):
+            self.__extended_collection_data_location_count = struct.unpack("<H", self.__stream.read(2))[0]
+
+            if(self.__extended_collection_data_location_count > 0):
+                self.__collection_alternatives_location = struct.unpack("<I", self.__stream.read(4))[0]
+
     def get_publications_count(self) -> int:
         self.__stream.seek(self.__publication_index_location)
         return struct.unpack("<I", self.__stream.read(4))[0]
@@ -39,6 +48,22 @@ class Ppix:
             tag_string_length, collection_id = struct.unpack("<BI", self.__stream.read(5))
             yield (self.__stream.read(tag_string_length).decode('utf-8'), collection_id)
 
+    def get_alternative_locations_count(self) -> int:
+        """Return how many alternative locations the index stores.
+
+        Returns 0 without touching the stream when the recorded
+        alternatives location is 0.
+        """
+        table_offset = self.__collection_alternatives_location
+        if not table_offset:
+            return 0
+
+        # The table begins with its entry count as a little-endian uint16.
+        self.__stream.seek(table_offset)
+        (entry_count,) = struct.unpack("<H", self.__stream.read(2))
+        return entry_count
+
+    def get_alternative_locations(self):
+        """Yield each alternative location URL stored in the index.
+
+        This is a generator: nothing is read until iteration starts, and it
+        yields nothing when the recorded alternatives location is 0.
+        Entries are read sequentially from the shared stream, so the result
+        should be consumed before calling other methods that seek on it.
+
+        Yields:
+            Each URL decoded from UTF-8, in stored order.
+        """
+        if(self.__collection_alternatives_location == 0):
+            return
+
+        # Reading the count seeks to the table and leaves the stream
+        # positioned at the first length-prefixed entry.
+        count = self.get_alternative_locations_count()
+        for _ in range(count):
+            # struct.unpack always returns a tuple; take its single element so
+            # the length is an int that read() accepts.
+            url_length = struct.unpack("<H", self.__stream.read(2))[0]
+            yield self.__stream.read(url_length).decode('utf-8')
+
     def find_word_matches(self, word):
         node = self.__get_word_node_from_string(word)
         return node[2] if node != None else None

+ 23 - 1
ppix_builder.py

@@ -9,6 +9,7 @@ class PpixBuilder:
         self.collections = []
         self.tags = {}
         self.pubs = []
+        self.alternative_locations = []
 
         self.word_tree = word_tree.WordBit()
 
@@ -51,10 +52,14 @@ class PpixBuilder:
                 self.words[word] = collection_index
 
 
+    def add_alternative_location(self, url):
+        """Queue ``url`` as an alternative location for the index being built."""
+        # In-place extension keeps the same list object, like append().
+        self.alternative_locations += [url]
+
+
     def write_out(self, stream):
         # Magic number
         stream.write(b"PPIX\x00")
-        start = 21
+        start = 35
 
         publication_index_start = start
         publication_index = self.serialise_publication_index(start)
@@ -68,10 +73,17 @@ class PpixBuilder:
         tag_index = self.serialise_tags()
         start += len(tag_index)
 
+        alternative_location_index_start = start
+        alternative_location_index = self.serialise_alternative_locations()
+        start += len(alternative_location_index)
+
         stream.write(struct.pack("<IIII", publication_index_start, collection_index_start, tag_index_start, start))
+        stream.write(b"ECMDATA\x00")
+        stream.write(struct.pack("<HI", 1, alternative_location_index_start))
         stream.write(publication_index)
         stream.write(collection_index)
         stream.write(tag_index)
+        stream.write(alternative_location_index)
         
         self.serialise_word_tree(start, stream)
         stream.flush()
@@ -113,6 +125,16 @@ class PpixBuilder:
 
         return data
 
+    def serialise_alternative_locations(self):
+        """Serialise the alternative locations table to bytes.
+
+        Layout (little-endian): a uint16 entry count, then for each entry a
+        uint16 byte length followed by the UTF-8 encoded URL.
+        """
+        chunks = [struct.pack("<H", len(self.alternative_locations))]
+
+        for url in self.alternative_locations:
+            url_bytes = url.encode("utf-8")
+            # The length prefix counts encoded bytes, not characters.
+            chunks.append(struct.pack("<H", len(url_bytes)))
+            chunks.append(url_bytes)
+
+        return b"".join(chunks)
+
     def serialise_word_tree(self, start_position, stream):
         words = sorted(((self.string_to_bool_array(k), v) for k, v in self.words.items()), key=lambda x: x[0][0])
         root = word_tree.WordBit()