Tilo15
/
PyPPUB


			
				
					
						
						
							123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100
							import struct
import typing

class Ppix:

    def __init__(self, stream):
        self.__stream = stream
        stream.seek(0)
        if(stream.read(5) != b"PPIX\x00"):
            raise Exception("Stream does not begin with PPIX magic number")

        self.__publication_index_location, self.__collection_index_location, self.__tag_index_location, self.__word_tree_root_location = struct.unpack("<IIII", stream.read(16))

        # Check for extended collection metadata
        self.__collection_alternatives_location = 0
        self.__extended_collection_data_location_count = 0
        if(stream.read(8) == b"ECMDATA\x00"):
            self.__extended_collection_data_location_count = struct.unpack("<H", self.__stream.read(2))[0]

            if(self.__extended_collection_data_location_count > 0):
                self.__collection_alternatives_location = struct.unpack("<I", self.__stream.read(4))[0]

    def get_publications_count(self) -> int:
        self.__stream.seek(self.__publication_index_location)
        return struct.unpack("<I", self.__stream.read(4))[0]

    def get_publication_by_id(self, id) -> str:
        position = self.__publication_index_location + 4 + (id * 6)
        self.__stream.seek(position)
        string_location, string_length = struct.unpack("<IH", self.__stream.read(6))
        self.__stream.seek(string_location)
        return self.__stream.read(string_length).decode("utf-8")

    def get_collection_by_id(self, id) -> typing.List[int]:
        position = self.__collection_index_location + (id * 6)
        self.__stream.seek(position)
        collection_location, collection_item_count = struct.unpack("<IH", self.__stream.read(6))
        self.__stream.seek(collection_location)
        return struct.unpack("<{}".format("I"*collection_item_count), self.__stream.read(collection_item_count * 4))

    def get_tags_count(self) -> int:
        self.__stream.seek(self.__tag_index_location)
        return struct.unpack("<H", self.__stream.read(2))[0]

    def get_tags(self):
        count = self.get_tags_count()
        for i in range(count):
            tag_string_length, collection_id = struct.unpack("<BI", self.__stream.read(5))
            yield (self.__stream.read(tag_string_length).decode('utf-8'), collection_id)

    def get_alternative_locations_count(self) -> int:
        if(self.__collection_alternatives_location == 0):
            return 0

        self.__stream.seek(self.__collection_alternatives_location)
        return struct.unpack("<H", self.__stream.read(2))[0]

    def get_alternative_locations(self):
        if(self.__collection_alternatives_location == 0):
            return []

        count = self.get_alternative_locations_count()
        for i in range(count):
            url_length = struct.unpack("<H", self.__stream.read(2))
            yield self.__stream.read(url_length).decode('utf-8')

    def find_word_matches(self, word):
        node = self.__get_word_node_from_string(word)
        return node[2] if node != None else None
        
    def __get_word_node_from_string(self, word):
        bin_string = self.__string_to_bin(word)
        node = self.__read_tree_node(self.__word_tree_root_location)

        for bit in bin_string:
            if(bit == "0" and node[0] != 0):
                node = self.__read_tree_node(node[0])
            elif(bit == "1" and node[1] != 0):
                node = self.__read_tree_node(node[1])
            else:
                return None
    
        return node

    def __read_tree_node(self, position):
        self.__stream.seek(position)
        c0, has_col, col, c1 = struct.unpack("<IBII", self.__stream.read(13))
        if(has_col != 255):
            col == None

        return (c0, c1, col)

    def __string_to_bin(self, string):
        data = string.encode("utf-8")
        array = []
        for byte in data:
            for i in [1,2,4,8,16,32,64,128]:
                array.append(byte & i == i)

        return "".join(("1" if x else "0" for x in array))