from PyPPUB import ppub from PyPPUB import word_tree import struct class PpixBuilder: def __init__(self): self.words = {} self.collections = [] self.tags = {} self.pubs = [] self.alternative_locations = [] self.word_tree = word_tree.WordBit() def add_publication(self, path, pub: ppub.Ppub): index = len(self.pubs) self.pubs.append((path, pub)) pub_tags = pub.metadata.get_value("tags") if (pub_tags != None): for tag in pub_tags.split(" "): if(tag in self.tags): self.collections[self.tags[tag]].add(index) else: collection_index = len(self.collections) self.collections.append(set([index,])) self.tags[tag] = collection_index words = set() def add_to_set(word_soup): if(word_soup == None): return stripped = "".join((x if x.isalnum() else " ") for x in word_soup) for word in stripped.split(" "): words.add(word.lower()) add_to_set(pub.metadata.get_value("title")) add_to_set(pub.metadata.get_value("description")) add_to_set(pub.metadata.get_value("author")) add_to_set(pub.get_asset_data(pub.default_asset).decode('utf-8').replace("\n", " ")) words.remove("") print(words) for word in words: if(word in self.words): self.collections[self.words[word]].add(index) else: collection_index = len(self.collections) self.collections.append(set([index,])) self.words[word] = collection_index def add_alternative_location(self, url): self.alternative_locations.append(url) def write_out(self, stream): # Magic number stream.write(b"PPIX\x00") start = 35 publication_index_start = start publication_index = self.serialise_publication_index(start) start += len(publication_index) collection_index_start = start collection_index = self.serialise_collections(start) start += len(collection_index) tag_index_start = start tag_index = self.serialise_tags() start += len(tag_index) alternative_location_index_start = start alternative_location_index = self.serialise_alternative_locations() start += len(alternative_location_index) stream.write(struct.pack("