123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271 |
- <?php
- include_once("ppub.php");
/**
 * Builds a binary "PPIX" search index over a set of publications.
 *
 * Every publication added is identified by its insertion order. Tags and
 * lower-cased words each map to a "collection" (a list of publication ids);
 * tags and words share one collection table. serialise() emits, in order:
 * a header, the publication name table, the collection table, the tag table
 * and a binary trie keyed on the bits of each word.
 */
class PpixGenerator {
    /** Header size in bytes: the 5-byte magic "PPIX\x00" plus four 32-bit offsets. */
    private const HEADER_SIZE = 21;

    /** @var array word => index into $collections */
    private $words = array();
    /** @var array list of collections, each a list of publication ids */
    private $collections = array();
    /** @var array tag => index into $collections */
    private $tags = array();
    /** @var array list of [name, Ppub] pairs, in insertion order */
    private $pubs = array();

    /**
     * Reads the publication at $path and indexes its tags and words.
     *
     * Words come from the "title", "description" and "author" metadata and
     * from the default asset: its text when the mimetype starts with "text/",
     * or the description asset it names when it is an "application/x-ppvm"
     * document.
     *
     * @param string $path Path to the publication file; the stored name is
     *                     everything after the first "/".
     */
    public function add_publication($path): void {
        $ppub = new Ppub();
        $ppub->read_file($path);

        $index = count($this->pubs);
        $path_parts = explode("/", $path, 2);
        // Fall back to the whole path when it contains no "/" at all.
        $this->pubs[] = [$path_parts[1] ?? $path_parts[0], $ppub];

        $tags = $ppub->metadata["tags"] ?? null;
        if ($tags != null) {
            foreach (explode(" ", $tags) as $tag) {
                // Consecutive spaces would otherwise index an empty tag.
                if ($tag === "") {
                    continue;
                }
                $this->index_term($this->tags, $tag, $index);
            }
        }

        $words = array();
        foreach (["title", "description", "author"] as $field) {
            $words = $this->word_array_collect($words, $ppub->metadata[$field] ?? null);
        }

        // NOTE(review): asset_list[1] is treated as the default asset;
        // presumably index 0 is the metadata asset - confirm against Ppub.
        $default_asset = $ppub->asset_list[1];
        error_log("Default asset: " . $default_asset->mimetype);

        if (substr($default_asset->mimetype, 0, 5) === "text/") {
            $words = $this->word_array_collect($words, $ppub->read_asset($default_asset));
        }

        if ($default_asset->mimetype == "application/x-ppvm") {
            include_once("ppvm.php");
            $ppvm = new Ppvm();
            $ppvm->from_string($ppub->read_asset($default_asset));

            // The video metadata names the asset holding its description.
            $description_name = $ppvm->metadata["description"] ?? null;
            if ($description_name !== null && isset($ppub->asset_index[$description_name])) {
                $description = $ppub->read_asset($ppub->asset_index[$description_name]);
                $words = $this->word_array_collect($words, $description);
            }
        }

        foreach ($words as $word => $_) {
            $this->index_term($this->words, $word, $index);
        }
    }

    /**
     * Serialises the whole index.
     *
     * Layout: "PPIX\x00", then four little-endian 32-bit absolute offsets
     * (publication index, collection index, tag index, word tree), then the
     * four sections in that order.
     *
     * @return string The binary index.
     */
    public function serialise(): string {
        $start = self::HEADER_SIZE;

        $publication_index_start = $start;
        $publication_index = $this->serialise_publication_index($start);
        $start += strlen($publication_index);

        $collection_index_start = $start;
        $collection_index = $this->serialise_collections($start);
        $start += strlen($collection_index);

        $tag_index_start = $start;
        $tag_index = $this->serialise_tags();
        $start += strlen($tag_index);

        // After the tag index, $start is where the word tree begins.
        return "PPIX\x00"
            . pack("VVVV", $publication_index_start, $collection_index_start, $tag_index_start, $start)
            . $publication_index
            . $collection_index
            . $tag_index
            . $this->serialise_word_tree($start);
    }

    /**
     * Serialises the publication table: a 32-bit count, one 6-byte entry per
     * publication (32-bit absolute offset of the name, 16-bit name length),
     * then the concatenated names.
     *
     * @param int $start_pos Absolute offset at which this section is written.
     */
    private function serialise_publication_index($start_pos): string {
        $entries = pack("V", count($this->pubs));
        $names_start = $start_pos + 4 + (count($this->pubs) * 6);
        $names = "";
        foreach ($this->pubs as $pub) {
            $name = (string) $pub[0];
            $entries .= pack("Vv", $names_start + strlen($names), strlen($name));
            $names .= $name;
        }
        return $entries . $names;
    }

    /**
     * Serialises the collection table: one 6-byte entry per collection
     * (32-bit absolute offset of its ids, 16-bit id count), then the ids as
     * 32-bit values. Duplicate ids within a collection are written once.
     *
     * @param int $start_pos Absolute offset at which this section is written.
     */
    private function serialise_collections($start_pos): string {
        $entries = "";
        $ids_start = $start_pos + (count($this->collections) * 6);
        $ids = "";
        foreach ($this->collections as $collection) {
            $unique_ids = array_unique($collection);
            $entries .= pack("Vv", $ids_start + strlen($ids), count($unique_ids));
            foreach ($unique_ids as $pub_id) {
                $ids .= pack("V", $pub_id);
            }
        }
        return $entries . $ids;
    }

    /**
     * Serialises the tag table: a 16-bit count, then per tag an 8-bit name
     * length, the 32-bit collection index and the tag bytes.
     */
    private function serialise_tags(): string {
        $data = pack("v", count($this->tags));
        foreach ($this->tags as $tag => $collection_index) {
            // PHP turns numeric-looking array keys into ints; restore the string.
            $tag = (string) $tag;
            $data .= pack("CV", strlen($tag), $collection_index);
            $data .= $tag;
        }
        return $data;
    }

    /**
     * Serialises the word trie.
     *
     * Each word is expanded to a bit string (see str_to_bool_array) and
     * inserted bit by bit; the node reached by the last bit carries the
     * word's collection index. Nodes are laid out back to back from
     * $start_position, root first, then in order of creation, and refer to
     * their children by absolute position.
     *
     * @param int $start_position Absolute offset of the root node.
     * @return string The concatenated serialised nodes.
     */
    public function serialise_word_tree($start_position) {
        $entries = array();
        foreach ($this->words as $word => $collection_index) {
            // Numeric-looking words are stored as int keys; cast back first.
            $entries[] = [$this->str_to_bool_array((string) $word)[0], $collection_index];
        }

        // strcmp() rather than <=>: bit strings consist only of digits, and
        // <=> would compare two such strings as numbers, not as text.
        usort($entries, function ($a, $b) {
            return strcmp($a[0], $b[0]);
        });

        $root = new WordBit();
        $node_list = [$root];
        foreach ($entries as [$bit_string, $collection_index]) {
            $node = $root;
            $bit_count = strlen($bit_string);
            for ($i = 0; $i < $bit_count; $i++) {
                if ($bit_string[$i] === "1") {
                    if ($node->next_1 === null) {
                        $node->next_1 = new WordBit();
                        $node_list[] = $node->next_1;
                    }
                    $node = $node->next_1;
                } else {
                    if ($node->next_0 === null) {
                        $node->next_0 = new WordBit();
                        $node_list[] = $node->next_0;
                    }
                    $node = $node->next_0;
                }
            }
            $node->collection = $collection_index;
        }

        // Every position must be known before any node is written, because a
        // parent serialises the positions of its children.
        $position = $start_position;
        foreach ($node_list as $node) {
            $node->position = $position;
            $position += WordBit::SIZE;
        }

        $output = "";
        foreach ($node_list as $node) {
            $output .= $node->serialise();
        }
        return $output;
    }

    /**
     * Adds a publication id to the collection belonging to $term, creating
     * the collection and registering it in $lookup on first sight.
     *
     * @param array      $lookup    Term => collection index map, updated in place.
     * @param string|int $term      The tag or word being indexed.
     * @param int        $pub_index Id of the publication containing the term.
     */
    private function index_term(array &$lookup, $term, $pub_index): void {
        if (array_key_exists($term, $lookup)) {
            $this->collections[$lookup[$term]][] = $pub_index;
            return;
        }
        $lookup[$term] = count($this->collections);
        $this->collections[] = [$pub_index];
    }

    /**
     * Expands a string into its bits, byte by byte, least-significant bit of
     * each byte first.
     *
     * @param string $string The bytes to expand.
     * @return array A pair: the bits as a string of "0"/"1" characters, and
     *               the same bits as a list of booleans.
     */
    private function str_to_bool_array($string) {
        $bits = array();
        $bit_string = "";
        foreach (str_split($string) as $char) {
            $byte = ord($char);
            for ($mask = 1; $mask <= 128; $mask <<= 1) {
                $is_set = ($byte & $mask) === $mask;
                $bits[] = $is_set;
                $bit_string .= $is_set ? "1" : "0";
            }
        }
        return [$bit_string, $bits];
    }

    /**
     * Adds every word of $word_soup to the set $array and returns the set.
     *
     * Characters that are neither alphanumeric nor whitespace are removed,
     * the remainder is split on any run of whitespace, and each word is
     * lower-cased with strtolower(). Empty input leaves the set unchanged.
     *
     * @param array       $array     Set of words collected so far (word => 1).
     * @param string|null $word_soup Free text to tokenise.
     * @return array The set including the newly found words.
     */
    private function word_array_collect($array, $word_soup) {
        if ($word_soup == null) {
            // Hand back the accumulator; returning nothing would discard the
            // words gathered from earlier fields.
            return $array;
        }

        $stripped = preg_replace("/[^[:alnum:][:space:]]/u", '', $word_soup);
        if ($stripped === null) {
            // preg_replace() returns null on error, e.g. malformed UTF-8 under /u.
            return $array;
        }

        // Split on the same whitespace class the filter above keeps, so that
        // words separated by newlines or tabs are not glued together.
        $words = preg_split("/[[:space:]]+/u", $stripped, -1, PREG_SPLIT_NO_EMPTY);
        if ($words === false) {
            return $array;
        }

        foreach ($words as $word) {
            $array[strtolower($word)] = 1;
        }
        return $array;
    }
}
/**
 * One node of the binary word trie written into a PPIX index.
 *
 * A node has up to two children (one per bit value) and may carry the index
 * of a collection. It serialises to SIZE bytes.
 */
class WordBit {
    /** Serialised size in bytes: 32-bit + 8-bit + 32-bit + 32-bit. */
    public const SIZE = 13;

    /** @var WordBit|null Child followed for a 0 bit. */
    public $next_0;
    /** @var WordBit|null Child followed for a 1 bit. */
    public $next_1;
    /** @var int|null Collection index, or null when no word ends at this node. */
    public $collection;
    /** @var int|null Absolute position at which this node is written. */
    public $position;

    /**
     * Packs the node as little-endian: position of the 0-child (0 if none),
     * a flag byte (255 if a collection is attached, else 0), the collection
     * index (0 if none) and the position of the 1-child (0 if none).
     *
     * @return string SIZE bytes.
     */
    public function serialise() {
        $n0 = $this->next_0 === null ? 0 : $this->next_0->position;
        $n1 = $this->next_1 === null ? 0 : $this->next_1->position;

        // Strict null check: collection index 0 is a real collection and must
        // still be flagged as present.
        $has_col = $this->collection === null ? 0 : 255;
        $col = $this->collection === null ? 0 : $this->collection;

        return pack("VCVV", $n0, $has_col, $col, $n1);
    }
}
/**
 * Regenerates the library index file from the collection file.
 *
 * Reads "collection.ppcl" from PUBLICATION_DIR, feeds every publication it
 * lists to a PpixGenerator in the reverse of the listed order, and writes the
 * serialised index to "lib.ppix" in the same directory.
 */
function generate_ppix_from_ppcl() {
    include_once("config.php");
    include_once("ppcl.php");

    $collection = new Ppcl();
    $collection->from_string(file_get_contents(PUBLICATION_DIR . "/collection.ppcl"));

    $generator = new PpixGenerator();
    // Publications are indexed in the reverse of their order in the collection.
    foreach (array_reverse($collection->publications) as $entry) {
        $publication_path = PUBLICATION_DIR . "/" . $entry->name;
        $generator->add_publication($publication_path);
    }

    $index_bytes = $generator->serialise();
    file_put_contents(PUBLICATION_DIR . "/lib.ppix", $index_bytes);
}
- ?>
|