ppix-gen.php 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271
  1. <?php
  2. include_once("ppub.php");
  3. class PpixGenerator {
  4. private $words = array();
  5. private $collections = array();
  6. private $tags = array();
  7. private $pubs = array();
  8. public function add_publication($path) {
  9. $ppub = new Ppub();
  10. $ppub->read_file($path);
  11. $index = count($this->pubs);
  12. array_push($this->pubs, [explode("/", $path, 2)[1], $ppub]);
  13. $tags = $ppub->metadata["tags"];
  14. if($tags != null) {
  15. foreach(explode(" ", $tags) as $tag) {
  16. if(array_key_exists($tag, $this->tags)) {
  17. array_push($collections[$this->tags[$tag]], $index);
  18. }
  19. else {
  20. $collection_index = count($this->collections);
  21. array_push($this->collections, [$index,]);
  22. $this->tags[$tag] = $collection_index;
  23. }
  24. }
  25. }
  26. $words = array();
  27. $words = $this->word_array_collect($words, $ppub->metadata["title"]);
  28. $words = $this->word_array_collect($words, $ppub->metadata["description"]);
  29. $words = $this->word_array_collect($words, $ppub->metadata["author"]);
  30. $default_asset = $ppub->asset_list[1];
  31. error_log("Default asset: " . $default_asset->mimetype);
  32. if(str_starts_with($default_asset->mimetype, "text/")) {
  33. $words = $this->word_array_collect($words, $ppub->read_asset($default_asset));
  34. }
  35. if($default_asset->mimetype == "application/x-ppvm") {
  36. include_once("ppvm.php");
  37. $ppvm = new Ppvm();
  38. $ppvm->from_string($ppub->read_asset($default_asset));
  39. $descripton = $ppub->read_asset($ppub->asset_index[$ppvm->metadata["description"]]);
  40. $words = $this->word_array_collect($words, $descripton);
  41. }
  42. foreach($words as $word => $_) {
  43. if(array_key_exists($word, $this->words)) {
  44. array_push($this->collections[$this->words[$word]], $index);
  45. }
  46. else {
  47. $collection_index = count($this->collections);
  48. array_push($this->collections, [$index,]);
  49. $this->words[$word] = $collection_index;
  50. }
  51. }
  52. }
  53. public function serialise() {
  54. $str = "PPIX\x00";
  55. $start = 21;
  56. $publication_index_start = $start;
  57. $publication_index = $this->serialise_publication_index($start);
  58. $start += strlen($publication_index);
  59. $collection_index_start = $start;
  60. $collection_index = $this->serialise_collections($start);
  61. $start += strlen($collection_index);
  62. $tag_index_start = $start;
  63. $tag_index = $this->serialise_tags();
  64. $start += strlen($tag_index);
  65. $str .= pack("VVVV", $publication_index_start, $collection_index_start, $tag_index_start, $start);
  66. $str .= $publication_index;
  67. $str .= $collection_index;
  68. $str .= $tag_index;
  69. $str .= $this->serialise_word_tree($start);
  70. return $str;
  71. }
  72. private function serialise_publication_index($start_pos) {
  73. $data = pack("V", count($this->pubs));
  74. $string_data_start = $start_pos + 4 + (count($this->pubs) * 6);
  75. $string_data = "";
  76. foreach($this->pubs as $pub) {
  77. $name = $pub[0];
  78. $data .= pack("Vv", $string_data_start + strlen($string_data), strlen($name));
  79. $string_data .= $name;
  80. }
  81. return $data . $string_data;
  82. }
  83. private function serialise_collections($start_pos) {
  84. $index_data = "";
  85. $collection_data_start = $start_pos + (count($this->collections) * 6);
  86. $collection_data = "";
  87. foreach($this->collections as $dup_col) {
  88. $col = array_unique($dup_col);
  89. $index_data .= pack("Vv", $collection_data_start + strlen($collection_data), count($col));
  90. foreach($col as $pub_id) {
  91. $collection_data .= pack("V", $pub_id);
  92. }
  93. }
  94. return $index_data . $collection_data;
  95. }
  96. private function serialise_tags() {
  97. $data = pack("v", count($this->tags));
  98. foreach($this->tags as $tag => $col_id) {
  99. $data .= pack("CV", strlen($tag), $col_id);
  100. $data .= $tag;
  101. }
  102. return $data;
  103. }
  104. function serialise_word_tree($start_position) {
  105. $words = array();
  106. foreach ($this->words as $k => $v) {
  107. array_push($words, [$this->str_to_bool_array($k), $v]);
  108. }
  109. usort($words, function($a, $b) {
  110. return $a[0][0] <=> $b[0][0];
  111. });
  112. $root = new WordBit();
  113. $nodes = ["" => $root];
  114. foreach ($words as $word) {
  115. $last_bit = null;
  116. for ($i = 0; $i < strlen($word[0][0]); $i++) {
  117. $key = substr($word[0][0], 0, $i + 1);
  118. if (array_key_exists($key, $nodes)) {
  119. $last_bit = $nodes[$key];
  120. continue;
  121. }
  122. $last_bit = new WordBit();
  123. $past_key = substr($word[0][0], 0, $i);
  124. if ($word[0][1][$i]) {
  125. $nodes[$past_key]->next_1 = $last_bit;
  126. } else {
  127. $nodes[$past_key]->next_0 = $last_bit;
  128. }
  129. $nodes[$key] = $last_bit;
  130. }
  131. $last_bit->collection = $word[1];
  132. }
  133. $root->position = $start_position;
  134. $node_array = [$root];
  135. unset($nodes[""]);
  136. $counter = $root->position + WordBit::SIZE;
  137. foreach ($nodes as $node) {
  138. $node->position = $counter;
  139. array_push($node_array, $node);
  140. $counter += WordBit::SIZE;
  141. }
  142. $output = "";
  143. foreach ($node_array as $node) {
  144. $output .= $node->serialise();
  145. }
  146. return $output;
  147. }
  148. // private function serialise_word_tree($start_pos) {
  149. // $data = "";
  150. // $words = array_multisort(
  151. // array_map(function($k, $v) {
  152. // return [$this->str_to_bool_array($k), $v];
  153. // }, array_keys($this->words), $this->words),
  154. // SORT_ASC,
  155. // array_column($words, 0, 0)
  156. // );
  157. // $root = new WordBit();
  158. // $nodes = ["" => $root];
  159. // foreach($words as $word) {
  160. // $last_bit = null;
  161. // for($i = 0; $ < count($word[0][0]); $i++){
  162. // $key = $word[0][0]
  163. // }
  164. // }
  165. // }
  166. private function str_to_bool_array($string) {
  167. $array = array();
  168. foreach(str_split($string) as $char) {
  169. foreach([1,2,4,8,16,32,64,128] as $i) {
  170. array_push($array, (ord($char) & $i) == $i);
  171. }
  172. }
  173. $string = "";
  174. foreach($array as $bit) {
  175. $string .= $bit ? 1 : 0;
  176. }
  177. return [$string, $array];
  178. }
  179. private function word_array_collect($array, $word_soup) {
  180. if($word_soup == null) {
  181. return;
  182. }
  183. $stripped = preg_replace("/[^[:alnum:][:space:]]/u", '', $word_soup);
  184. $words = explode(" ", $stripped);
  185. foreach($words as $word) {
  186. if($word != "") {
  187. $array[strtolower($word)] = 1;
  188. }
  189. }
  190. return $array;
  191. }
  192. }
  193. class WordBit {
  194. public const SIZE = 13;
  195. public $next_0;
  196. public $next_1;
  197. public $collection;
  198. public $position;
  199. public function serialise() {
  200. $n0 = $this->next_0 == null ? 0 : $this->next_0->position;
  201. $n1 = $this->next_1 == null ? 0 : $this->next_1->position;
  202. $col = $this->collection == null ? 0 : $this->collection;
  203. $has_col = $col == 0 ? 0 : 255;
  204. return pack("VCVV", $n0, $has_col, $col, $n1);
  205. }
  206. }
  207. function generate_ppix_from_ppcl() {
  208. include_once("config.php");
  209. include_once("ppcl.php");
  210. $ppcl = new Ppcl();
  211. $ppcl->from_string(file_get_contents(PUBLICATION_DIR . "/collection.ppcl"));
  212. $pubs = array_reverse($ppcl->publications);
  213. $ppix = new PpixGenerator();
  214. foreach($pubs as $pub) {
  215. $ppix->add_publication(PUBLICATION_DIR . "/" . $pub->name);
  216. }
  217. file_put_contents(PUBLICATION_DIR . "/lib.ppix", $ppix->serialise());
  218. }
  219. ?>