summaryrefslogtreecommitdiffstats
path: root/src/sequitur.h
diff options
context:
space:
mode:
authorJukka Ojanen <jukka.ojanen@linkotec.net>2014-11-05 11:43:56 +0200
committerJukka Ojanen <jukka.ojanen@linkotec.net>2014-11-05 11:43:56 +0200
commita0db4af6fe8f68a62cbf993871137d4cd341dfc5 (patch)
treefcdb7e4e3d71d68a9efda9ca73d3b6690630cf0b /src/sequitur.h
parentb4efe4fc9fa2485679c4f6e4a9963c99d791aa0b (diff)
downloadffts-a0db4af6fe8f68a62cbf993871137d4cd341dfc5.zip
ffts-a0db4af6fe8f68a62cbf993871137d4cd341dfc5.tar.gz
Import Sequitur algorithm from SFFT
Diffstat (limited to 'src/sequitur.h')
-rw-r--r--src/sequitur.h448
1 files changed, 448 insertions, 0 deletions
diff --git a/src/sequitur.h b/src/sequitur.h
new file mode 100644
index 0000000..7429115
--- /dev/null
+++ b/src/sequitur.h
@@ -0,0 +1,448 @@
+/*
+
+ This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+ Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+ Copyright (c) 2012, The University of Waikato
+
+ All rights reserved.
+
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ * Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ * Neither the name of the organization nor the
+ names of its contributors may be used to endorse or promote products
+ derived from this software without specific prior written permission.
+
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+ DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+*/
+
+typedef struct _sym_t {
+ int c;
+ struct _sym_t *pPrev, *pNext;
+ struct _seq_rule_t *r;
+ int offset;
+} sym_t;
+
+typedef struct _seq_rule_t {
+ int c;
+ sym_t *ss;
+ struct _seq_rule_t *pPrev, *pNext;
+ int count;
+ int length;
+} seq_rule_t;
+
+void sym_tail_insert(sym_t **ss, sym_t *s)
+{
+ if (!*ss) {
+ *ss = s;
+ s->pPrev = s->pNext = NULL;
+ } else {
+ while (*ss) {
+ s->pPrev = *ss;
+ ss = &(*ss)->pNext;
+ }
+
+ *ss = s;
+ }
+}
+
+sym_t* sym_init(int c)
+{
+ sym_t *s;
+
+ s = (sym_t*) malloc(sizeof(*s));
+ if (!s) {
+ return NULL;
+ }
+
+ s->c = c;
+ s->pPrev = s->pNext = NULL;
+ s->r = NULL;
+
+ return s;
+}
+
+sym_t* sym_init_from_sym(sym_t *s2)
+{
+ sym_t *s;
+
+ s = (sym_t*) malloc(sizeof(*s));
+ if (!s) {
+ return NULL;
+ }
+
+ s->c = s2->c;
+ s->pPrev = s->pNext = NULL;
+ s->r = s2->r;
+ s->offset = s2->offset;
+
+ return s;
+}
+
+seq_rule_t* seq_init_rule(int c)
+{
+ seq_rule_t *G;
+
+ G = (seq_rule_t *)malloc(sizeof(*G));
+ if (!G) {
+ return NULL;
+ }
+
+ G->c = c;
+ G->count = 2;
+ G->ss = NULL;
+ G->pPrev = NULL;
+ G->pNext = NULL;
+
+ return G;
+}
+
+seq_rule_t* seq_grammer_insert_new_rule(seq_rule_t *G, char r, sym_t *a, sym_t *b)
+{
+ sym_t *sa, *sb;
+
+ while (G->pNext) {
+ G = G->pNext;
+ }
+
+ G->pNext = seq_init_rule(r);
+ if (!G->pNext) {
+ return NULL;
+ }
+
+ sa = sym_init_from_sym(a);
+ if (!sa) {
+ goto cleanup_pnext;
+ }
+
+ sb = sym_init_from_sym(b);
+ if (!sb) {
+ goto cleanup_sa;
+ }
+
+ sb->offset = sb->offset - sa->offset;
+ sa->offset = 0;
+ sym_tail_insert(&G->pNext->ss, sa);
+ sym_tail_insert(&G->pNext->ss, sb);
+ return G->pNext;
+
+cleanup_sa:
+ free(sa);
+
+cleanup_pnext:
+ free(G->pNext);
+ G->pNext = NULL;
+
+ return NULL;
+}
+
+sym_t* sym_match_digram(sym_t *s, sym_t *term, sym_t *a, sym_t *b)
+{
+ while (s != term) {
+ if (s->c == a->c && s->pNext->c == b->c &&
+ s->pNext->offset - s->offset == b->offset-a->offset) {
+ return s;
+ }
+
+ s = s->pNext;
+ }
+
+ return NULL;
+}
+
+seq_rule_t* seq_match_digram(seq_rule_t *R, sym_t *a, sym_t *b)
+{
+ while (R) {
+ if (R->ss->c == a->c && R->ss->pNext->c == b->c &&
+ R->ss->pNext->offset - R->ss->offset == b->offset - a->offset) {
+ return R;
+ }
+
+ R = R->pNext;
+ }
+
+ return NULL;
+}
+
+sym_t* sym_tail(sym_t *s)
+{
+ while (s->pNext) {
+ s = s->pNext;
+ }
+
+ return s;
+}
+
+int sym_count(sym_t *s)
+{
+ int count = 0;
+
+ while (s) {
+ count++;
+ s = s->pNext;
+ }
+
+ return count;
+}
+
+sym_t* sym_copylist(sym_t *s)
+{
+ sym_t *head = NULL;
+ sym_t *prev = head;
+
+ while (s) {
+ sym_t *copy = sym_init_from_sym(s);
+ if (!copy) {
+ return NULL;
+ }
+
+ copy->pPrev = prev;
+
+ if (prev) {
+ prev->pNext = copy;
+ }
+
+ if (!head) {
+ head = copy;
+ }
+
+ prev = copy;
+ s = s->pNext;
+ }
+
+ return head;
+}
+
+void seq_enforce_uniqueness(seq_rule_t *G)
+{
+ seq_rule_t *R = G;//->pNext;
+ seq_rule_t **ppr = &G->pNext;
+
+ while (R) {
+ if (R == G || R->count > 1) {
+ sym_t *s = R->ss;
+ sym_t **pp = &R->ss;
+
+ while (s) {
+ if (s->r && s->r->count == 1) {
+ sym_t *temp_itr;
+
+ *pp = s->r->ss;
+
+ temp_itr = s->r->ss;
+ while (temp_itr) {
+ temp_itr->offset += s->offset;
+ temp_itr = temp_itr->pNext;
+ }
+
+ s->r->ss->pPrev = s->pPrev;
+ if (s->pNext) {
+ s->pNext->pPrev = sym_tail(s->r->ss);
+ }
+
+ sym_tail(s->r->ss)->pNext = s->pNext;
+ s = s->r->ss;
+ continue;
+ }
+
+ pp = &s->pNext;
+ s = s->pNext;
+ }
+
+ ppr = &R->pNext;
+ } else {
+ *ppr = R->pNext;
+ }
+
+ R = R->pNext;
+ }
+}
+
+void seq_merge_small_rules(seq_rule_t *G, int thresh)
+{
+ seq_rule_t *R = G;
+
+ while (R) {
+ if (sym_count(R->ss) <= thresh) {
+ //printf("count %d > %d for %d\n", sym_count(R->ss), thresh, R->c);
+ sym_t *s = R->ss;
+ sym_t **pp = &R->ss;
+
+ while (s) {
+ if (s->r) {
+ sym_t *copylist;
+ sym_t *copylist_itr;
+
+ s->r->count--;
+
+ copylist = sym_copylist(s->r->ss);
+ if (!copylist) {
+ return;
+ }
+
+ copylist_itr = copylist;
+ while (copylist_itr) {
+ copylist_itr->offset += s->offset;
+ copylist_itr = copylist_itr->pNext;
+ }
+
+ *pp = copylist;
+ copylist->pPrev = s->pPrev;
+ if (s->pNext) {
+ s->pNext->pPrev = sym_tail(copylist);
+ }
+
+ sym_tail(copylist)->pNext = s->pNext;
+ pp = &(sym_tail(copylist)->pNext);
+ s = sym_tail(copylist)->pNext;
+ continue;
+ }
+
+ pp = &s->pNext;
+ s = s->pNext;
+ }
+ }
+
+ R = R->pNext;
+ }
+
+ seq_enforce_uniqueness(G);
+}
+
+void seq_extract_hierarchy(seq_rule_t *G)
+{
+ int next_rule = -2;
+ sym_t *cursym = G->ss;
+
+ while (cursym) {
+ sym_t *m = NULL;
+ seq_rule_t *mr = NULL;
+
+ if (cursym->pPrev && cursym->pPrev->pPrev) {
+ mr = seq_match_digram(G->pNext, cursym->pPrev, cursym);
+ if (mr) {
+ if (cursym->pPrev->r) {
+ cursym->pPrev->r->count--;
+ }
+
+ if(cursym->r) {
+ cursym->r->count--;
+ }
+
+ mr->count++;
+
+ cursym->pPrev->r = mr;
+ cursym->pPrev->c = mr->c;
+ cursym->pPrev->pNext = cursym->pNext;
+ cursym->pNext->pPrev = cursym->pPrev;
+ cursym = cursym->pPrev;
+ }
+
+ m = sym_match_digram(G->ss, cursym->pPrev->pPrev, cursym->pPrev, cursym);
+ if (m) {
+ seq_rule_t *newr;
+
+ if (cursym->pPrev->r) {
+ cursym->pPrev->r->count--;
+ }
+
+ if (cursym->r) {
+ cursym->r->count--;
+ }
+
+ newr = seq_grammer_insert_new_rule(G, next_rule, m, m->pNext);
+ if (!newr) {
+ return;
+ }
+
+ m->r = newr;
+ m->c = next_rule;
+ m->pNext = m->pNext->pNext;
+ m->pNext->pPrev = m;
+
+ cursym->pPrev->r = newr;
+ cursym->pPrev->c = next_rule;
+ cursym->pPrev->pNext = cursym->pNext;
+ cursym->pNext->pPrev = cursym->pPrev;
+ cursym = cursym->pPrev;
+
+ next_rule--;
+ }
+ }
+
+ if (!m && !mr) {
+ cursym = cursym->pNext;
+ }
+ }
+
+ seq_enforce_uniqueness(G);
+ seq_merge_small_rules(G, 2);
+// seq_enforce_uniqueness(G);
+}
+
+void seq_compute_lengths(seq_rule_t *G)
+{
+ seq_rule_t *R = G->pNext;
+ sym_t *s;
+ int sum;
+
+ while (R) {
+ sum = 0;
+ s = R->ss;
+
+ while (s) {
+ if (s->c >= 0) {
+ if (s->offset + s->c > sum) {
+ sum = s->offset + s->c;
+ }
+ }
+
+ if (s->c < 0) {
+ if (s->offset + s->r->length > sum) {
+ sum = s->offset + s->r->length;
+ }
+ }
+
+ s = s->pNext;
+ }
+
+ R->length = sum;
+ R = R->pNext;
+ }
+
+ sum = 0;
+ s = G->ss;
+
+ while (s) {
+ if (s->c >= 0) {
+ if (s->offset + s->c > sum) {
+ sum = s->offset + s->c;
+ }
+ }
+
+ if (s->c < 0) {
+ if (s->offset + s->r->length > sum) {
+ sum = s->offset + s->r->length;
+ }
+ }
+
+ s = s->pNext;
+ }
+
+ G->length = sum;
+} \ No newline at end of file
OpenPOWER on IntegriCloud