Merge remote-tracking branch 'linkotec/master'

author: Haruki Hasegawa <h6a.h4i.0@gmail.com> 2016-05-05 13:24:55 +0900
committer: Haruki Hasegawa <h6a.h4i.0@gmail.com> 2016-05-05 13:24:55 +0900
commit: da3213cf045e0c7c4971d8b44272d1d86d689ceb (patch)
tree: 33e17e0a166f03307ebf11e8ab2891ae1ab90f61 /src/ffts_real_nd.c
parent: fa1780c68593762b1e4bdbc46d83912db3eba27a (diff)
parent: 944d14c9151f6b20145de0cdae38e366e73c9432 (diff)
download: ffts-da3213cf045e0c7c4971d8b44272d1d86d689ceb.zip
ffts-da3213cf045e0c7c4971d8b44272d1d86d689ceb.tar.gz
1 files changed, 241 insertions, 169 deletions
diff --git a/src/ffts_real_nd.c b/src/ffts_real_nd.c
index fe9ef69..89ef7f7 100644
--- a/src/ffts_real_nd.c
+++ b/src/ffts_real_nd.c
@@ -1,197 +1,269 @@
 /*
- 
- This file is part of FFTS -- The Fastest Fourier Transform in the South
-  
- Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
- Copyright (c) 2012, The University of Waikato 
- 
- All rights reserved.
-
- Redistribution and use in source and binary forms, with or without
- modification, are permitted provided that the following conditions are met:
- 	* Redistributions of source code must retain the above copyright
- 		notice, this list of conditions and the following disclaimer.
- 	* Redistributions in binary form must reproduce the above copyright
- 		notice, this list of conditions and the following disclaimer in the
- 		documentation and/or other materials provided with the distribution.
- 	* Neither the name of the organization nor the
-	  names of its contributors may be used to endorse or promote products
- 		derived from this software without specific prior written permission.
-
- THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
- ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
- WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
- DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
- (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
- LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
- SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+This file is part of FFTS -- The Fastest Fourier Transform in the South
+
+Copyright (c) 2012, Anthony M. Blake <amb@anthonix.com>
+Copyright (c) 2012, The University of Waikato
+
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in the
+documentation and/or other materials provided with the distribution.
+* Neither the name of the organization nor the
+names of its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL ANTHONY M. BLAKE BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 */
 
 #include "ffts_real_nd.h"
+#include "ffts_real.h"
+#include "ffts_internal.h"
+#include "ffts_transpose.h"
+
+static void /* destroy callback for N-d real plans; also the cleanup path of a failed ffts_init_nd_real() */
+ffts_free_nd_real(ffts_plan_t *p)
+{
+    if (p->plans) { /* may still be NULL when init failed before allocating the array */
+        int i, j;
+
+        for (i = 0; i < p->rank; i++) {
+            ffts_plan_t *plan = p->plans[i];
+
+			if (plan) {
+				for (j = 0; j < i; j++) { /* was this sub-plan already handled at a lower index? */
+					if (p->plans[j] == plan) { /* compare pointers: init shares plans by Ms or Ns and rewrites Ns, so sizes do not identify sharing */
+						plan = NULL;
+						break;
+					}
+				}
+
+				if (plan) {
+					ffts_free(plan); /* each distinct sub-plan is released exactly once */
+				}
+			}
+        }
 
-#ifdef __ARM_NEON__
-#include "neon.h"
-#endif
-
-void ffts_free_nd_real(ffts_plan_t *p) {
-
-	int i;
-	for(i=0;i<p->rank;i++) {
-		
-		ffts_plan_t *x = p->plans[i];
-
-		int k;
-		for(k=i+1;k<p->rank;k++) {
-			if(x == p->plans[k]) p->plans[k] = NULL;
-		}
-		
-		if(x)	ffts_free(x);
-	}
-
-	free(p->Ns);
-	free(p->Ms);
-	free(p->plans);
-	free(p->buf);
-	free(p->transpose_buf);
-	free(p);
-}
+        free(p->plans);
+    }
+
+    if (p->buf) {
+        ffts_aligned_free(p->buf); /* buf comes from ffts_aligned_malloc(), so it needs the matching free */
+    }
 
-void ffts_scalar_transpose(uint64_t *in, uint64_t *out, int w, int h, uint64_t *buf) {
+    if (p->Ns) {
+        free(p->Ns);
+    }
 
-	size_t i,j;
-	for(i=0;i<w;i+=1) {
-		for(j=0;j<h;j+=1) {
-			out[i*h + j] = in[j*w + i];
-		}
-	}
+    if (p->Ms) {
+        free(p->Ms);
+    }
 
+    free(p); /* the plan object itself goes last */
 }
 
-void ffts_execute_nd_real(ffts_plan_t *p, const void *  in, void *  out) {
+static void /* transform handler that ffts_init_nd_real() installs when sign < 0 */
+ffts_execute_nd_real(ffts_plan_t *p, const void *in, void *out)
+{
+    const size_t Ms0 = p->Ms[0];
+    const size_t Ns0 = p->Ns[0];
+
+    uint32_t *din = (uint32_t*) in; /* input is stepped in 32-bit units (presumably one float each; TODO confirm) */
+    uint64_t *buf = p->buf; /* scratch buffer owned by the plan */
+    uint64_t *dout = (uint64_t*) out; /* output is stepped in 64-bit units (presumably one complex float each; TODO confirm) */
+
+    ffts_plan_t *plan;
+    int i;
+    size_t j;
+
+    plan = p->plans[0];
+    for (j = 0; j < Ns0; j++) { /* first pass: Ns0 calls, reading Ms0 input units and writing Ms0/2+1 scratch units per step */
+        plan->transform(plan, din + (j * Ms0), buf + (j * (Ms0 / 2 + 1)));
+    }
 
-	uint32_t *din = (uint32_t *)in;
-	uint64_t *buf = p->buf;
-	uint64_t *dout = (uint64_t *)out;
+    ffts_transpose(buf, dout, Ms0 / 2 + 1, Ns0); /* move the first-pass result from scratch into out */
 
-	size_t i,j;
-	for(i=0;i<p->Ns[0];i++) {
-		p->plans[0]->transform(p->plans[0], din + (i * p->Ms[0]), buf + (i * (p->Ms[0] / 2 + 1)));	
-	}
-	ffts_scalar_transpose(buf, dout, p->Ms[0] / 2 + 1, p->Ns[0], p->transpose_buf);	
+    for (i = 1; i < p->rank; i++) { /* remaining dimensions: transform out -> scratch, then transpose back into out */
+        const size_t Ms = p->Ms[i];
+        const size_t Ns = p->Ns[i];
 
-	for(i=1;i<p->rank;i++) {
-		for(j=0;j<p->Ns[i];j++) { 
-			p->plans[i]->transform(p->plans[i], dout + (j * p->Ms[i]), buf + (j * p->Ms[i]));	
-		}
-		ffts_scalar_transpose(buf, dout, p->Ms[i], p->Ns[i], p->transpose_buf);	
-	}
+        plan = p->plans[i];
+
+        for (j = 0; j < Ns; j++) { /* Ns calls, each advancing Ms 64-bit units in both buffers */
+            plan->transform(plan, dout + (j * Ms), buf + (j * Ms));
+        }
+
+        ffts_transpose(buf, dout, Ms, Ns);
+    }
 }
 
-void ffts_execute_nd_real_inv(ffts_plan_t *p, const void *  in, void *  out) {
+static void /* transform handler that ffts_init_nd_real() installs when sign >= 0 */
+ffts_execute_nd_real_inv(ffts_plan_t *p, const void *in, void *out)
+{
+    const size_t Ms0 = p->Ms[0];
+    const size_t Ms1 = p->Ms[1]; /* NOTE(review): index 1 is read unconditionally, so rank >= 2 is assumed; confirm callers */
+    const size_t Ns0 = p->Ns[0];
+    const size_t Ns1 = p->Ns[1];
+
+    uint64_t *din = (uint64_t*) in; /* input is stepped in 64-bit units */
+    uint64_t *buf = p->buf; /* first scratch region, at the start of the plan's buffer */
+    uint64_t *buf2; /* second scratch region, set below */
+    float    *doutr = (float*) out; /* output is written as floats */
 
-	uint64_t *din = (uint64_t *)in;
-	uint64_t *buf = p->buf;
-	uint64_t *buf2;
-	uint64_t *dout = (uint64_t *)out;
-	size_t vol = 1;
-	
-	float *bufr = (float *)(p->buf);
-	float *doutr = (float *)out;
+    ffts_plan_t *plan;
+    size_t vol;
 
-	size_t i,j;
+    int i;
+    size_t j;
 
-	for(i=0;i<p->rank;i++) {
-		vol *= p->Ns[i];
-	}
+    vol = p->Ns[0]; /* vol becomes the product of all Ns entries */
+    for (i = 1; i < p->rank; i++) {
+        vol *= p->Ns[i];
+    }
 
-	buf2 = buf + vol;
+    buf2 = buf + vol; /* second region starts vol 64-bit elements into p->buf */
 
-	ffts_scalar_transpose(din, buf, p->Ms[0], p->Ns[0], p->transpose_buf);	
+    ffts_transpose(din, buf, Ms0, Ns0); /* copy the input into scratch in transposed order */
 
-	for(i=0;i<p->Ms[0];i++) {
-		p->plans[0]->transform(p->plans[0], buf + (i * p->Ns[0]), buf2 + (i * p->Ns[0]));	
-	}
-	
-	ffts_scalar_transpose(buf2, buf, p->Ns[0], p->Ms[0], p->transpose_buf);	
-	for(j=0;j<p->Ms[1];j++) { 
-  	p->plans[1]->transform(p->plans[1], buf + (j * (p->Ms[0])), &doutr[j * p->Ns[1]]);	
-  }
+    plan = p->plans[0];
+    for (j = 0; j < Ms0; j++) { /* Ms0 calls with plans[0], each advancing Ns0 elements, buf -> buf2 */
+        plan->transform(plan, buf + (j * Ns0), buf2 + (j * Ns0));
+    }
+
+    ffts_transpose(buf2, buf, Ns0, Ms0); /* transpose back into the first scratch region */
+
+    plan = p->plans[1]; /* NOTE(review): only plans[0] and plans[1] are ever used, so ranks above 2 look unhandled; confirm */
+    for (j = 0; j < Ms1; j++) { /* final pass writes Ns1 floats per step straight into out */
+        plan->transform(plan, buf + (j * Ms0), &doutr[j * Ns1]);
+    }
 }
 
-ffts_plan_t *ffts_init_nd_real(int rank, size_t *Ns, int sign) {
-	size_t vol = 1;
-	size_t bufsize;
-
-	ffts_plan_t *p = malloc(sizeof(ffts_plan_t));
-
-	if(sign < 0) p->transform = &ffts_execute_nd_real;
-	else         p->transform = &ffts_execute_nd_real_inv;
-
-	p->destroy = &ffts_free_nd_real;
-
-	p->rank = rank;
-	p->Ns = malloc(sizeof(size_t) * rank);
-	p->Ms = malloc(sizeof(size_t) * rank);
-	p->plans = malloc(sizeof(ffts_plan_t **) * rank);
-	int i;
-	for(i=0;i<rank;i++) {
-		p->Ns[i] = Ns[i];
-		vol *= Ns[i];	
-	}
-	
-	//There is probably a prettier way of doing this, but it works..
-	if(sign < 0) {
-		bufsize = 2 * vol;
-	}
-	else {
-		bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol);
-	}
-
-	p->buf = valloc(sizeof(float) * bufsize);
-
-	for(i=0;i<rank;i++) {
-		p->Ms[i] = vol / p->Ns[i];
-		
-		p->plans[i] = NULL;
-		int k;
-
-		if(sign < 0) {
-			for(k=1;k<i;k++) {
-				if(p->Ms[k] == p->Ms[i]) p->plans[i] = p->plans[k];
-			}
-			if(!i)                p->plans[i] = ffts_init_1d_real(p->Ms[i], sign); 
-			else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ms[i], sign); 
-		}else{
-  		for(k=0;k<i;k++) {
-  			if(p->Ns[k] == p->Ns[i]) p->plans[i] = p->plans[k];
-  		}
-			if(i==rank-1)         p->plans[i] = ffts_init_1d_real(p->Ns[i], sign); 
-			else if(!p->plans[i]) p->plans[i] = ffts_init_1d(p->Ns[i], sign); 
-		}
-	}
-	if(sign < 0) {
-		for(i=1;i<rank;i++) {
-			p->Ns[i] = p->Ns[i] / 2 + 1;
-		}
-	}else{
-		for(i=0;i<rank-1;i++) {
-			p->Ms[i] = p->Ms[i] / 2 + 1;
-		}
-	}
-
-	p->transpose_buf = valloc(sizeof(float) * 2 * 8 * 8);
-	return p;
+FFTS_API ffts_plan_t* /* builds a rank-dimensional real plan; returns NULL if any allocation or sub-plan creation fails */
+ffts_init_nd_real(int rank, size_t *Ns, int sign)
+{
+    int i;
+    size_t vol = 1;
+    size_t bufsize;
+    ffts_plan_t *p;
+
+    p = (ffts_plan_t*) calloc(1, sizeof(*p)); /* zero-filled so the cleanup path can test each member pointer */
+    if (!p) {
+        return NULL;
+    }
+
+    if (sign < 0) { /* sign selects which of the two execute routines the plan uses */
+        p->transform = &ffts_execute_nd_real;
+    } else {
+        p->transform = &ffts_execute_nd_real_inv;
+    }
+
+    p->destroy = &ffts_free_nd_real;
+    p->rank    = rank;
+
+    p->Ms = (size_t*) malloc(rank * sizeof(*p->Ms)); /* NOTE(review): rank and Ns are not validated (rank <= 0, NULL Ns); confirm callers */
+    if (!p->Ms) {
+        goto cleanup;
+    }
+
+    p->Ns = (size_t*) malloc(rank * sizeof(*p->Ns));
+    if (!p->Ns) {
+        goto cleanup;
+    }
+
+    for (i = 0; i < rank; i++) { /* copy the caller's dimensions and accumulate their product */
+        p->Ns[i] = Ns[i];
+        vol *= Ns[i]; /* NOTE(review): the product is not checked for overflow */
+    }
+
+    /* there is probably a prettier way of doing this, but it works.. */
+    if (sign < 0) {
+        bufsize = 2 * vol;
+    } else {
+        bufsize = 2 * (Ns[0] * ((vol / Ns[0]) / 2 + 1) + vol); /* NOTE(review): divides by Ns[0]; a zero dimension is not rejected */
+    }
+
+    p->buf = ffts_aligned_malloc(bufsize * sizeof(float)); /* bufsize is a count of floats */
+    if (!p->buf) {
+        goto cleanup;
+    }
+
+    p->plans = (ffts_plan_t**) calloc(rank, sizeof(*p->plans)); /* zero-filled: a NULL entry means no plan assigned yet */
+    if (!p->plans) {
+        goto cleanup;
+    }
+
+    for (i = 0; i < rank; i++) { /* one 1-d sub-plan per dimension, reusing an earlier one when the size matches */
+        int k;
+
+        p->Ms[i] = vol / p->Ns[i];
+
+        if (sign < 0) {
+            if (!i) { /* dimension 0 gets the real 1-d plan */
+                p->plans[i] = ffts_init_1d_real(p->Ms[i], sign);
+            } else {
+                for (k = 1; k < i; k++) { /* search starts at 1, so the real plan in slot 0 is never reused */
+                    if (p->Ms[k] == p->Ms[i]) {
+                        p->plans[i] = p->plans[k];
+                        break;
+                    }
+                }
+
+                if (!p->plans[i]) {
+                    p->plans[i] = ffts_init_1d(p->Ms[i], sign);
+                    p->Ns[i] = p->Ns[i] / 2 + 1; /* NOTE(review): applied only when a new plan is created; the removed code adjusted every i >= 1; confirm */
+                }
+            }
+        } else {
+            if (i == rank - 1) { /* the last dimension gets the real 1-d plan */
+                p->plans[i] = ffts_init_1d_real(p->Ns[i], sign);
+            } else {
+                for (k = 0; k < i; k++) {
+                    if (p->Ns[k] == p->Ns[i]) {
+                        p->plans[i] = p->plans[k];
+                        break;
+                    }
+                }
+
+                if (!p->plans[i]) {
+                    p->plans[i] = ffts_init_1d(p->Ns[i], sign);
+                    p->Ms[i] = p->Ms[i] / 2 + 1; /* NOTE(review): applied only when a new plan is created; the removed code adjusted every i < rank - 1; confirm */
+                }
+            }
+        }
+
+        if (!p->plans[i]) { /* sub-plan creation failed */
+            goto cleanup;
+        }
+    }
+
+    return p;
+
+cleanup:
+    ffts_free_nd_real(p); /* releases whatever was allocated so far, including p */
+    return NULL;
 }
 
+FFTS_API ffts_plan_t* /* convenience wrapper: builds a rank-2 real plan from two dimensions */
+ffts_init_2d_real(size_t N1, size_t N2, int sign)
+{
+    size_t Ns[2]; /* packed as {N1, N2} for the N-d initializer */
 
-ffts_plan_t *ffts_init_2d_real(size_t N1, size_t N2, int sign) {
-	size_t Ns[2];
-	Ns[0] = N1;
-	Ns[1] = N2;
-	return ffts_init_nd_real(2, Ns, sign);
+    Ns[0] = N1;
+    Ns[1] = N2;
+    return ffts_init_nd_real(2, Ns, sign); /* result, including NULL, is passed through unchanged */
 }
-// vim: set autoindent noexpandtab tabstop=3 shiftwidth=3:
author	Haruki Hasegawa <h6a.h4i.0@gmail.com>	2016-05-05 13:24:55 +0900
committer	Haruki Hasegawa <h6a.h4i.0@gmail.com>	2016-05-05 13:24:55 +0900
commit	da3213cf045e0c7c4971d8b44272d1d86d689ceb (patch)
tree	33e17e0a166f03307ebf11e8ab2891ae1ab90f61 /src/ffts_real_nd.c
parent	fa1780c68593762b1e4bdbc46d83912db3eba27a (diff)
parent	944d14c9151f6b20145de0cdae38e366e73c9432 (diff)
download	ffts-da3213cf045e0c7c4971d8b44272d1d86d689ceb.zip ffts-da3213cf045e0c7c4971d8b44272d1d86d689ceb.tar.gz