P99
test-p99-pow.c

an example for computations with large matricesThis uses the VLA array passing macros, P99_DO and P99_PARALLEL_DO

/* This may look like nonsense, but it really is -*- mode: C -*- */
/* */
/* Except for parts copied from previous work and as explicitly stated below, */
/* the author and copyright holder for this work is */
/* all rights reserved, 2011-2012 Jens Gustedt, INRIA, France */
/* */
/* This file is free software; it is part of the P99 project. */
/* You can redistribute it and/or modify it under the terms of the QPL as */
/* given in the file LICENSE. It is distributed without any warranty; */
/* without even the implied warranty of merchantability or fitness for a */
/* particular purpose. */
/* */
#include "p99_checkargs.h"
#include "p99_for.h"
#include "p99_map.h"
#include "p99_new.h"
#include "p99_atomic.h"
#ifdef _OPENMP
# include <omp.h>
#else
/* If this is not compiled with OpenMP support, just ignore the two
functions that we borrowed from them. */
#define omp_get_wtime(...) 0.0;
#define omp_set_num_threads(...) P99_NOP
#endif
/* Print a 2D double array, the (more or less) hidden function. */
void
printFunc(P99_AARG(double const, A, 2)) {
size_t const na0 = P99_ALEN(*A, 0);
size_t const na1 = P99_ALEN(*A, 1);
P99_DO(size_t, i, 0, na0) {
P99_DO(size_t, j, 0, na1) {
printf("%g\t", (*A)[i][j]);
}
printf("\n");
}
printf("\n");
fflush(0);
}
P99_CA_WRAP_DECLARE(printFunc, void, (P99_AARG(double const, A, 2)), (P99_ANAME(A, 2)), (), (2));
/* Print a 2D array, the user interface */
#define print(ARR) P99_CA_CALL(printFunc, (), (2), P99_ACALL(ARR, 2, double const))
/* Check a 2D double array if it is all 0.0, the (more or less) hidden function. */
/* The array is called C. The lengths are not accessible through a
name, but only trough the ::P99_ALEN macro. */
inline
bool
checkZeroFunc(P99_AARG(double const, C, 2)) {
atomic_flag ret = ATOMIC_FLAG_INIT;
P99_PARALLEL_DO(size_t, i, 0, P99_ALEN(*C, 0)) {
P99_PARALLEL_DO(size_t, j, 0, P99_ALEN(*C, 1)) {
if ((*C)[i][j] != 0.0) atomic_flag_test_and_set_explicit(&ret, memory_order_release);
}
}
return !atomic_flag_test_and_set_explicit(&ret, memory_order_acquire);
}
P99_CA_WRAP_DECLARE(checkZeroFunc, bool, (P99_AARG(double const, C, 2)), (P99_ANAME(C, 2)), (), (2));
/* The user interface. It receives just one argument the pointer to
the matrix. */
#define checkZero(VC) P99_CA_CALL(checkZeroFunc, (), (2), P99_ACALL(VC, 2, double const))
/* Compute the dot-product of two double vectors, the (more or less) hidden function. */
/* The two vectors are called A and B. Their length are not accessible
through a name, but only trough the ::P99_ALEN macro. */
/* The parameter ret serves as an accumulator. */
inline
double
dotproductFunc(double ret,
P99_AARG(double const, A, 1),
P99_AARG(double const, B, 1)) {
assert(P99_ALEN(A, 1) == P99_ALEN(B, 1));
P99_DO(size_t, i, 0, P99_ALEN(A, 1))
ret += (*A)[i] * (*B)[i];
return ret;
}
P99_CA_WRAP_DECLARE(dotproductFunc,
double,
(double ret, P99_AARG(double const, A, 1), P99_AARG(double const, B, 1)),
(ret, P99_ANAME(A, 1), P99_ANAME(B, 1)), (), (2, 4));
/* A helper macro that puts the accumulator argument in front. */
#define dotproduct1(VA, VB, CAR) \
P99_CA_CALL(dotproductFunc, (), (2, 4), CAR, P99_ACALL(VA, 1, double const), P99_ACALL(VB, 1, double const))
/* A helper macro that translates a va_arg list into the five
arguments for dotproductFunc. */
#define dotproduct0(...) dotproduct1(__VA_ARGS__)
/* The user interface. It receives three arguments of which the last
is optional. */
#define dotproduct(...) dotproduct0(P99_CALL_DEFARG_LIST(dotproduct, 3, __VA_ARGS__))
/* Define the optional argument the accumulator to default to 0.0 */
#define dotproduct_defarg_2() 0.0
/* Compute the transpose of a 2D matrix, the (more or less) hidden function. */
/* The input matrix is called B, the length in the two dimensions are
called nb0 and nb1, respectively. */
/* The parameter buf provides the memory where the transpose should be
stored. It is also returned. */
inline
void*
transposeFunc(P99_AARG(double const, B, 2)) {
size_t const nb0 = P99_ALEN(*B, 0);
size_t const nb1 = P99_ALEN(*B, 1);
P99_AREF(double, A, nb1, nb0) = P99_ALLOC(*A);
P99_PARALLEL_DO(size_t, j, 0, nb0) {
/* sequentialize access to the rows of B */
P99_AREF(double const, BV, nb1) = &(*B)[j];
P99_PARALLEL_DO(size_t, i, 0, nb1) {
(*A)[i][j] = (*BV)[i];
}
}
return A;
}
P99_CA_WRAP_DECLARE(transposeFunc, void*, (P99_AARG(double const, B, 2)), (P99_ANAME(B, 2)), (), (2));
/* The user interface. */
#define transpose(VB) ((double const(*)[P99_ALEN(*VB, 1)][P99_ALEN(*VB, 1)])P99_CA_CALL(transposeFunc, (), (2), P99_ACALL(VB, 2, double const)))
/* Use constants that represent the sizes of blocks for a
decomposition of the matrices used by the mult function.
The total amount of elements used in the inner part of the
iteration will be (istep + jstep)*kstep for A and B, and
istep*jstep for C. */
enum {
/* A block should be such that several such submatrices will
reasonably fit into the cache on a decent machine. */
blocksize = (1u << 21)/sizeof(double),
/* istep * jstep should be such that the load into cache of any of
the submatrices is associated to a large number of operations. */
istep = (1u << 5),
jstep = (1u << 5),
/* The k-dimension of the matrices is accessed sequentially, so
this part should be longer to give the automatic prefetcher an
occasion to kick in. */
kstep = (blocksize - istep*jstep)/(istep + jstep)
};
/* Compute the product of two 2D matrices. The (more or less) hidden
function. */
/* The input matrices are called A and B. */
/* The matrix C receives the result. */
/* This function is not declared as inline. It will be way to big that
this would make sense. */
void
multFunc(P99_AARG(double const, A, 2),
P99_AARG(double const, B, 2),
P99_AREF(double, C, P99_ALEN(*A, 0), P99_ALEN(*B, 1)));
void,
(P99_AARG(double const, A, 2),
P99_AARG(double const, B, 2),
P99_AREF(double, C, P99_ALEN(*A, 0), P99_ALEN(*B, 1))),
(P99_ANAME(A, 2), P99_ANAME(B, 2), C),
(),
(2, 5, 6));
/* The user interface. It receives just the three pointers to
the matrix as arguments. */
#define mult(ARR, BRR, CRR) \
P99_CA_CALL(multFunc, \
(), \
(2, 5, 6), \
P99_ACALL(ARR, 2, double const), \
P99_ACALL(BRR, 2, double const), \
CRR)
/* All above would typically be written in a header (.h) file ***/
/****************************************************************/
/*************** end of interface section ***********************/
/****************************************************************/
/* From here on are only things that concern the implementation */
/* This starts with instantiations of for the inline function. **/
P99_INSTANTIATE(bool, checkZeroFunc,
P99_AARG(double const, C, 2));
P99_INSTANTIATE(double, dotproductFunc,
double,
P99_AARG(double const, A, 1),
P99_AARG(double const, B, 1));
P99_INSTANTIATE(void*, transposeFunc,
P99_AARG(double const, B, 2));
P99_INSTANTIATE(void, multFunc,
P99_AARG(double const, A, 2),
P99_AARG(double const, B, 2),
P99_AREF(double, C, P99_ALEN(*A, 0), P99_ALEN(*B, 1)));
P99_CA_WRAP_DEFINE(printFunc, void, (P99_AARG(double const, A, 2)), (P99_ANAME(A, 2)), (), (2));
P99_CA_WRAP_DEFINE(checkZeroFunc, bool, (P99_AARG(double const, C, 2)), (P99_ANAME(C, 2)), (), (2));
P99_CA_WRAP_DEFINE(dotproductFunc,
double,
(double ret, P99_AARG(double const, A, 1), P99_AARG(double const, B, 1)),
(ret, P99_ANAME(A, 1), P99_ANAME(B, 1)), (), (2, 4));
void,
(P99_AARG(double const, A, 2),
P99_AARG(double const, B, 2),
P99_AREF(double, C, P99_ALEN(*A, 0), P99_ALEN(*B, 1))),
(P99_ANAME(A, 2), P99_ANAME(B, 2), C),
(),
(2, 5, 6));
P99_CA_WRAP_DEFINE(transposeFunc,
void*,
(P99_AARG(double const, B, 2)),
(P99_ANAME(B, 2), buf),
(),
(2));
/* Matrix multiplication has three nested for loops. One for each
dimension and on for the dotproduct between the rows of A and the
columns of B. We split all these loops in two. One with a longer
step count, the values istep, jstep and kstep from above, and an
inner one that then goes through the individual elements.
For convenience we realize the inner three loops with step size 1
as a macro. This will help optimization where all the three
parameters ILEN, JLEN and KLEN correspond to compile time
constants, namely istep, jstep and kstep, respectively.
ILEN, JLEN and KLEN are the length of the steps in the 3
dimensions. Generally they will be equal to istep, jstep and kstep,
respectively, but will be less than that for the last iteration. */
#define INNER_CASE(C, A, B, ISTART, ILEN, JSTART, JLEN, KSTART, KLEN) \
/* The local type that handles a partial line. Two of these will be \
created below and then be used for the dot product. */ \
typedef double const kLineFrag[KLEN]; \
/* The local type for the partial line in the result matrix C. */ \
typedef double cLine[JLEN]; \
/* None of the loops are parallelized at this level. All threads \
should have enough work to do such that they may reuse their \
caches well. */ \
P99_DO(size_t, i0, ISTART, ILEN) { \
register kLineFrag* AF = (kLineFrag*)&((*A)[i0][KSTART]); \
register cLine*restrict CL = (cLine*)&(*C)[i0][JSTART]; \
P99_DO(size_t, j0, JSTART, JLEN) { \
register kLineFrag* BF = (kLineFrag*)&((*B)[j0][KSTART]); \
register double*restrict c = &(*CL)[j0-JSTART]; \
*c = dotproduct(AF, BF, *c); \
} \
}
/* The implementation of the function itself. */
void
multFunc(P99_AARG(double const, A, 2),
P99_AARG(double const, B, 2),
P99_AREF(double, C, P99_ALEN(*A, 0), P99_ALEN(*B, 1))) {
size_t const na0 = P99_ALEN(*A, 0);
size_t const na1 = P99_ALEN(*A, 1);
size_t const nb0 = P99_ALEN(*B, 0);
size_t const nb1 = P99_ALEN(*B, 1);
/* check that the dimensions of the matrices fit */
assert(na1 == nb0);
double times0 = omp_get_wtime();
/* transpose B to have its columns consecutive in memory */
double const P99_ARRAY(*BS, nb1, nb0) = transpose(B);
assert(checkZero(C));
double times1 = omp_get_wtime();
/* The loop for the dot product cannot be easily parallelized
because it accumulates the result. If we would want to do this,
we would need to use reduction for the entries of C. */
P99_DO(size_t, k, 0, na1, kstep) {
/* The loops in the 2 dimensions can be parallelized because every
entry of the result is computed independently. Organize a
sufficiently large block for every individual thread such that
it is able to reuse cached values repeatedly. For the general
execution a thread works on
(istep + jstep)*kstep
elements for matrices A and B, C must not necessarily be
cached. On these it performs istep*jstep*kstep operations, so
istep*jstep / (istep + jstep)
or about istep/2 operations per entry. */
P99_PARALLEL_DO(size_t, i, 0, na0, istep) {
P99_PARALLEL_DO(size_t, j, 0, nb1, jstep) {
register size_t const kMax = (na1 - k);
register size_t const iMax = (na0 - i);
register size_t const jMax = (nb1 - j);
if (P99_LIKELY(iMax >= istep)) {
if (P99_LIKELY(jMax >= jstep)) {
if (P99_LIKELY(kMax >= kstep)) {
/* This is the principal case for which we want to
optimize as much as possible. */
INNER_CASE(C, A, BS, i, istep, j, jstep, k, kstep);
} else {
INNER_CASE(C, A, BS, i, istep, j, jstep, k, kMax);
}
} else {
if (kMax >= kstep) {
INNER_CASE(C, A, BS, i, istep, j, jMax, k, kstep);
} else {
INNER_CASE(C, A, BS, i, istep, j, jMax, k, kMax);
}
}
} else {
if (jMax >= jstep) {
if (kMax >= kstep) {
INNER_CASE(C, A, BS, i, iMax, j, jstep, k, kstep);
} else {
INNER_CASE(C, A, BS, i, iMax, j, jstep, k, kMax);
}
} else {
if (kMax >= kstep) {
INNER_CASE(C, A, BS, i, iMax, j, jMax, k, kstep);
} else {
INNER_CASE(C, A, BS, i, iMax, j, jMax, k, kMax);
}
}
}
}
}
}
double times2 = omp_get_wtime();
printf("%s time:\t%g\n", "setup", times1 - times0);
printf("%s time:\t%g\n", "mult", times2 - times1);
free((void*)BS);
}
int main(int argc, char*argv[]) {
#if __STDC_WANT_LIB_EXT1__
set_constraint_handler_s(abort_handler_s);
#endif
if (argc <= 3) {
fprintf(stderr, "Usage: %s n k m p e\n", argv[0]);
fputs(" for matrices C[n][m] = A[n][k] * B[k][m]\n", stderr);
fputs(" with p OpenMP processors\n", stderr);
fputs(" error generation e: 0 - no error\n", stderr);
fputs(" 1 - size error in C\n", stderr);
fputs(" 2 - null pointer for C\n", stderr);
return EXIT_SUCCESS;
}
size_t p = 1;
if (argc > 4)
p = strtoul(argv[4], NULL, 0);
omp_set_num_threads(p);
size_t err = 0;
if (argc > 5)
err = strtoul(argv[5], NULL, 0);
double const P99_ARRAY(A, 3, 2) = {
{2, 3 },
{1, 1 },
{-1, 0}
};
P99_AREF(double const, AP, 3, 2) = &A;
print(AP);
double const P99_ARRAY(B, 2, 4) = {
{2, 3, -1, 0},
{1, 1, 2, 1},
};
P99_AREF(double const, BP, 2, 4) = &B;
print(BP);
dotproduct(&(*BP)[0], &(*BP)[1]);
double P99_ARRAY(C, 3, 4) = {
{0, 0, 0, 0},
{0, 0, 0, 0},
{0, 0, 0, 0},
};
P99_AREF(double, CP, 3, 4) = &C;
mult(AP, BP, CP);
print(CP);
/* Now demonstrate the use of the matrix multiplication code with
matrices that have dynamic length. */
register const size_t n = strtoul(argv[1]);
register const size_t m = strtoul(argv[2]);
register const size_t k = strtoul(argv[3]);
printf("matrix length are n=%zu, m=%zu, k=%zu\n", n, m, k);
printf("step length are n=%d, m=%d, k=%d\n", istep, jstep, kstep);
/* Allocate the three matrices. The matrix dimensions are only
specified on the left side. The allocation expression only uses
the base type and sizeof information to do the allocation.
Also, each of n, m, k occurs exactly once, so the other
dimensions are enforced. */
P99_AREF(double, AR, n, /* */ k /* */) = P99_ALLOC(*AR);
P99_AREF(double, BR, P99_ALEN(*AR, 1), m /* */) = P99_ALLOC(*BR);
/* Test the error checks by perturbing matrix C:
- if err is 1 we add one to the size so a size mismatch should be detected
- if err is 2 we don't alloc but set to 0, so a null pointer should be detected */
P99_AREF(double, CR, P99_ALEN(*AR, 0)
/* */ + (err == 1 ? 1 : 0),
/* */ P99_ALEN(*BR, 1)) = (err == 2 ? 0 : P99_ALLOC(*CR, P99_LVAL(double)));
/* Initialize the two matrices to some arbitrary values */
/* This replaces the following nested loops: */
/* for (size_t j = 0; j < n; ++j) */
/* for (size_t i = 0; i < k; ++i) */
size_t const D[2] = { P99_ALEN(*AR, 0), P99_ALEN(*AR, 1) };
P99_FORALL(D, j, i) (*AR)[j][i] = i*j;
/* This replaces the following nested loops: */
/* for (size_t i = 0; i < k; ++i) */
/* for (size_t j = 0; j < m; ++j) */
size_t const E[2] = { P99_ALEN(*BR, 0), P99_ALEN(*BR, 1)};
P99_FORALL(E, i, j) (*BR)[i][j] = j + i;
/* run the multiplication itself */
mult(AR, BR, CR);
free(AR);
free(BR);
free(CR);
}
/* follow just some arbitrary compile tests */
enum { A = 3 };
size_t A0 = P99_IPOW(0, A);
size_t A1 = P99_IPOW(1, A);
size_t A2 = P99_IPOW(2, A);
size_t A3 = P99_IPOW(3, A);
size_t A4 = P99_IPOW(4, A);
p99_for.h
A preprocessor for loop implementation and some derived list handling macros.
p99_new.h
Macros for initialization and allocation.
P99_ARRAY
#define P99_ARRAY(ARR,...)
Definition: p99_for.h:596
P99_LIKELY
#define P99_LIKELY(...)
Mark the conditional expression as being likely.
Definition: p99_compiler.h:1010
P99_FORALL
#define P99_FORALL(NAME,...)
A multi-index for loop.
Definition: p99_for.h:956
p99_map.h
macros to produce lists of statements or declarations.
P99_ALEN
#define P99_ALEN(ARR, N)
Produce the length of the argument array ARR in terms of number of elements.
Definition: p99_for.h:678
P99_AARG
#define P99_AARG(TYPE, NAME, DIM, VAR)
Declare a pointer to array function argument of basetype TYPE, with name NAME, dimension DIM and nami...
Definition: p99_for.h:750
p
P00_CLAUSE2 p(P00_WEAK1(p00_cb))(P00_WEAK2(p00_cb)) pp99_callback_stack p00_at_quick_exit
p99_checkargs.h
Macros to check arguments to functions, in particular of variably modified types.
P99_UNUSED
#define P99_UNUSED(...)
check if the list of expressions is syntactically valid but don't evaluate it
Definition: p99_enum.h:215
P99_LVAL
#define P99_LVAL(...)
Define an lvalue of type T, where T is the first parameter in the variable parameter list.
Definition: p99_int.h:1084
P99_IPOW
#define P99_IPOW(N, X)
Compute the Nth multiplicative integer power of X.
Definition: p99_map.h:67
p99_atomic.h
P99_DO
#define P99_DO(TYPE, VAR, LOW, LEN, INCR)
A fortran like do-loop with bounds that are fixed at the beginning.
Definition: p99_for.h:882
P99_INSTANTIATE
#define P99_INSTANTIATE(RT, NAME,...)
Instantiate an inline function.
Definition: p99_defarg.h:241
main
#define main
strtoul
#define strtoul(...)
Default arguments for C99 function strtoul
Definition: p99_c99_default.h:206
i
P00_CLAUSE2 i(_Pragma("weak p00_getopt_comp"))(_Pragma("weak p00_getopt_comp
P99_AREF
#define P99_AREF(T, ARR,...)
Definition: p99_for.h:597
P99_ANAME
#define P99_ANAME(NAME, DIM, VAR)
Declare list of variable names as produced by P99_AARG.
Definition: p99_for.h:764
P99_ALLOC
#define P99_ALLOC(...)
Definition: p99_new.h:210
P99_CA_WRAP_DEFINE
#define P99_CA_WRAP_DEFINE(NAME, RET, TYPES, VARS,...)
Definition: p99_checkargs.h:188
P99_PARALLEL_DO
#define P99_PARALLEL_DO(TYPE, VAR, LOW, LEN, INCR)
as P99_DO but performs the iterations out of order
Definition: p99_for.h:889
p99_c99_default.h
P99_CA_WRAP_DECLARE
#define P99_CA_WRAP_DECLARE(NAME, RET, TYPES, VARS,...)
Definition: p99_checkargs.h:170