Disabled external gits

This commit is contained in:
2022-04-07 18:46:57 +02:00
parent 88cb3426ad
commit 15e7120d6d
5316 changed files with 4563444 additions and 6 deletions

View File

@@ -0,0 +1,61 @@
#3.0.1
#3.1.1
#3.2.0
3.2.4
#5745:37f59e65eb6c
5891:d8652709345d # introduce AVX
#5893:24b4dc92c6d3 # merge
5895:997c2ef9fc8b # introduce FMA
#5904:e1eafd14eaa1 # complex and AVX
5908:f8ee3c721251 # improve packing with ptranspose
#5921:ca808bb456b0 # merge
#5927:8b1001f9e3ac
5937:5a4ca1ad8c53 # New gebp kernel handling up to 3 packets x 4 register-level blocks
#5949:f3488f4e45b2 # merge
#5969:e09031dccfd9 # Disable 3pX4 kernel on Altivec
#5992:4a429f5e0483 # merge
before-evaluators
#6334:f6a45e5b8b7c # Implement evaluator for sparse outer products
#6639:c9121c60b5c7
#6655:06f163b5221f # Properly detect FMA support on ARM
#6677:700e023044e7 # FMA has been wrongly disabled
#6681:11d31dafb0e3
#6699:5e6e8e10aad1 # merge default to tensors
#6726:ff2d2388e7b9 # merge default to tensors
#6742:0cbd6195e829 # merge default to tensors
#6747:853d2bafeb8f # Generalized the gebp apis
6765:71584fd55762 # Made the blocking computation aware of the l3 cache; Also optimized the blocking parameters to take into account the number of threads used for a computation
#6781:9cc5a931b2c6 # generalized gemv
#6792:f6e1daab600a # ensured that contractions that can be reduced to a matrix vector product
#6844:039efd86b75c # merge tensor
6845:7333ed40c6ef # change prefetching in gebp
#6856:b5be5e10eb7f # merge index conversion
#6893:c3a64aba7c70 # clean blocking size computation
#6898:6fb31ebe6492 # rotating kernel for ARM
6899:877facace746 # rotating kernel for ARM only
#6904:c250623ae9fa # result_of
6921:915f1b1fc158 # fix prefetching change for ARM
6923:9ff25f6dacc6 # prefetching
6933:52572e60b5d3 # blocking size strategy
6937:c8c042f286b2 # avoid redundant pack_rhs
6981:7e5d6f78da59 # dynamic loop swapping
6984:45f26866c091 # rm dynamic loop swapping, adjust lhs's micro panel height to fully exploit L1 cache
6986:a675d05b6f8f # blocking heuristic: block on the rhs in L1 if the lhs fit in L1.
7013:f875e75f07e5 # organize a little our default cache sizes, and use a saner default L1 outside of x86 (10% faster on Nexus 5)
7015:8aad8f35c955 # Refactor computeProductBlockingSizes to make room for the possibility of using lookup tables
7016:a58d253e8c91 # Polish lookup tables generation
7018:9b27294a8186 # actual_panel_rows computation should always be resilient to parameters not consistent with the known L1 cache size, see comment
7019:c758b1e2c073 # Provide an empirical lookup table for blocking sizes measured on a Nexus 5. Only for float, only for Android on ARM 32bit for now.
7085:627e039fba68 # Bug 986: add support for coefficient-based product with 0 depth.
7098:b6f1db9cf9ec # Bug 992: don't select a 3p GEMM path with non-vectorizable scalar types, this hits unsupported paths in symm/triangular products code
7591:09a8e2186610 # 3.3-alpha1
7650:b0f3c8f43025 # help clang inlining
#8744:74b789ada92a # Improved the matrix multiplication blocking in the case where mr is not a power of 2 (e.g on Haswell CPUs)
8789:efcb912e4356 # Made the index type a template parameter to evaluateProductBlockingSizes. Use numext::mini and numext::maxi instead of std::min/std::max to compute blocking sizes
8972:81d53c711775 # Don't optimize the processing of the last rows of a matrix matrix product in cases that violate the assumptions made by the optimized code path
8985:d935df21a082 # Remove the rotating kernel.
8988:6c2dc56e73b3 # Bug 256: enable vectorization with unaligned loads/stores.
9148:b8b8c421e36c # Relax mixing-type constraints for binary coefficient-wise operators
9174:d228bc282ac9 # merge
9212:c90098affa7b # Fix performance regression introduced in changeset 8aad8f35c955
9213:9f1c14e4694b # Fix performance regression in dgemm introduced by changeset 81d53c711775

View File

@@ -0,0 +1,67 @@
#include <iostream>
#include <fstream>
#include <vector>
#include <Eigen/Core>
#include "../../BenchTimer.h"
using namespace Eigen;
#ifndef SCALAR
#error SCALAR must be defined
#endif
typedef SCALAR Scalar;
typedef Matrix<Scalar,Dynamic,Dynamic> Mat;
// Benchmarked kernel: accumulates the matrix product A*B into C.
// The destination is wrapped in noalias() before the += so that the product
// is added directly to C.
// NOTE(review): per Eigen's documentation noalias() tells the library that C
// does not overlap A or B, which avoids a temporary -- confirm.
// EIGEN_DONT_INLINE presumably keeps this function out of line so the timed
// call is not merged into the benchmark loop.
EIGEN_DONT_INLINE
void gemm(const Mat &A, const Mat &B, Mat &C)
{
C.noalias() += A * B;
}
// Times gemm() on an (m x k) * (k x n) product and returns the measured rate
// in units of 1e9 floating point operations per second, based on the best
// timing reported by the BenchTimer.
//
// The repetition count and the number of tries are derived from a work
// budget ("up") divided by the cost of one product, so that small problems
// are repeated more often than large ones. Complex scalars get a budget four
// times smaller and fewer tries.
EIGEN_DONT_INLINE
double bench(long m, long n, long k)
{
  Mat lhs(m,k);
  Mat rhs(k,n);
  Mat acc(m,n);
  lhs.setRandom();
  rhs.setRandom();
  acc.setZero();

  BenchTimer t;

  const bool is_complex = NumTraits<Scalar>::IsComplex;

  // Work budget, scaled by the scalar size.
  double up = 1e8*4/sizeof(Scalar);
  if(is_complex)
    up /= 4;

  // Lower and upper bounds on the number of tries.
  const double tm0 = is_complex ? 2 : 4;
  const double tm1 = is_complex ? 4 : 10;

  // One product costs 2*m*n*k operations (one multiply and one add per term).
  const double flops = 2. * m * n * k;
  const double budget = up/flops;

  // Both values are clamped and then truncated to an integer count.
  long rep   = std::max(1., std::min(100., budget));
  long tries = std::max(tm0, std::min(tm1, budget));

  BENCH(t, tries, rep, gemm(lhs,rhs,acc));

  return 1e-9 * rep * flops / t.best();
}
// Reads one "m n k" triple per entry from gemm_settings.txt, benchmarks each
// problem size and prints all results on a single row on standard output.
//
// Returns 0 on success and 1 when the settings file cannot be opened; in the
// latter case nothing is written to standard output and a diagnostic is sent
// to standard error, instead of silently reporting an empty result row.
int main(int argc, char **argv)
{
  // The command line is not used.
  (void)argc;
  (void)argv;

  std::ifstream settings("gemm_settings.txt");
  if(!settings.is_open())
  {
    std::cerr << "gemm: cannot open gemm_settings.txt" << std::endl;
    return 1;
  }

  std::vector<double> results;
  long m, n, k;
  // Extraction stops at end of file or at the first malformed entry.
  while(settings >> m >> n >> k)
  {
    //std::cerr << " Testing " << m << " " << n << " " << k << std::endl;
    results.push_back( bench(m, n, k) );
  }

  // View the result vector as an Eigen row vector to print it on one line.
  std::cout << RowVectorXd::Map(results.data(), results.size());
  return 0;
}

View File

@@ -0,0 +1,15 @@
8 8 8
9 9 9
24 24 24
239 239 239
240 240 240
2400 24 24
24 2400 24
24 24 2400
24 2400 2400
2400 24 2400
2400 2400 24
2400 2400 64
4800 23 160
23 4800 160
2400 2400 2400

View File

@@ -0,0 +1,98 @@
#include <iostream>
#include <fstream>
#include <vector>
#include <Eigen/Core>
#include "../../BenchTimer.h"
using namespace Eigen;
#ifndef SCALAR
#error SCALAR must be defined
#endif
typedef SCALAR Scalar;
// Benchmarked kernel: accumulates A.lazyProduct(B) into C.
// NOTE(review): lazyProduct() presumably requests Eigen's coefficient-based
// (non-GEMM) product path, which is what this benchmark is meant to measure
// -- confirm against the Eigen documentation.
// The commented-out escape() calls are left as found; no escape() helper is
// visible in this file.
template<typename MatA, typename MatB, typename MatC>
EIGEN_DONT_INLINE
void lazy_gemm(const MatA &A, const MatB &B, MatC &C)
{
// escape((void*)A.data());
// escape((void*)B.data());
C.noalias() += A.lazyProduct(B);
// escape((void*)C.data());
}
// Times lazy_gemm() on fixed-size operands and returns the measured rate in
// units of 1e9 floating point operations per second, based on the best timing
// reported by the BenchTimer.
//
// Template parameters:
//   m, n, k  compile-time dimensions: the left operand is m x k, the right
//            operand k x n and the result m x n.
//   TA       Eigen storage options of the left operand.
template<int m, int n, int k, int TA>
EIGEN_DONT_INLINE
double bench()
{
  typedef Matrix<Scalar,m,k,TA> Lhs;
  typedef Matrix<Scalar,k,n>    Rhs;
  typedef Matrix<Scalar,m,n>    Res;

  Lhs lhs(m,k);
  Rhs rhs(k,n);
  Res acc(m,n);
  lhs.setRandom();
  rhs.setRandom();
  acc.setZero();

  BenchTimer t;

  // Work budget, scaled by the scalar size, and the cost of one product.
  const double up     = 1e7*4/sizeof(Scalar);
  const double flops  = 2. * m * n * k;
  const double budget = up/flops;

  // Bounds on the number of tries.
  const double tm0 = 10;
  const double tm1 = 20;

  // Both values are clamped and then truncated to an integer count.
  long rep   = std::max(10., std::min(10000., budget));
  long tries = std::max(tm0, std::min(tm1, budget));

  BENCH(t, tries, rep, lazy_gemm(lhs,rhs,acc));

  return 1e-9 * rep * flops / t.best();
}
// Runtime selection of the left operand's storage order for an m x n x k
// benchmark: a non-zero t runs the RowMajor instantiation, zero runs the one
// with storage option 0.
template<int m, int n, int k>
double bench_t(int t)
{
  return t ? bench<m,n,k,RowMajor>()
           : bench<m,n,k,0>();
}
// Maps runtime dimensions onto one of the compile-time benchmark
// instantiations and runs it.
//
//   m, n, k  requested problem size; only the square sizes 1..12 listed in
//            the switch below are instantiated.
//   t        forwarded to bench_t() to select the storage order.
//
// Returns the benchmark result, or 0 when the requested size has no
// instantiation.
EIGEN_DONT_INLINE
double bench_mnk(int m, int n, int k, int t)
{
  // The id below packs each dimension into two decimal digits. That packing
  // is only unique when every dimension is within 0..99: otherwise sizes such
  // as (1, 2, -99) would alias the 1x1x1 case, and a large m would overflow
  // the int multiplication. Reject such requests up front.
  if(m < 0 || m > 99 || n < 0 || n > 99 || k < 0 || k > 99)
    return 0;

  const int id = m*10000 + n*100 + k;
  switch(id) {
    case 10101  : return bench_t< 1, 1, 1>(t);
    case 20202  : return bench_t< 2, 2, 2>(t);
    case 30303  : return bench_t< 3, 3, 3>(t);
    case 40404  : return bench_t< 4, 4, 4>(t);
    case 50505  : return bench_t< 5, 5, 5>(t);
    case 60606  : return bench_t< 6, 6, 6>(t);
    case 70707  : return bench_t< 7, 7, 7>(t);
    case 80808  : return bench_t< 8, 8, 8>(t);
    case 90909  : return bench_t< 9, 9, 9>(t);
    case 101010 : return bench_t<10,10,10>(t);
    case 111111 : return bench_t<11,11,11>(t);
    case 121212 : return bench_t<12,12,12>(t);
  }
  return 0;
}
// Reads one "m n k t" entry at a time from lazy_gemm_settings.txt, benchmarks
// each configuration and prints all results on a single row on standard
// output.
//
// Returns 0 on success and 1 when the settings file cannot be opened; in the
// latter case nothing is written to standard output and a diagnostic is sent
// to standard error, instead of silently reporting an empty result row.
int main(int argc, char **argv)
{
  // The command line is not used.
  (void)argc;
  (void)argv;

  std::ifstream settings("lazy_gemm_settings.txt");
  if(!settings.is_open())
  {
    std::cerr << "lazy_gemm: cannot open lazy_gemm_settings.txt" << std::endl;
    return 1;
  }

  std::vector<double> results;
  long m, n, k, t;
  // Extraction stops at end of file or at the first malformed entry.
  while(settings >> m >> n >> k >> t)
  {
    //std::cerr << " Testing " << m << " " << n << " " << k << std::endl;
    results.push_back( bench_mnk(m, n, k, t) );
  }

  // View the result vector as an Eigen row vector to print it on one line.
  std::cout << RowVectorXd::Map(results.data(), results.size());
  return 0;
}

View File

@@ -0,0 +1,15 @@
1 1 1 0
2 2 2 0
3 3 3 0
4 4 4 0
4 4 4 1
5 5 5 0
6 6 6 0
7 7 7 0
7 7 7 1
8 8 8 0
9 9 9 0
10 10 10 0
11 11 11 0
12 12 12 0
12 12 12 1

View File

@@ -0,0 +1,38 @@
#!/bin/bash
# Usage: make_plot.sh <base name> <bench>
#
# <base name>  base name of the result file: the script reads <base name>.out
#              and generates <base name>.pdf
# <bench>      name of the bench; the column titles are taken from
#              <bench>_settings.txt, one title per non-empty line
WHAT=$1
bench=$2
settings="${bench}_settings.txt"

# Build the header row: a label for the revision column followed by one
# quoted title per settings entry. Blank lines are skipped, and the
# "|| [ -n ... ]" part keeps a last line that lacks a trailing newline.
header="rev"
ncols=0
while read -r line || [ -n "$line" ]
do
  if [ -n "$line" ]; then
    header="$header \"$line\""
    ncols=$((ncols+1))
  fi
done < "$settings"

# Data file handed to gnuplot: the header row followed by the results.
echo "$header" > "$WHAT.out.header"
cat "$WHAT.out" >> "$WHAT.out.header"

# Generate the gnuplot script. Data columns start at 2 because column 1 holds
# the revision, which is used for the x tick labels. The number of plotted
# columns is the number of titles written to the header above, so both always
# agree.
{
  echo "set title '$WHAT'"
  echo "set key autotitle columnhead outside "
  echo "set xtics rotate 1"
  echo "set term pdf color rounded enhanced fontscale 0.35 size 7in,5in"
  echo "set output '$WHAT.pdf'"
  echo "plot for [col=2:$((ncols+1))] '$WHAT.out.header' using 0:col:xticlabels(1) with lines"
  echo " "
} > "$WHAT.gnuplot"

gnuplot -persist < "$WHAT.gnuplot"

# generate a png file
# convert -background white -density 120 -rotate 90 -resize 800 +dither -colors 256 -quality 0 $WHAT.ps -background white -flatten .$WHAT.png

# clean
rm "$WHAT.out.header" "$WHAT.gnuplot"

View File

@@ -0,0 +1,156 @@
#!/bin/bash
# ./run.sh gemm
# ./run.sh lazy_gemm
# Examples of environment variables to be set:
# PREFIX="haswell-fma-"
# CXX_FLAGS="-mfma"
# Options:
# -up : enforce the recomputation of existing data, and keep best results as a merging strategy
# -s : recompute selected changesets only and keep bests
# Name of the bench to run; <bench>.cpp and <bench>_settings.txt are expected
# in the current directory.
bench=$1

# Option detection is a plain substring search over the whole command line.
if echo "$*" | grep '\-up' > /dev/null; then
  update=true
else
  update=false
fi

if echo "$*" | grep '\-s' > /dev/null; then
  selected=true
else
  selected=false
fi

# Kept so that individual changesets named on the command line can be found
# later on.
global_args="$*"

if [ $selected == true ]; then
  echo "Recompute selected changesets only and keep bests"
elif [ $update == true ]; then
  echo "(Re-)Compute all changesets and keep bests"
else
  echo "Skip previously computed changesets"
fi

# Get or refresh the Eigen sources.
# NOTE(review): this assumes the Mercurial repository is still served from
# this URL -- confirm.
if [ ! -d "eigen_src" ]; then
  hg clone https://bitbucket.org/eigen/eigen eigen_src
else
  cd eigen_src
  hg pull -u
  cd ..
fi

# Fall back to g++ only when the caller did not choose a compiler. The
# variable must be expanded inside double quotes: with single quotes the test
# sees a literal string and would overwrite CXX unconditionally.
if [ -z "$CXX" ]; then
  CXX=g++
fi
# make_backup <name>
# Moves <name>.out to <name>.backup when <name>.out is a regular file;
# does nothing otherwise.
function make_backup
{
  if [ ! -f "$1.out" ]; then
    return 0
  fi
  mv "$1.out" "$1.backup"
}
function merge
{
count1=`echo $1 | wc -w`
count2=`echo $2 | wc -w`
if [ $count1 == $count2 ]; then
a=( $1 ); b=( $2 )
res=""
for (( i=0 ; i<$count1 ; i++ )); do
ai=${a[$i]}; bi=${b[$i]}
tmp=`echo "if ($ai > $bi) $ai else $bi " | bc -l`
res="$res $tmp"
done
echo $res
else
echo $1
fi
}
# test_current <rev> <scalar> <name>
#   rev     identifier of the checked-out revision, used as the row key
#   scalar  C++ scalar type passed to the bench through -DSCALAR
#   name    name of the executable; results are appended to <name>.out and
#           previous results are looked up in <name>.backup
#
# Appends one "<rev> <results>" row to <name>.out. The bench is compiled and
# run when an update was requested, when the stored row does not hold one
# value per line of the settings file, or when the revision was selected on
# the command line; otherwise the stored row is copied over as is. New
# results are combined with stored ones through merge.
function test_current
{
  rev=$1
  scalar=$2
  name=$3

  # Previous results for this revision, if any. The first 13 characters of
  # the matching row are dropped (presumably a 12-character revision id plus
  # the separating space).
  prev=""
  if [ -e "$name.backup" ]; then
    prev=$(grep $rev "$name.backup" | cut -c 14-)
  fi
  res=$prev

  # Number of stored values versus number of lines in the settings file.
  count_rev=$(echo $prev | wc -w)
  count_ref=$(cat $bench"_settings.txt" | wc -l)

  # Was this revision named on the command line?
  rev_found=false
  if echo "$global_args" | grep -q "$rev"; then
    rev_found=true
  fi

  # echo $update et $selected et $rev_found because $rev et "$global_args"
  # echo $count_rev et $count_ref

  need_run=false
  if [ $update == true ]; then
    need_run=true
  elif [ $count_rev != $count_ref ]; then
    need_run=true
  elif [ $selected == true ] && [ $rev_found == true ]; then
    need_run=true
  fi

  if [ $need_run == false ]; then
    echo "Skip existing results for $rev / $name"
    echo "$rev $res" >> $name.out
    return
  fi

  if ! $CXX -O2 -DNDEBUG -march=native $CXX_FLAGS -I eigen_src $bench.cpp -DSCALAR=$scalar -o $name; then
    echo "Compilation failed, skip rev $rev"
    return
  fi

  curr=$(./$name)
  if [ $count_rev == $count_ref ]; then
    echo "merge previous $prev"
    echo "with new $curr"
  else
    echo "got $curr"
  fi
  res=$(merge "$curr" "$prev")
  # echo $res
  echo "$rev $res" >> $name.out
}
# Keep the previous results around: they are reused for skipped changesets
# and merged with fresh measurements.
make_backup $PREFIX"s"$bench
make_backup $PREFIX"d"$bench
make_backup $PREFIX"c"$bench

# Walk through changesets.txt: everything after a '#' is dropped, and lines
# left without any alphanumeric character are ignored.
cut -f1 -d"#" < changesets.txt | grep -E '[[:alnum:]]' | while read rev
do
  if [ -n "$rev" ]; then
    echo "Testing rev $rev"

    # Check out the revision and ask hg for the id it actually resolved to.
    # Running this in a subshell keeps the current directory untouched, and
    # the && chain leaves actual_rev empty when any step fails.
    actual_rev=$(cd eigen_src && hg up -C "$rev" > /dev/null && hg identify | cut -f1 -d' ')

    # Without this check a failed checkout would benchmark whatever revision
    # happens to be in the working copy and record it under the wrong id.
    if [ -z "$actual_rev" ]; then
      echo "Could not check out rev $rev, skip it"
      continue
    fi

    test_current $actual_rev float                  $PREFIX"s"$bench
    test_current $actual_rev double                 $PREFIX"d"$bench
    test_current $actual_rev "std::complex<double>" $PREFIX"c"$bench
  fi
done

# Dump the collected tables.
echo "Float:"
cat $PREFIX"s""$bench.out"
echo " "

echo "Double:"
cat $PREFIX"d""$bench.out"
echo ""

echo "Complex:"
cat $PREFIX"c""$bench.out"
echo ""

# One plot per scalar type.
./make_plot.sh $PREFIX"s"$bench $bench
./make_plot.sh $PREFIX"d"$bench $bench
./make_plot.sh $PREFIX"c"$bench $bench