diff --git a/common_c.h b/common_c.h index 6cff610bb5..0fedd0cea7 100644 --- a/common_c.h +++ b/common_c.h @@ -1,3 +1,42 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + #ifndef COMMON_C_H #define COMMON_C_H @@ -62,6 +101,9 @@ #define CGEMM_ITCOPY cgemm_itcopy #endif +#define CCOMM_NCOPY ccomm_ncopy +#define CCOMM_TCOPY ccomm_tcopy + #define CTRMM_OUNUCOPY ctrmm_ounucopy #define CTRMM_OUNNCOPY ctrmm_ounncopy #define CTRMM_OUTUCOPY ctrmm_outucopy @@ -125,6 +167,11 @@ #define CGEMM_KERNEL_R cgemm_kernel_r #define CGEMM_KERNEL_B cgemm_kernel_b +#define CCOMM_KERNEL_N ccomm_kernel_n +#define CCOMM_KERNEL_L ccomm_kernel_l +#define CCOMM_KERNEL_R ccomm_kernel_r +#define CCOMM_KERNEL_B ccomm_kernel_b + #define CTRMM_KERNEL_LN ctrmm_kernel_LN #define CTRMM_KERNEL_LT ctrmm_kernel_LT #define CTRMM_KERNEL_LR ctrmm_kernel_LR @@ -320,17 +367,25 @@ #define CTRMM_IUTNCOPY gotoblas -> ctrmm_iutncopy #define CTRMM_ILNNCOPY gotoblas -> ctrmm_ilnncopy #define CTRMM_ILTNCOPY gotoblas -> ctrmm_iltncopy +#define CCOMM_NCOPY gotoblas -> ccomm_ncopy +#define CCOMM_TCOPY gotoblas -> ccomm_tcopy + #define CTRSM_IUNNCOPY gotoblas -> ctrsm_iunncopy #define CTRSM_IUTNCOPY gotoblas -> ctrsm_iutncopy #define CTRSM_ILNNCOPY gotoblas -> ctrsm_ilnncopy #define CTRSM_ILTNCOPY gotoblas -> ctrsm_iltncopy -#define CGEMM_BETA gotoblas -> cgemm_beta +#define CGEMM_BETA gotoblas -> cgemm_beta #define CGEMM_KERNEL_N gotoblas -> cgemm_kernel_n #define CGEMM_KERNEL_L gotoblas -> cgemm_kernel_l #define CGEMM_KERNEL_R gotoblas -> cgemm_kernel_r #define CGEMM_KERNEL_B gotoblas -> cgemm_kernel_b +#define CCOMM_KERNEL_N gotoblas -> ccomm_kernel_n +#define CCOMM_KERNEL_L gotoblas -> ccomm_kernel_l +#define CCOMM_KERNEL_R gotoblas -> ccomm_kernel_r +#define CCOMM_KERNEL_B gotoblas -> ccomm_kernel_b + #define CTRMM_KERNEL_LN gotoblas -> ctrmm_kernel_LN #define CTRMM_KERNEL_LT gotoblas -> ctrmm_kernel_LT #define CTRMM_KERNEL_LR gotoblas -> ctrmm_kernel_LR diff --git a/common_d.h b/common_d.h index 1e8c33d7a3..2212984fd1 100644 --- a/common_d.h +++ b/common_d.h @@ -1,3 +1,42 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + #ifndef COMMON_D_H #define COMMON_D_H @@ -55,6 +94,9 @@ #define DGEMM_ITCOPY dgemm_itcopy #endif +#define DCOMM_NCOPY dcomm_ncopy +#define DCOMM_TCOPY dcomm_tcopy + #define DTRMM_OUNUCOPY dtrmm_ounucopy #define DTRMM_OUNNCOPY dtrmm_ounncopy #define DTRMM_OUTUCOPY dtrmm_outucopy @@ -114,6 +156,7 @@ #define DGEMM_BETA dgemm_beta #define DGEMM_KERNEL dgemm_kernel +#define DCOMM_KERNEL dcomm_kernel #define DTRMM_KERNEL_LN dtrmm_kernel_LN #define DTRMM_KERNEL_LT dtrmm_kernel_LT @@ -239,13 +282,17 @@ #define DTRMM_IUTNCOPY gotoblas -> dtrmm_iutncopy #define DTRMM_ILNNCOPY gotoblas -> dtrmm_ilnncopy #define DTRMM_ILTNCOPY gotoblas -> dtrmm_iltncopy +#define DCOMM_NCOPY gotoblas -> dcomm_ncopy +#define DCOMM_TCOPY gotoblas -> dcomm_tcopy + #define DTRSM_IUNNCOPY gotoblas -> dtrsm_iunncopy #define DTRSM_IUTNCOPY gotoblas -> dtrsm_iutncopy #define DTRSM_ILNNCOPY gotoblas -> dtrsm_ilnncopy #define DTRSM_ILTNCOPY gotoblas -> dtrsm_iltncopy -#define DGEMM_BETA gotoblas -> dgemm_beta +#define DGEMM_BETA gotoblas -> dgemm_beta #define DGEMM_KERNEL gotoblas -> dgemm_kernel +#define DCOMM_KERNEL gotoblas -> dcomm_kernel #define DTRMM_KERNEL_LN gotoblas -> dtrmm_kernel_LN #define DTRMM_KERNEL_LT gotoblas -> dtrmm_kernel_LT diff --git a/common_level3.h b/common_level3.h index 39abe3016c..564162effb 100644 --- a/common_level3.h +++ b/common_level3.h @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -284,6 +285,8 @@ int strmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX int strmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int scomm_ncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int scomm_tcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int strmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int strmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); @@ -301,6 +304,8 @@ int dtrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG pos int dtrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int dcomm_tcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int dcomm_ncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int dtrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int dtrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); @@ -335,6 +340,8 @@ int ctrmm_ilnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX int ctrmm_ilnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_iltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_iltncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); +int ccomm_tcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); +int ccomm_ncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int ctrmm_olnucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_olnncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); int ctrmm_oltucopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, float *b); @@ -352,6 +359,8 @@ int ztrmm_ilnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG pos int ztrmm_ilnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_iltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_iltncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); +int zcomm_tcopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); +int zcomm_ncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, double *b); int ztrmm_olnucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_olnncopy(BLASLONG m, BLASLONG n, double *a, BLASLONG 
lda, BLASLONG posX, BLASLONG posY, double *b); int ztrmm_oltucopy(BLASLONG m, BLASLONG n, double *a, BLASLONG lda, BLASLONG posX, BLASLONG posY, double *b); @@ -579,6 +588,8 @@ int bgemm_kernel(BLASLONG, BLASLONG, BLASLONG, bfloat16, bfloat16 *, bfloat16 * int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); +int scomm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); +int dcomm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); #ifdef QUAD_PRECISION int qgemm_kernel(BLASLONG, BLASLONG, BLASLONG, xidouble *, xidouble *, xidouble *, xdouble *, BLASLONG); @@ -728,6 +739,16 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); +int ccomm_kernel_n(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int ccomm_kernel_l(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int ccomm_kernel_r(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); +int ccomm_kernel_b(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + +int zcomm_kernel_n(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int zcomm_kernel_l(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int zcomm_kernel_r(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); +int zcomm_kernel_b(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, hfloat16 *, hfloat16 *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 745643fa89..30e95d5ba6 100644 --- a/common_macro.h +++ b/common_macro.h @@ -99,6 +99,8 @@ #define GEMM_OTCOPY QGEMM_OTCOPY #define GEMM_INCOPY QGEMM_INCOPY #define GEMM_ITCOPY QGEMM_ITCOPY +#define COMM_NCOPY QGEMM_INCOPY +#define COMM_TCOPY QGEMM_ITCOPY #ifdef UNIT @@ -149,6 +151,11 @@ #define GEMM_KERNEL_R QGEMM_KERNEL #define GEMM_KERNEL_B QGEMM_KERNEL +#define COMM_KERNEL_N QGEMM_KERNEL +#define COMM_KERNEL_L QGEMM_KERNEL +#define COMM_KERNEL_R QGEMM_KERNEL +#define COMM_KERNEL_B QGEMM_KERNEL + #define TRMM_KERNEL_LN QTRMM_KERNEL_LN #define TRMM_KERNEL_LT QTRMM_KERNEL_LT #define TRMM_KERNEL_LR QTRMM_KERNEL_LN @@ -391,6 +398,8 @@ #define GEMM_OTCOPY DGEMM_OTCOPY #define GEMM_INCOPY DGEMM_INCOPY #define GEMM_ITCOPY DGEMM_ITCOPY +#define COMM_NCOPY DCOMM_NCOPY +#define COMM_TCOPY DCOMM_TCOPY #ifdef UNIT @@ -407,6 +416,7 @@ #define TRMM_IUTCOPY DTRMM_IUTUCOPY #define TRMM_ILNCOPY DTRMM_ILNUCOPY #define TRMM_ILTCOPY DTRMM_ILTUCOPY + #define TRSM_IUNCOPY DTRSM_IUNUCOPY #define TRSM_IUTCOPY DTRSM_IUTUCOPY #define TRSM_ILNCOPY DTRSM_ILNUCOPY @@ -427,6 +437,7 @@ #define TRMM_IUTCOPY DTRMM_IUTNCOPY #define TRMM_ILNCOPY DTRMM_ILNNCOPY #define TRMM_ILTCOPY DTRMM_ILTNCOPY + #define TRSM_IUNCOPY DTRSM_IUNNCOPY 
#define TRSM_IUTCOPY DTRSM_IUTNCOPY #define TRSM_ILNCOPY DTRSM_ILNNCOPY @@ -441,6 +452,11 @@ #define GEMM_KERNEL_R DGEMM_KERNEL #define GEMM_KERNEL_B DGEMM_KERNEL +#define COMM_KERNEL_N DCOMM_KERNEL +#define COMM_KERNEL_L DCOMM_KERNEL +#define COMM_KERNEL_R DCOMM_KERNEL +#define COMM_KERNEL_B DCOMM_KERNEL + #define TRMM_KERNEL_LN DTRMM_KERNEL_LN #define TRMM_KERNEL_LT DTRMM_KERNEL_LT #define TRMM_KERNEL_LR DTRMM_KERNEL_LN @@ -820,6 +836,8 @@ #define GEMM_OTCOPY SBGEMM_OTCOPY #define GEMM_INCOPY SBGEMM_INCOPY #define GEMM_ITCOPY SBGEMM_ITCOPY +#define COMM_NCOPY SCOMM_NCOPY +#define COMM_TCOPY SCOMM_TCOPY #define SYMM_THREAD_LU SSYMM_THREAD_LU #define SYMM_THREAD_LL SSYMM_THREAD_LL #define SYMM_THREAD_RU SSYMM_THREAD_RU @@ -829,7 +847,6 @@ #define SYMM_RU SSYMM_RU #define SYMM_RL SSYMM_RL - #define HEMM_THREAD_LU SHEMM_THREAD_LU #define HEMM_THREAD_LL SHEMM_THREAD_LL #define HEMM_THREAD_RU SHEMM_THREAD_RU @@ -867,6 +884,7 @@ #define TRMM_IUTCOPY STRMM_IUTUCOPY #define TRMM_ILNCOPY STRMM_ILNUCOPY #define TRMM_ILTCOPY STRMM_ILTUCOPY + #define TRSM_IUNCOPY STRSM_IUNUCOPY #define TRSM_IUTCOPY STRSM_IUTUCOPY #define TRSM_ILNCOPY STRSM_ILNUCOPY @@ -887,6 +905,7 @@ #define TRMM_IUTCOPY STRMM_IUTNCOPY #define TRMM_ILNCOPY STRMM_ILNNCOPY #define TRMM_ILTCOPY STRMM_ILTNCOPY + #define TRSM_IUNCOPY STRSM_IUNNCOPY #define TRSM_IUTCOPY STRSM_IUTNCOPY #define TRSM_ILNCOPY STRSM_ILNNCOPY @@ -1102,6 +1121,8 @@ #define GEMM_OTCOPY SGEMM_OTCOPY #define GEMM_INCOPY SGEMM_INCOPY #define GEMM_ITCOPY SGEMM_ITCOPY +#define COMM_NCOPY SCOMM_NCOPY +#define COMM_TCOPY SCOMM_TCOPY #ifdef UNIT @@ -1118,6 +1139,7 @@ #define TRMM_IUTCOPY STRMM_IUTUCOPY #define TRMM_ILNCOPY STRMM_ILNUCOPY #define TRMM_ILTCOPY STRMM_ILTUCOPY + #define TRSM_IUNCOPY STRSM_IUNUCOPY #define TRSM_IUTCOPY STRSM_IUTUCOPY #define TRSM_ILNCOPY STRSM_ILNUCOPY @@ -1138,6 +1160,7 @@ #define TRMM_IUTCOPY STRMM_IUTNCOPY #define TRMM_ILNCOPY STRMM_ILNNCOPY #define TRMM_ILTCOPY STRMM_ILTNCOPY + #define TRSM_IUNCOPY STRSM_IUNNCOPY #define TRSM_IUTCOPY STRSM_IUTNCOPY #define TRSM_ILNCOPY STRSM_ILNNCOPY @@ -1152,6 +1175,11 @@ #define GEMM_KERNEL_R SGEMM_KERNEL #define GEMM_KERNEL_B SGEMM_KERNEL +#define COMM_KERNEL_N SCOMM_KERNEL +#define COMM_KERNEL_L SCOMM_KERNEL +#define COMM_KERNEL_R SCOMM_KERNEL +#define COMM_KERNEL_B SCOMM_KERNEL + #define TRMM_KERNEL_LN STRMM_KERNEL_LN #define TRMM_KERNEL_LT STRMM_KERNEL_LT #define TRMM_KERNEL_LR STRMM_KERNEL_LN @@ -1428,6 +1456,8 @@ #define GEMM_OTCOPY XGEMM_OTCOPY #define GEMM_INCOPY XGEMM_INCOPY #define GEMM_ITCOPY XGEMM_ITCOPY +#define COMM_NCOPY XGEMM_INCOPY +#define COMM_TCOPY XGEMM_ITCOPY #define GEMM3M_ONCOPYB XGEMM3M_ONCOPYB #define GEMM3M_ONCOPYR XGEMM3M_ONCOPYR @@ -1519,6 +1549,11 @@ #define GEMM_KERNEL_R XGEMM_KERNEL_R #define GEMM_KERNEL_B XGEMM_KERNEL_B +#define COMM_KERNEL_N XGEMM_KERNEL_N +#define COMM_KERNEL_L XGEMM_KERNEL_L +#define COMM_KERNEL_R XGEMM_KERNEL_R +#define COMM_KERNEL_B XGEMM_KERNEL_B + #define GEMM3M_KERNEL XGEMM3M_KERNEL #define TRMM_KERNEL_LN XTRMM_KERNEL_LN @@ -1830,6 +1865,8 @@ #define GEMM_OTCOPY ZGEMM_OTCOPY #define GEMM_INCOPY ZGEMM_INCOPY #define GEMM_ITCOPY ZGEMM_ITCOPY +#define COMM_NCOPY ZCOMM_NCOPY +#define COMM_TCOPY ZCOMM_TCOPY #define GEMM3M_ONCOPYB ZGEMM3M_ONCOPYB #define GEMM3M_ONCOPYR ZGEMM3M_ONCOPYR @@ -1859,6 +1896,7 @@ #define TRMM_IUTCOPY ZTRMM_IUTUCOPY #define TRMM_ILNCOPY ZTRMM_ILNUCOPY #define TRMM_ILTCOPY ZTRMM_ILTUCOPY + #define TRSM_IUNCOPY ZTRSM_IUNUCOPY #define TRSM_IUTCOPY ZTRSM_IUTUCOPY #define TRSM_ILNCOPY ZTRSM_ILNUCOPY @@ -1879,6 +1917,7 @@ #define 
TRMM_IUTCOPY ZTRMM_IUTNCOPY #define TRMM_ILNCOPY ZTRMM_ILNNCOPY #define TRMM_ILTCOPY ZTRMM_ILTNCOPY + #define TRSM_IUNCOPY ZTRSM_IUNNCOPY #define TRSM_IUTCOPY ZTRSM_IUTNCOPY #define TRSM_ILNCOPY ZTRSM_ILNNCOPY @@ -1921,6 +1960,11 @@ #define GEMM_KERNEL_R ZGEMM_KERNEL_R #define GEMM_KERNEL_B ZGEMM_KERNEL_B +#define COMM_KERNEL_N ZCOMM_KERNEL_N +#define COMM_KERNEL_L ZCOMM_KERNEL_L +#define COMM_KERNEL_R ZCOMM_KERNEL_R +#define COMM_KERNEL_B ZCOMM_KERNEL_B + #define GEMM3M_KERNEL ZGEMM3M_KERNEL #define TRMM_KERNEL_LN ZTRMM_KERNEL_LN @@ -2295,6 +2339,8 @@ #define GEMM_OTCOPY CGEMM_OTCOPY #define GEMM_INCOPY CGEMM_INCOPY #define GEMM_ITCOPY CGEMM_ITCOPY +#define COMM_NCOPY CCOMM_NCOPY +#define COMM_TCOPY CCOMM_TCOPY #define GEMM3M_ONCOPYB CGEMM3M_ONCOPYB #define GEMM3M_ONCOPYR CGEMM3M_ONCOPYR @@ -2324,6 +2370,7 @@ #define TRMM_IUTCOPY CTRMM_IUTUCOPY #define TRMM_ILNCOPY CTRMM_ILNUCOPY #define TRMM_ILTCOPY CTRMM_ILTUCOPY + #define TRSM_IUNCOPY CTRSM_IUNUCOPY #define TRSM_IUTCOPY CTRSM_IUTUCOPY #define TRSM_ILNCOPY CTRSM_ILNUCOPY @@ -2344,6 +2391,7 @@ #define TRMM_IUTCOPY CTRMM_IUTNCOPY #define TRMM_ILNCOPY CTRMM_ILNNCOPY #define TRMM_ILTCOPY CTRMM_ILTNCOPY + #define TRSM_IUNCOPY CTRSM_IUNNCOPY #define TRSM_IUTCOPY CTRSM_IUTNCOPY #define TRSM_ILNCOPY CTRSM_ILNNCOPY @@ -2386,6 +2434,11 @@ #define GEMM_KERNEL_R CGEMM_KERNEL_R #define GEMM_KERNEL_B CGEMM_KERNEL_B +#define COMM_KERNEL_N CCOMM_KERNEL_N +#define COMM_KERNEL_L CCOMM_KERNEL_L +#define COMM_KERNEL_R CCOMM_KERNEL_R +#define COMM_KERNEL_B CCOMM_KERNEL_B + #define GEMM3M_KERNEL CGEMM3M_KERNEL #define TRMM_KERNEL_LN CTRMM_KERNEL_LN diff --git a/common_param.h b/common_param.h index 92bde3b3d7..0cd10aec44 100644 --- a/common_param.h +++ b/common_param.h @@ -273,7 +273,7 @@ int (*shgemv_t) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BL int (*sgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - + int (*scomm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int (*sgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -331,6 +331,8 @@ int (*shgemv_t) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BL int (*strmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*scomm_ncopy)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*scomm_tcopy)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*strmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*strmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); @@ -395,6 +397,7 @@ int (*shgemv_t) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BL #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*dcomm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_incopy 
)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); @@ -452,6 +455,8 @@ int (*shgemv_t) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BL int (*dtrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*dcomm_ncopy)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*dcomm_tcopy)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dtrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*dtrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); @@ -614,6 +619,11 @@ int (*shgemv_t) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BL int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); + int (*ccomm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*ccomm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*ccomm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*ccomm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); + int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -706,6 +716,8 @@ int (*shgemv_t) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BL int (*ctrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); + int (*ccomm_ncopy)(BLASLONG, BLASLONG, float *, BLASLONG, float *); + int (*ccomm_tcopy)(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*ctrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ctrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); @@ -824,6 +836,11 @@ int (*shgemv_t) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BL int (*zgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int (*zgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); + int (*zcomm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zcomm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zcomm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zcomm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); + int (*zgemm_incopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int 
(*zgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); @@ -916,6 +933,8 @@ int (*shgemv_t) (BLASLONG, BLASLONG, float, hfloat16 *, BLASLONG, hfloat16 *, BL int (*ztrmm_ilnncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_iltucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_iltncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); + int (*zcomm_ncopy)(BLASLONG, BLASLONG, double *, BLASLONG, double *); + int (*zcomm_tcopy)(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*ztrmm_ounucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_ounncopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); int (*ztrmm_outucopy)(BLASLONG, BLASLONG, double *, BLASLONG, BLASLONG, BLASLONG, double *); diff --git a/common_s.h b/common_s.h index df61125f6e..a2003041f3 100644 --- a/common_s.h +++ b/common_s.h @@ -1,3 +1,42 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + #ifndef COMMON_S_H #define COMMON_S_H @@ -72,6 +111,9 @@ #define SGEMM_ITCOPY sgemm_itcopy #endif +#define SCOMM_NCOPY scomm_ncopy +#define SCOMM_TCOPY scomm_tcopy + #define STRMM_OUNUCOPY strmm_ounucopy #define STRMM_OUNNCOPY strmm_ounncopy #define STRMM_OUTUCOPY strmm_outucopy @@ -131,6 +173,7 @@ #define SGEMM_BETA sgemm_beta #define SGEMM_KERNEL sgemm_kernel +#define SCOMM_KERNEL scomm_kernel #define STRMM_KERNEL_LN strmm_kernel_LN #define STRMM_KERNEL_LT strmm_kernel_LT @@ -278,13 +321,17 @@ #define STRMM_IUTNCOPY gotoblas -> strmm_iutncopy #define STRMM_ILNNCOPY gotoblas -> strmm_ilnncopy #define STRMM_ILTNCOPY gotoblas -> strmm_iltncopy +#define SCOMM_NCOPY gotoblas -> scomm_ncopy +#define SCOMM_TCOPY gotoblas -> scomm_tcopy + #define STRSM_IUNNCOPY gotoblas -> strsm_iunncopy #define STRSM_IUTNCOPY gotoblas -> strsm_iutncopy #define STRSM_ILNNCOPY gotoblas -> strsm_ilnncopy #define STRSM_ILTNCOPY gotoblas -> strsm_iltncopy -#define SGEMM_BETA gotoblas -> sgemm_beta +#define SGEMM_BETA gotoblas -> sgemm_beta #define SGEMM_KERNEL gotoblas -> sgemm_kernel +#define SCOMM_KERNEL gotoblas -> scomm_kernel #define STRMM_KERNEL_LN gotoblas -> strmm_kernel_LN #define STRMM_KERNEL_LT gotoblas -> strmm_kernel_LT diff --git a/common_z.h b/common_z.h index c12d71b390..9908ceb49f 100644 --- a/common_z.h +++ b/common_z.h @@ -1,3 +1,42 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + #ifndef COMMON_Z_H #define COMMON_Z_H @@ -62,6 +101,9 @@ #define ZGEMM_ITCOPY zgemm_itcopy #endif +#define ZCOMM_NCOPY zcomm_ncopy +#define ZCOMM_TCOPY zcomm_tcopy + #define ZTRMM_OUNUCOPY ztrmm_ounucopy #define ZTRMM_OUNNCOPY ztrmm_ounncopy #define ZTRMM_OUTUCOPY ztrmm_outucopy @@ -125,6 +167,11 @@ #define ZGEMM_KERNEL_R zgemm_kernel_r #define ZGEMM_KERNEL_B zgemm_kernel_b +#define ZCOMM_KERNEL_N zcomm_kernel_n +#define ZCOMM_KERNEL_L zcomm_kernel_l +#define ZCOMM_KERNEL_R zcomm_kernel_r +#define ZCOMM_KERNEL_B zcomm_kernel_b + #define ZTRMM_KERNEL_LN ztrmm_kernel_LN #define ZTRMM_KERNEL_LT ztrmm_kernel_LT #define ZTRMM_KERNEL_LR ztrmm_kernel_LR @@ -320,6 +367,9 @@ #define ZTRMM_IUTNCOPY gotoblas -> ztrmm_iutncopy #define ZTRMM_ILNNCOPY gotoblas -> ztrmm_ilnncopy #define ZTRMM_ILTNCOPY gotoblas -> ztrmm_iltncopy +#define ZCOMM_NCOPY gotoblas -> zcomm_ncopy +#define ZCOMM_TCOPY gotoblas -> zcomm_tcopy + #define ZTRSM_IUNNCOPY gotoblas -> ztrsm_iunncopy #define ZTRSM_IUTNCOPY gotoblas -> ztrsm_iutncopy #define ZTRSM_ILNNCOPY gotoblas -> ztrsm_ilnncopy @@ -331,6 +381,11 @@ #define ZGEMM_KERNEL_R gotoblas -> zgemm_kernel_r #define ZGEMM_KERNEL_B gotoblas -> zgemm_kernel_b +#define ZCOMM_KERNEL_N gotoblas -> zcomm_kernel_n +#define ZCOMM_KERNEL_L gotoblas -> zcomm_kernel_l +#define ZCOMM_KERNEL_R gotoblas -> zcomm_kernel_r +#define ZCOMM_KERNEL_B gotoblas -> zcomm_kernel_b + #define ZTRMM_KERNEL_LN gotoblas -> ztrmm_kernel_LN #define ZTRMM_KERNEL_LT gotoblas -> ztrmm_kernel_LT #define ZTRMM_KERNEL_LR gotoblas -> ztrmm_kernel_LR diff --git a/driver/level3/symm_k.c b/driver/level3/symm_k.c index 567896a436..db2bc172f4 100644 --- a/driver/level3/symm_k.c +++ b/driver/level3/symm_k.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. */ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -57,6 +58,35 @@ #endif #endif + +#ifndef ICOPY_OPERATION +#if defined(NN) || defined(NT) || defined(NC) || defined(NR) || \ + defined(RN) || defined(RT) || defined(RC) || defined(RR) +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) COMM_TCOPY(M, N, (IFLOAT *)(A) + ((Y) + (X) * (LDA)) * COMPSIZE, LDA, BUFFER); +#else +#define ICOPY_OPERATION(M, N, A, LDA, X, Y, BUFFER) COMM_NCOPY(M, N, (IFLOAT *)(A) + ((X) + (Y) * (LDA)) * COMPSIZE, LDA, BUFFER); +#endif +#endif + + + + +//#ifndef KERNEL_FUNC +#if defined(NN) || defined(NT) || defined(TN) || defined(TT) +#define KERNEL_FUNC COMM_KERNEL_N +#endif +#if defined(CN) || defined(CT) || defined(RN) || defined(RT) +#define KERNEL_FUNC COMM_KERNEL_L +#endif +#if defined(NC) || defined(TC) || defined(NR) || defined(TR) +#define KERNEL_FUNC COMM_KERNEL_R +#endif +#if defined(CC) || defined(CR) || defined(RC) || defined(RR) +#define KERNEL_FUNC COMM_KERNEL_B +#endif +//#endif + + #ifndef RSIDE #define K args -> m #ifndef LOWER @@ -73,6 +103,7 @@ #endif #endif + #ifdef THREADED_LEVEL3 #include "level3_thread.c" #else diff --git a/driver/level3/trmm_L.c b/driver/level3/trmm_L.c index e25ea7afe8..7927532531 100644 --- a/driver/level3/trmm_L.c +++ b/driver/level3/trmm_L.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -43,11 +44,11 @@ const static FLOAT dp1 = 1.; #ifdef CONJ -#define GEMM_KERNEL GEMM_KERNEL_L +#define COMM_KERNEL COMM_KERNEL_L #define TRMM_KERNEL_N TRMM_KERNEL_LR #define TRMM_KERNEL_T TRMM_KERNEL_LC #else -#define GEMM_KERNEL GEMM_KERNEL_N +#define COMM_KERNEL COMM_KERNEL_N #define TRMM_KERNEL_N TRMM_KERNEL_LN #define TRMM_KERNEL_T TRMM_KERNEL_LT #endif @@ -206,9 +207,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO START_RPCC(); #ifndef TRANSA - GEMM_ITCOPY(min_l, min_i, a + (ls * lda) * COMPSIZE, lda, sa); + COMM_TCOPY(min_l, min_i, a + (ls * lda) * COMPSIZE, lda, sa); #else - GEMM_INCOPY(min_l, min_i, a + (ls ) * COMPSIZE, lda, sa); + COMM_NCOPY(min_l, min_i, a + (ls ) * COMPSIZE, lda, sa); #endif STOP_RPCC(innercost); @@ -231,7 +232,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO START_RPCC(); - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + COMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -251,16 +252,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO START_RPCC(); #ifndef TRANSA - GEMM_ITCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); + COMM_TCOPY(min_l, min_i, a + (is + ls * lda) * COMPSIZE, lda, sa); #else - GEMM_INCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); + COMM_NCOPY(min_l, min_i, a + (ls + is * lda) * COMPSIZE, lda, sa); #endif STOP_RPCC(innercost); START_RPCC(); - GEMM_KERNEL(min_i, min_j, min_l, dp1, + COMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -466,16 +467,16 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO START_RPCC(); #ifndef TRANSA - GEMM_ITCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); + COMM_TCOPY(min_l, min_i, a + (is + (ls - min_l) * lda) * COMPSIZE, lda, sa); #else - GEMM_INCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); + COMM_NCOPY(min_l, min_i, a + ((ls - min_l) + is * lda) * COMPSIZE, lda, sa); #endif STOP_RPCC(innercost); START_RPCC(); - GEMM_KERNEL(min_i, min_j, min_l, dp1, + COMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -492,7 +493,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO #ifdef TIMING total = (double)outercost + (double)innercost + (double)gemmcost + (double)trmmcost; - printf( "Copy A : %5.2f Copy B: %5.2f GEMM Kernel : %5.2f TRMM Kerlnel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", + printf( "Copy A : %5.2f Copy B: %5.2f GEMM Kernel : %5.2f TRMM Kernel : %5.2f kernel Effi. : %5.2f Total Effi. : %5.2f\n", innercost / total * 100., outercost / total * 100., gemmcost / total * 100., trmmcost / total * 100., (double)n * (double)n * (double)n / (double)(trmmcost + gemmcost) * 100. * (double)COMPSIZE / 2., diff --git a/driver/level3/trmm_R.c b/driver/level3/trmm_R.c index ab9cdfae8e..9c9975339b 100644 --- a/driver/level3/trmm_R.c +++ b/driver/level3/trmm_R.c @@ -1,5 +1,6 @@ /*********************************************************************/ /* Copyright 2009, 2010 The University of Texas at Austin. */ +/* Copyright 2025 The OpenBLAS Project. */ /* All rights reserved. 
*/ /* */ /* Redistribution and use in source and binary forms, with or */ @@ -43,11 +44,11 @@ const static FLOAT dp1 = 1.; #ifdef CONJ -#define GEMM_KERNEL GEMM_KERNEL_R +#define COMM_KERNEL COMM_KERNEL_R #define TRMM_KERNEL_N TRMM_KERNEL_RR #define TRMM_KERNEL_T TRMM_KERNEL_RC #else -#define GEMM_KERNEL GEMM_KERNEL_N +#define COMM_KERNEL COMM_KERNEL_N #define TRMM_KERNEL_N TRMM_KERNEL_RN #define TRMM_KERNEL_T TRMM_KERNEL_RT #endif @@ -118,7 +119,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + COMM_TCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = 0; jjs < ls - js; jjs += min_jj){ min_jj = ls - js - jjs; @@ -136,7 +137,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO GEMM_OTCOPY(min_l, min_jj, a + ((js + jjs) + ls * lda) * COMPSIZE, lda, sb + min_l * jjs * COMPSIZE); #endif - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + COMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -173,9 +174,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + COMM_TCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - GEMM_KERNEL(min_i, ls - js, min_l, dp1, + COMM_KERNEL(min_i, ls - js, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -199,7 +200,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + COMM_TCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; @@ -217,7 +218,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO GEMM_OTCOPY(min_l, min_jj, a + (jjs + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + COMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -229,9 +230,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + COMM_TCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - GEMM_KERNEL(min_i, min_j, min_l, dp1, + COMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -254,7 +255,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + COMM_TCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = 0; jjs < min_l; jjs += min_jj){ min_jj = min_l - jjs; @@ -299,7 +300,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO sb + min_l * (min_l + jjs) * COMPSIZE); #endif - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + COMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -312,7 +313,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + COMM_TCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); TRMM_KERNEL_N(min_i, min_l, min_l, dp1, #ifdef COMPLEX @@ -323,7 +324,7 @@ int 
CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO b + (is + ls * ldb) * COMPSIZE, ldb, 0); if (js - ls - min_l > 0) { - GEMM_KERNEL(min_i, js - ls - min_l, min_l, dp1, + COMM_KERNEL(min_i, js - ls - min_l, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -340,7 +341,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = m; if (min_i > GEMM_P) min_i = GEMM_P; - GEMM_ITCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); + COMM_TCOPY(min_l, min_i, b + (ls * ldb) * COMPSIZE, ldb, sa); for(jjs = js; jjs < js + min_j; jjs += min_jj){ min_jj = min_j + js - jjs; @@ -358,7 +359,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO GEMM_OTCOPY(min_l, min_jj, a + ((jjs - min_j) + ls * lda) * COMPSIZE, lda, sb + min_l * (jjs - js) * COMPSIZE); #endif - GEMM_KERNEL(min_i, min_jj, min_l, dp1, + COMM_KERNEL(min_i, min_jj, min_l, dp1, #ifdef COMPLEX ZERO, #endif @@ -370,9 +371,9 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO min_i = m - is; if (min_i > GEMM_P) min_i = GEMM_P; - GEMM_ITCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); + COMM_TCOPY(min_l, min_i, b + (is + ls * ldb) * COMPSIZE, ldb, sa); - GEMM_KERNEL(min_i, min_j, min_l, dp1, + COMM_KERNEL(min_i, min_j, min_l, dp1, #ifdef COMPLEX ZERO, #endif diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 3a638376c0..75aa950734 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -314,8 +314,14 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type SINGLE DOUBLE) string(SUBSTRING ${float_type} 0 1 float_char) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) + if (${float_char}COMMKERNEL) + GenerateNamedObjects("${KERNELDIR}/${${float_char}COMMKERNEL}" "" "comm_kernel" false "" "" false ${float_type}) + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "comm_kernel" false "" "" false ${float_type}) + endif () endforeach() - if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) + + if (BUILD_COMPLEX16 AND NOT BUILD_DOUBLE) GenerateNamedObjects("${KERNELDIR}/${DGEMMKERNEL}" "" "gemm_kernel" false "" "" false "DOUBLE") if (DGEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${DGEMMINCOPY}" "DOUBLE" "${DGEMMINCOPYOBJ}" false "" "" true "DOUBLE") @@ -482,10 +488,27 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) if (USE_TRMM) set(TRMM_KERNEL "${${float_char}TRMMKERNEL}") + elseif (${float_char}COMMKERNEL) + set(TRMM_KERNEL "${${float_char}COMMKERNEL}") else () set(TRMM_KERNEL "${${float_char}GEMMKERNEL}") endif () + if (${float_char}COMMNCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}COMMNCOPY}" "${float_type}" "comm_ncopy" false "" "" false ${float_type}) + elseif (${float_char}GEMMINCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "comm_ncopy" false "" "" false ${float_type}) + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMONCOPY}" "${float_type}" "comm_ncopy" false "" "" false ${float_type}) + endif () + + if (${float_char}COMMTCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}COMMTCOPY}" "${float_type}" "comm_tcopy" false "" "" false ${float_type}) + elseif (${float_char}GEMMITCOPY) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMITCOPY}" "${float_type}" "comm_tcopy" false "" "" false ${float_type}) + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMOTCOPY}" 
"${float_type}" "comm_tcopy" false "" "" false ${float_type}) + endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") @@ -496,6 +519,18 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NC" "gemm_kernel_r" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CC" "gemm_kernel_b" false "" "" false ${float_type}) + if (${float_char}COMMKERNEL) + GenerateNamedObjects("${KERNELDIR}/${${float_char}COMMKERNEL}" "NN" "comm_kernel_n" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}COMMKERNEL}" "CN" "comm_kernel_l" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}COMMKERNEL}" "NC" "comm_kernel_r" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}COMMKERNEL}" "CC" "comm_kernel_b" false "" "" false ${float_type}) + else () + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NN" "comm_kernel_n" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CN" "comm_kernel_l" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "NC" "comm_kernel_r" false "" "" false ${float_type}) + GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "CC" "comm_kernel_b" false "" "" false ${float_type}) + endif () + GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;NN" "trmm_kernel_LN" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;TRANSA;NN" "trmm_kernel_LT" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${TRMM_KERNEL}" "TRMMKERNEL;LEFT;CONJ;CN" "trmm_kernel_LR" false "" "" false ${float_type}) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 6df9d78b1a..8563447568 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -124,6 +124,9 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif + + + ifdef USE_DIRECT_SGEMM ifndef SGEMMDIRECTKERNEL ifeq ($(ARCH), x86_64) @@ -233,6 +236,7 @@ endif ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ + scomm_kernel$(TSUFFIX).$(SUFFIX) \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ $(SGEMMINCOPYOBJ) $(SGEMMITCOPYOBJ) \ $(SGEMMONCOPYOBJ) $(SGEMMOTCOPYOBJ) @@ -284,6 +288,7 @@ ifneq "$(or $(BUILD_DOUBLE),$(BUILD_COMPLEX16))" "" DKERNELOBJS += \ dgemm_beta$(TSUFFIX).$(SUFFIX) \ dgemm_kernel$(TSUFFIX).$(SUFFIX) \ + dcomm_kernel$(TSUFFIX).$(SUFFIX) \ $(DGEMMINCOPYOBJ) $(DGEMMITCOPYOBJ) \ $(DGEMMONCOPYOBJ) $(DGEMMOTCOPYOBJ) endif @@ -297,6 +302,8 @@ ifneq "$(or $(BUILD_COMPLEX),$(BUILD_COMPLEX16))" "" CKERNELOBJS += \ cgemm_kernel_n$(TSUFFIX).$(SUFFIX) cgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ cgemm_kernel_l$(TSUFFIX).$(SUFFIX) cgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + ccomm_kernel_n$(TSUFFIX).$(SUFFIX) ccomm_kernel_r$(TSUFFIX).$(SUFFIX) \ + ccomm_kernel_l$(TSUFFIX).$(SUFFIX) ccomm_kernel_b$(TSUFFIX).$(SUFFIX) \ $(CGEMMINCOPYOBJ) $(CGEMMITCOPYOBJ) \ $(CGEMMONCOPYOBJ) $(CGEMMOTCOPYOBJ) endif @@ -305,6 +312,8 @@ ifeq ($(BUILD_COMPLEX16),1) ZKERNELOBJS += \ zgemm_kernel_n$(TSUFFIX).$(SUFFIX) zgemm_kernel_r$(TSUFFIX).$(SUFFIX) \ zgemm_kernel_l$(TSUFFIX).$(SUFFIX) zgemm_kernel_b$(TSUFFIX).$(SUFFIX) \ + zcomm_kernel_n$(TSUFFIX).$(SUFFIX) zcomm_kernel_r$(TSUFFIX).$(SUFFIX) \ + zcomm_kernel_l$(TSUFFIX).$(SUFFIX) zcomm_kernel_b$(TSUFFIX).$(SUFFIX) \ $(ZGEMMINCOPYOBJ) $(ZGEMMITCOPYOBJ) \ 
$(ZGEMMONCOPYOBJ) $(ZGEMMOTCOPYOBJ) endif @@ -416,6 +425,7 @@ SBLASOBJS += \ strmm_ilnucopy$(TSUFFIX).$(SUFFIX) strmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ strmm_iutucopy$(TSUFFIX).$(SUFFIX) strmm_iutncopy$(TSUFFIX).$(SUFFIX) \ strmm_iltucopy$(TSUFFIX).$(SUFFIX) strmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + scomm_ncopy$(TSUFFIX).$(SUFFIX) scomm_tcopy$(TSUFFIX).$(SUFFIX) \ strmm_ounucopy$(TSUFFIX).$(SUFFIX) strmm_ounncopy$(TSUFFIX).$(SUFFIX) \ strmm_olnucopy$(TSUFFIX).$(SUFFIX) strmm_olnncopy$(TSUFFIX).$(SUFFIX) \ strmm_outucopy$(TSUFFIX).$(SUFFIX) strmm_outncopy$(TSUFFIX).$(SUFFIX) \ @@ -438,6 +448,7 @@ DBLASOBJS += \ dtrmm_ilnucopy$(TSUFFIX).$(SUFFIX) dtrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_iutucopy$(TSUFFIX).$(SUFFIX) dtrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_iltucopy$(TSUFFIX).$(SUFFIX) dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + dcomm_ncopy$(TSUFFIX).$(SUFFIX) dcomm_tcopy$(TSUFFIX).$(SUFFIX) \ dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) dtrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_olnucopy$(TSUFFIX).$(SUFFIX) dtrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ dtrmm_outucopy$(TSUFFIX).$(SUFFIX) dtrmm_outncopy$(TSUFFIX).$(SUFFIX) \ @@ -480,6 +491,7 @@ CBLASOBJS += \ ctrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ctrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_iutucopy$(TSUFFIX).$(SUFFIX) ctrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_iltucopy$(TSUFFIX).$(SUFFIX) ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + ccomm_ncopy$(TSUFFIX).$(SUFFIX) ccomm_tcopy$(TSUFFIX).$(SUFFIX) \ ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) ctrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_olnucopy$(TSUFFIX).$(SUFFIX) ctrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ ctrmm_outucopy$(TSUFFIX).$(SUFFIX) ctrmm_outncopy$(TSUFFIX).$(SUFFIX) \ @@ -507,6 +519,7 @@ ZBLASOBJS += \ ztrmm_ilnucopy$(TSUFFIX).$(SUFFIX) ztrmm_ilnncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_iutucopy$(TSUFFIX).$(SUFFIX) ztrmm_iutncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_iltucopy$(TSUFFIX).$(SUFFIX) ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) \ + zcomm_ncopy$(TSUFFIX).$(SUFFIX) zcomm_tcopy$(TSUFFIX).$(SUFFIX) \ ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) ztrmm_ounncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_olnucopy$(TSUFFIX).$(SUFFIX) ztrmm_olnncopy$(TSUFFIX).$(SUFFIX) \ ztrmm_outucopy$(TSUFFIX).$(SUFFIX) ztrmm_outncopy$(TSUFFIX).$(SUFFIX) \ @@ -892,7 +905,39 @@ ifeq ($(OS), AIX) else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif - + +endif + +ifdef SCOMMNCOPY +$(KDIR)scomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SCOMMNCOPY) +else ifdef SGEMMINCOPY +$(KDIR)scomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMINCOPY) +else +$(KDIR)scomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMONCOPY) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > scommncopy.s + $(M4_AIX) scommncopy.s > scommncopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX scommncopy_nomacros.s -o $@ + rm scommncopy.s scommncopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + +ifdef SCOMMTCOPY +$(KDIR)scomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SCOMMTCOPY) +else ifdef SGEMMITCOPY +$(KDIR)scomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMITCOPY) +else +$(KDIR)scomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMOTCOPY) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > scommtcopy.s + $(M4_AIX) scommtcopy.s > scommtcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX scommtcopy_nomacros.s -o $@ + rm scommtcopy.s scommtcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)$(DGEMMONCOPYOBJ) : $(KERNELDIR)/$(DGEMMONCOPY) @@ -1021,6 +1066,21 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ 
endif + +ifdef SCOMMKERNEL +$(KDIR)scomm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) $(SGEMMDEPEND) +else +$(KDIR)scomm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -UCOMPLEX $< -o - > scomm_kernel$(TSUFFIX).s + $(M4_AIX) scomm_kernel$(TSUFFIX).s > scomm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX scomm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm scomm_kernel$(TSUFFIX).s scomm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + ifdef USE_DIRECT_SGEMM ifeq ($(ARCH), x86_64) $(KDIR)sgemm_direct_performant$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTPERFORMANT) @@ -1074,6 +1134,20 @@ else $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ endif +ifdef DCOMMKERNEL +$(KDIR)dcomm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) $(DGEMMDEPEND) +else +$(KDIR)dcomm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dcomm_kernel$(TSUFFIX).s + $(M4_AIX) dcomm_kernel$(TSUFFIX).s > dcomm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dcomm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm dcomm_kernel$(TSUFFIX).s dcomm_kernel$(TSUFFIX)_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)qgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ @@ -1117,6 +1191,62 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ endif +ifdef CCOMMKERNEL +$(KDIR)ccomm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) $(CGEMMDEPEND) +else +$(KDIR)ccomm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNN $< -o - > ccomm_kernel_n.s + $(M4_AIX) ccomm_kernel_n.s > ccomm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN ccomm_kernel_n_nomacros.s -o $@ + rm ccomm_kernel_n.s ccomm_kernel_n_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ +endif + +ifdef CCOMMKERNEL +$(KDIR)ccomm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) $(CGEMMDEPEND) +else +$(KDIR)ccomm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCN $< -o - > ccomm_kernel_l.s + $(M4_AIX) ccomm_kernel_l.s > ccomm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN ccomm_kernel_l_nomacros.s -o $@ + rm ccomm_kernel_l.s ccomm_kernel_l_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ +endif + +ifdef CCOMMKERNEL +$(KDIR)ccomm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) $(CGEMMDEPEND) +else +$(KDIR)ccomm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > ccomm_kernel_r.s + $(M4_AIX) ccomm_kernel_r.s > ccomm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC ccomm_kernel_r_nomacros.s -o $@ + rm ccomm_kernel_r.s ccomm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif + +ifdef CCOMMKERNEL +$(KDIR)ccomm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) $(CGEMMDEPEND) +else +$(KDIR)ccomm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -UDOUBLE -DCOMPLEX -DCC $< -o - > ccomm_kernel_b.s + 
$(M4_AIX) ccomm_kernel_b.s > ccomm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC ccomm_kernel_b_nomacros.s -o $@ + rm ccomm_kernel_b.s ccomm_kernel_b_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ +endif + $(KDIR)zgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zgemm_kernel_n.s @@ -1165,6 +1295,70 @@ else $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ endif +ifdef ZCOMMKERNEL +$(KDIR)zcomm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) $(ZGEMMDEPEND) +else +$(KDIR)zcomm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNN $< -o - > zcomm_kernel_n.s + $(M4_AIX) zcomm_kernel_n.s > zcomm_kernel_n_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN zcomm_kernel_n_nomacros.s -o $@ + rm zcomm_kernel_n.s zcomm_kernel_n_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ +endif + +ifdef ZCOMMKERNEL +$(KDIR)zcomm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) $(ZGEMMDEPEND) +else +$(KDIR)zcomm_kernel_l$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCN $< -o - > zcomm_kernel_l.s + $(M4_AIX) zcomm_kernel_l.s > zcomm_kernel_l_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN zcomm_kernel_l_nomacros.s -o $@ + rm zcomm_kernel_l.s zcomm_kernel_l_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ +endif + +ifdef ZCOMMKERNEL +$(KDIR)zcomm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) $(ZGEMMDEPEND) +else +$(KDIR)zcomm_kernel_r$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DNC $< -o - > zcomm_kernel_r.s + $(M4_AIX) zcomm_kernel_r.s > zcomm_kernel_r_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC zcomm_kernel_r_nomacros.s -o $@ + rm zcomm_kernel_r.s zcomm_kernel_r_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ +endif + +ifdef ZCOMMKERNEL +$(KDIR)zcomm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) $(ZGEMMDEPEND) +else +$(KDIR)zcomm_kernel_b$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -DCOMPLEX -DCC $< -o - > zcomm_kernel_b.s + $(M4_AIX) zcomm_kernel_b.s > zcomm_kernel_b_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC zcomm_kernel_b_nomacros.s -o $@ + rm zcomm_kernel_b.s zcomm_kernel_b_nomacros.s +else ifeq ($(CORE),SANDYBRIDGE) + $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +else + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +endif + $(KDIR)xgemm_kernel_n$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -1479,17 +1673,34 @@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ endif +else + +ifdef SCOMMKERNEL +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(SGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +ifdef SCOMMKERNEL +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) +else $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +ifdef SCOMMKERNEL +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) +else $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +ifdef SCOMMKERNEL +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) +else $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +endif ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s $(M4_AIX) strmm_kernel_rt.s > strmm_kernel_rt_nomacros.s @@ -1499,16 +1710,32 @@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ endif +ifdef DCOMMKERNEL +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) +else $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +ifdef DCOMMKERNEL +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) +else $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +ifdef DCOMMKERNEL +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) +else $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +ifdef DCOMMKERNEL +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) +else $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) @@ -1523,74 +1750,144 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : 
$(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ endif +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ endif + +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ endif + +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ endif + +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ endif + +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ endif + +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) 
: $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ endif + +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif ifeq ($(CORE),SANDYBRIDGE) $(CC) $(filter-out -mavx,$(CFLAGS)) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ else @@ -1599,8 +1896,6 @@ endif endif - - $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ @@ -1886,6 +2181,39 @@ $(KDIR)dtrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/trmm_ltcopy_$(DGEMM_UNROLL_M $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ endif +ifdef DCOMMNCOPY +$(KDIR)dcomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DCOMMNCOPY) +else ifdef DGEMMINCOPY +$(KDIR)dcomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMINCOPY) +else +$(KDIR)dcomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMONCOPY) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dcomm_ncopy.s + $(M4_AIX) dcomm_ncopy.s > dcomm_ncopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dcomm_ncopy_nomacros.s -o $@ + rm dcomm_ncopy.s dcomm_ncopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif + +ifdef DCOMMTCOPY +$(KDIR)dcomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DCOMMTCOPY) +else ifdef DGEMMITCOPY +$(KDIR)dcomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMITCOPY) +else +$(KDIR)dcomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMOTCOPY) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > dcomm_tcopy.s + $(M4_AIX) dcomm_tcopy.s > dcomm_tcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX dcomm_tcopy_nomacros.s -o $@ + rm dcomm_tcopy.s dcomm_tcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif + + $(KDIR)dtrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/trmm_uncopy_$(DGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -UCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2014,6 +2342,31 @@ $(KDIR)ctrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(CGEMM_UNROLL_ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -UOUTER -ULOWER -UUNIT $< -o $@ endif +ifdef CCOMMNCOPY +$(KDIR)ccomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMNCOPY) +else ifdef CGEMMINCOPY +$(KDIR)ccomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMINCOPY) +else +$(KDIR)ccomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMONCOPY) +endif + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifdef CCOMMTCOPY +$(KDIR)ccomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CCOMMTCOPY) +else ifdef CGEMMITCOPY +$(KDIR)ccomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMITCOPY) +else +$(KDIR)ccomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMOTCOPY) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -UDOUBLE -UCOMPLEX -S $< -o - > ccomm_tcopy.s + $(M4_AIX) ccomm_tcopy.s > ccomm_tcopy_nomacros.s + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX ccomm_tcopy_nomacros.s -o $@ + rm ccomm_tcopy.s ccomm_tcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +endif + $(KDIR)ctrmm_ounucopy$(TSUFFIX).$(SUFFIX) : 
generic/ztrmm_uncopy_$(CGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -UDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -2094,6 +2447,32 @@ $(KDIR)ztrmm_iltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_ltcopy_$(ZGEMM_UNROLL_ $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -UOUTER -DLOWER -UUNIT $< -o $@ endif +ifdef ZCOMMNCOPY +$(KDIR)zcomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMNCOPY) +else ifdef ZGEMMINCOPY +$(KDIR)zcomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMINCOPY) +else +$(KDIR)zcomm_ncopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMONCOPY) +endif + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifdef ZCOMMTCOPY +$(KDIR)zcomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZCOMMTCOPY) +else ifdef ZGEMMITCOPY +$(KDIR)zcomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMITCOPY) +else +$(KDIR)zcomm_tcopy$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMOTCOPY) +endif +ifeq ($(OS), AIX) + $(CC) $(CFLAGS) -S -DDOUBLE -UCOMPLEX $< -o - > zcomm_tcopy.s + $(M4_AIX) zcomm_tcopy.s > zcomm_tcopy_nomacros.s + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX zcomm_tcopy_nomacros.s -o $@ + rm zcomm_tcopy.s zcomm_tcopy_nomacros.s +else + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +endif + + $(KDIR)ztrmm_ounucopy$(TSUFFIX).$(SUFFIX) : generic/ztrmm_uncopy_$(ZGEMM_UNROLL_N).c $(CC) -c $(CFLAGS) $(NO_UNINITIALIZED_WARN) -DDOUBLE -DCOMPLEX -DOUTER -ULOWER -DUNIT $< -o $@ @@ -3169,6 +3548,25 @@ $(SGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMITCOPY) endif +ifdef SCOMMNCOPY +$(KDIR)scomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOMMNCOPY) +else ifdef SGEMMINCOPY +$(KDIR)scomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMINCOPY) +else +$(KDIR)scomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMONCOPY) +endif + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifdef SCOMMTCOPY +$(KDIR)scomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOMMTCOPY) +else ifdef SGEMMITCOPY +$(KDIR)scomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMITCOPY) +else +$(KDIR)scomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMOTCOPY) +endif + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + + $(DGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMONCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -3185,6 +3583,24 @@ $(DGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(DGEMMITCOPY) endif +ifdef DCOMMNCOPY +$(KDIR)dcomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOMMNCOPY) +else ifdef DGEMMINCOPY +$(KDIR)dcomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMINCOPY) +else +$(KDIR)dcomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMONCOPY) +endif + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifdef DCOMMTCOPY +$(KDIR)dcomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOMMTCOPY) +else ifdef DGEMMITCOPY +$(KDIR)dcomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMITCOPY) +else +$(KDIR)dcomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMOTCOPY) +endif + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + ifdef EXPRECISION $(QGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(QGEMMONCOPY) @@ -3221,6 +3637,24 @@ $(CGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(CGEMMITCOPY) endif +ifdef CCOMMNCOPY +$(KDIR)ccomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMNCOPY) +else ifdef CGEMMINCOPY +$(KDIR)ccomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMINCOPY) +else +$(KDIR)ccomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMONCOPY) +endif + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + +ifdef CCOMMTCOPY +$(KDIR)ccomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMTCOPY) +else ifdef CGEMMITCOPY 
+$(KDIR)ccomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMITCOPY) +else +$(KDIR)ccomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMOTCOPY) +endif + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(ZGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMONCOPY) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -3237,6 +3671,24 @@ $(ZGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(ZGEMMITCOPY) endif +ifdef ZCOMMNCOPY +$(KDIR)zcomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMNCOPY) +else ifdef ZGEMMINCOPY +$(KDIR)zcomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMINCOPY) +else +$(KDIR)zcomm_ncopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMONCOPY) +endif + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + +ifdef ZCOMMTCOPY +$(KDIR)zcomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMTCOPY) +else ifdef ZGEMMITCOPY +$(KDIR)zcomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMITCOPY) +else +$(KDIR)zcomm_tcopy$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMOTCOPY) +endif + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + ifdef EXPRECISION $(XGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(XGEMMONCOPY) @@ -3273,9 +3725,23 @@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifdef SCOMMKERNEL +$(KDIR)scomm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) $(SGEMMDEPEND) +else +$(KDIR)scomm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) +endif + $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)dgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ +ifdef DCOMMKERNEL +$(KDIR)dcomm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) $(DGEMMDEPEND) +else +$(KDIR)dcomm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) +endif + $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ + $(KDIR)qgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(QGEMMDEPEND) $(CC) $(PFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $@ @@ -3298,6 +3764,41 @@ endif $(KDIR)cgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ccomm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) $(CGEMMDEPEND) +else +$(KDIR)ccomm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +endif + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ + +ifdef CCOMMKERNEL +$(KDIR)ccomm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) $(CGEMMDEPEND) +else +$(KDIR)ccomm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +endif + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCN $< -o $@ + +ifdef CCOMMKERNEL +$(KDIR)ccomm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) $(CGEMMDEPEND) +else +$(KDIR)ccomm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +endif +ifeq ($(OS), AIX) + $(CC) $(PFLAGS) -S -UDOUBLE -DCOMPLEX -DNC $< -o - > ccomm_kernel_r.s + $(M4_AIX) ccomm_kernel_r.s > ccomm_kernel_r_nomacros.s + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DNC ccomm_kernel_r_nomacros.s -o $@ + rm ccomm_kernel_r.s ccomm_kernel_r_nomacros.s +else + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNC $< -o $@ +endif + +ifdef CCOMMKERNEL +$(KDIR)ccomm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) $(CGEMMDEPEND) +else +$(KDIR)ccomm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) $(CGEMMDEPEND) +endif + $(CC) $(PFLAGS) -c -UDOUBLE -DCOMPLEX -DCC $< -o $@ + 
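+# The scomm/dcomm/ccomm/zcomm profile targets follow the same selection rule as the
+# non-profiled ones: when the per-core KERNEL file defines a dedicated COMM kernel or
+# COMM copy source it is used, otherwise the rule falls back to the corresponding
+# GEMM kernel or GEMM copy source.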
$(KDIR)zgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -3310,6 +3811,34 @@ $(KDIR)zgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMM $(KDIR)zgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)zcomm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) $(ZGEMMDEPEND) +else +$(KDIR)zcomm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +endif + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNN $< -o $@ + +ifdef ZCOMMKERNEL +$(KDIR)zcomm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) $(ZGEMMDEPEND) +else +$(KDIR)zcomm_kernel_l$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +endif + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCN $< -o $@ + +ifdef ZCOMMKERNEL +$(KDIR)zcomm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) $(ZGEMMDEPEND) +else +$(KDIR)zcomm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +endif + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DNC $< -o $@ + +ifdef ZCOMMKERNEL +$(KDIR)zcomm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) $(ZGEMMDEPEND) +else +$(KDIR)zcomm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) $(ZGEMMDEPEND) +endif + $(CC) $(PFLAGS) -c -DDOUBLE -DCOMPLEX -DCC $< -o $@ + $(KDIR)xgemm_kernel_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DNN $< -o $@ @@ -3322,16 +3851,32 @@ $(KDIR)xgemm_kernel_r$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMM $(KDIR)xgemm_kernel_b$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) $(XGEMMDEPEND) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX -DCC $< -o $@ +ifdef SCOMMKERNEL +$(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) +else $(KDIR)strmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +ifdef SCOMMKERNEL +$(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) +else $(KDIR)strmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ +ifdef SCOMMKERNEL +$(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) +else $(KDIR)strmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +ifdef SCOMMKERNEL +$(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SCOMMKERNEL) +else $(KDIR)strmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) +endif ifeq ($(OS), AIX) $(CC) $(CFLAGS) -S -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o - > strmm_kernel_rt.s $(M4_AIX) strmmkernel_rn.s > strmm_kernel_rt_nomacros.s @@ -3341,16 +3886,32 @@ else $(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ endif +ifdef DCOMMKERNEL +$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) +else $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ +ifdef DCOMMKERNEL +$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) +else $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE 
-UCOMPLEX -DLEFT -DTRANSA $< -o $@ +ifdef DCOMMKERNEL +$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) +else $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ +ifdef DCOMMKERNEL +$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DCOMMKERNEL) +else $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) @@ -3365,52 +3926,116 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) $(CC) $(PFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +ifdef CCOMMKERNEL +$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CCOMMKERNEL) +else $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN 
$< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ +ifdef ZCOMMKERNEL +$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZCOMMKERNEL) +else $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) +endif $(CC) $(PFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) diff --git a/kernel/arm64/KERNEL.ARMV8SVE b/kernel/arm64/KERNEL.ARMV8SVE index a8371a03cd..8457144b38 100644 --- a/kernel/arm64/KERNEL.ARMV8SVE +++ b/kernel/arm64/KERNEL.ARMV8SVE @@ -1,3 +1,30 @@ +############################################################################### +# Copyright (c) 2025, The OpenBLAS Project +# All rights reserved. +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# 3. Neither the name of the OpenBLAS project nor the names of +# its contributors may be used to endorse or promote products +# derived from this software without specific prior written permission. +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR +# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF +# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS +# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN +# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) +# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. 
+############################################################################### CSUMKERNEL = csum_thunderx2t99.c ZSUMKERNEL = zsum_thunderx2t99.c @@ -123,11 +150,12 @@ DSDOTKERNEL = dot.S DGEMM_BETA = dgemm_beta.S SGEMM_BETA = sgemm_beta.S -SGEMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S +SGEMMKERNEL = sgemm_kernel_inter_sve_v2x$(SGEMM_UNROLL_N).S +SCOMMKERNEL = sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S STRMMKERNEL = strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S -SGEMMINCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c -SGEMMITCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c +SGEMMINCOPY = gemm_ncopy_sve_v2x$(SGEMM_UNROLL_N).c +SGEMMITCOPY = gemm_tcopy_sve_v2x$(SGEMM_UNROLL_N).c SGEMMONCOPY = sgemm_ncopy_$(SGEMM_UNROLL_N).S SGEMMOTCOPY = sgemm_tcopy_$(SGEMM_UNROLL_N).S @@ -150,15 +178,18 @@ STRMMUNCOPY_M = trmm_uncopy_sve_v1.c STRMMLNCOPY_M = trmm_lncopy_sve_v1.c STRMMUTCOPY_M = trmm_utcopy_sve_v1.c STRMMLTCOPY_M = trmm_ltcopy_sve_v1.c +SCOMMNCOPY = gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c +SCOMMTCOPY = gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c SSYMMUCOPY_M = symm_ucopy_sve.c SSYMMLCOPY_M = symm_lcopy_sve.c -DGEMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S +DGEMMKERNEL = dgemm_kernel_inter_sve_v2x$(DGEMM_UNROLL_N).S +DCOMMKERNEL = dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S DTRMMKERNEL = dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S -DGEMMINCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c -DGEMMITCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c +DGEMMINCOPY = gemm_ncopy_sve_v2x$(DGEMM_UNROLL_N).c +DGEMMITCOPY = gemm_tcopy_sve_v2x$(DGEMM_UNROLL_N).c DGEMMONCOPY = dgemm_ncopy_$(DGEMM_UNROLL_N).S DGEMMOTCOPY = dgemm_tcopy_$(DGEMM_UNROLL_N).S @@ -181,11 +212,14 @@ DTRMMUNCOPY_M = trmm_uncopy_sve_v1.c DTRMMLNCOPY_M = trmm_lncopy_sve_v1.c DTRMMUTCOPY_M = trmm_utcopy_sve_v1.c DTRMMLTCOPY_M = trmm_ltcopy_sve_v1.c +DCOMMNCOPY = gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c +DCOMMTCOPY = gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c DSYMMUCOPY_M = symm_ucopy_sve.c DSYMMLCOPY_M = symm_lcopy_sve.c CGEMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +CCOMMKERNEL = cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S CTRMMKERNEL = ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S CGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c @@ -202,6 +236,8 @@ CTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c CTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c CTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c CTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c +CCOMMNCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +CCOMMTCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c CHEMMLTCOPY_M = zhemm_ltcopy_sve.c CHEMMUTCOPY_M = zhemm_utcopy_sve.c @@ -210,6 +246,7 @@ CSYMMUCOPY_M = zsymm_ucopy_sve.c CSYMMLCOPY_M = zsymm_lcopy_sve.c ZGEMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S +ZCOMMKERNEL = zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZTRMMKERNEL = ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S ZGEMMINCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c @@ -226,6 +263,8 @@ ZTRMMUNCOPY_M = ztrmm_uncopy_sve_v1.c ZTRMMLNCOPY_M = ztrmm_lncopy_sve_v1.c ZTRMMUTCOPY_M = ztrmm_utcopy_sve_v1.c ZTRMMLTCOPY_M = ztrmm_ltcopy_sve_v1.c +ZCOMMNCOPY = gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c +ZCOMMTCOPY = gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c ZHEMMLTCOPY_M = zhemm_ltcopy_sve.c ZHEMMUTCOPY_M = zhemm_utcopy_sve.c diff --git a/kernel/arm64/dgemm_kernel_inter_sve_v2x8.S b/kernel/arm64/dgemm_kernel_inter_sve_v2x8.S new file mode 100644 index 0000000000..8de5626e0f --- /dev/null +++ b/kernel/arm64/dgemm_kernel_inter_sve_v2x8.S @@ -0,0 +1,1521 @@ +/*******************************************************************************/ +/* 
Copyright (c) 2025, The OpenBLAS Project */ +/* All rights reserved. */ +/* Redistribution and use in source and binary forms, with or without */ +/* modification, are permitted provided that the following conditions are */ +/* met: */ +/* 1. Redistributions of source code must retain the above copyright */ +/* notice, this list of conditions and the following disclaimer. */ +/* 2. Redistributions in binary form must reproduce the above copyright */ +/* notice, this list of conditions and the following disclaimer in */ +/* the documentation and/or other materials provided with the */ +/* distribution. */ +/* 3. Neither the name of the OpenBLAS project nor the names of */ +/* its contributors may be used to endorse or promote products */ +/* derived from this software without specific prior written permission. */ +/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" */ +/* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE */ +/* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE */ +/* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE */ +/* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR */ +/* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF */ +/* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS */ +/* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */ +/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE). */ +/* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/*******************************************************************************/ + +/* This is an SVE dgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. +With this approach, we can reuse dgemm_n|tcopy_sve_v1.c packing functions. 
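+
+An illustrative scalar model (not part of the build; the names below are made up) of what
+one k-step of the 2*SVE_LEN x 8 micro-kernel accumulates, where vl is the SVE vector
+length in doubles (cntd), a points at the next 2*vl packed A values consumed in this
+step and b at the next 8 packed B values:
+
+    // acc is the per-block accumulator that the KERNELv2x8_* macros keep in z16-z31
+    static void v2x8_kstep(int vl, const double *a, const double *b,
+                           double acc[][8])
+    {
+        for (int j = 0; j < 8; j++)           // columns of the packed B panel
+            for (int i = 0; i < 2 * vl; i++)  // two SVE vectors of packed A
+                acc[i][j] += a[i] * b[j];     // 16 vector fmla per k-step in the asm
+    }
+
+After the K loop, SAVEv2x8 merges the accumulators into C as C[i + j*ldc] += alpha * acc[i][j].
+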
*/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha x18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 d10 +#define alphaZ z7.d + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 pA20_0 +//v03 pA20_1 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 + dup z24.d, #0 + dup z25.d, #0 + dup z26.d, #0 + dup z27.d, #0 + dup z28.d, #0 + dup z29.d, #0 + dup z30.d, #0 + dup z31.d, #0 +.endm + +.macro KERNELv2x8_I + ld1d z0.d, p0/z, [pA1] // start + ld1d z1.d, p0/z, [pA1, #1, MUL VL] // pA1 + vec + ld1d z2.d, p0/z, [pA1, #2, MUL VL] // pA1 + vec * 2 + ld1d z3.d, p0/z, [pA1, #3, MUL VL] // pA1 + vec * 3 + add pA1, pA1, vec_len, lsl #5 // pA1 = pA1 + vec_len * 8 * 2 * 2 + + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + 
add pB, pB, 64 +.endm + +.macro KERNELv2x8_M1 + ld1d z2.d, p0/z, [pA1] + ld1d z3.d, p0/z, [pA1, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 * 2 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_M2 + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA1, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 * 2 + + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv2x8_E + fmla z16.d, p0/m, z2.d, z8.d + fmla z17.d, p0/m, z3.d, z8.d + fmla z18.d, p0/m, z2.d, z9.d + fmla z19.d, p0/m, z3.d, z9.d + fmla z20.d, p0/m, z2.d, z10.d + fmla z21.d, p0/m, z3.d, z10.d + fmla z22.d, p0/m, z2.d, z11.d + fmla z23.d, p0/m, z3.d, z11.d + fmla z24.d, p0/m, z2.d, z12.d + fmla z25.d, p0/m, z3.d, z12.d + fmla z26.d, p0/m, z2.d, z13.d + fmla z27.d, p0/m, z3.d, z13.d + fmla z28.d, p0/m, z2.d, z14.d + fmla z29.d, p0/m, z3.d, z14.d + fmla z30.d, p0/m, z2.d, z15.d + fmla z31.d, p0/m, z3.d, z15.d +.endm + +.macro KERNELv2x8_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA1, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 * 2 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d + fmla z24.d, p0/m, z0.d, z12.d + fmla z25.d, p0/m, z1.d, z12.d + fmla z26.d, p0/m, z0.d, z13.d + fmla z27.d, p0/m, z1.d, z13.d + fmla z28.d, p0/m, z0.d, z14.d + fmla z29.d, p0/m, z1.d, z14.d + fmla z30.d, p0/m, z0.d, z15.d + fmla z31.d, p0/m, z1.d, z15.d +.endm + +.macro SAVEv2x8 + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, 
[pCRow0, #1, mul vl] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + + add pCRow2, pCRow1, LDC + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow1, pCRow2, LDC + ld1d z8.d, p0/z, [pCRow2] + ld1d z9.d, p0/z, [pCRow2, #1, mul vl] + fmla z8.d, p0/m, z24.d, alphaZ + fmla z9.d, p0/m, z25.d, alphaZ + st1d z8.d, p0, [pCRow2] + st1d z9.d, p0, [pCRow2, #1, mul vl] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z26.d, alphaZ + fmla z11.d, p0/m, z27.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z28.d, alphaZ + fmla z13.d, p0/m, z29.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z30.d, alphaZ + fmla z15.d, p0/m, z31.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv2x4_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA1, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 * 2 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d + fmla z20.d, p0/m, z0.d, z10.d + fmla z21.d, p0/m, z1.d, z10.d + fmla z22.d, p0/m, z0.d, z11.d + fmla z23.d, p0/m, z1.d, z11.d +.endm + +.macro SAVEv2x4 + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + + add pCRow2, pCRow1, LDC + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + + add pCRow1, pCRow2, LDC + ld1d z12.d, p0/z, [pCRow2] + ld1d z13.d, p0/z, [pCRow2, #1, mul vl] + fmla z12.d, p0/m, z20.d, alphaZ + fmla z13.d, p0/m, z21.d, alphaZ + st1d z12.d, p0, [pCRow2] + st1d z13.d, p0, [pCRow2, #1, mul vl] + + ld1d z14.d, p0/z, [pCRow1] + ld1d z15.d, p0/z, [pCRow1, #1, mul vl] + fmla z14.d, p0/m, z22.d, alphaZ + fmla z15.d, p0/m, z23.d, alphaZ + st1d z14.d, p0, [pCRow1] + st1d z15.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv2x2 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv2x2_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA1, 
vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 * 2 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d + fmla z18.d, p0/m, z0.d, z9.d + fmla z19.d, p0/m, z1.d, z9.d +.endm + +.macro SAVEv2x2 + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + + ld1d z10.d, p0/z, [pCRow1] + ld1d z11.d, p0/z, [pCRow1, #1, mul vl] + fmla z10.d, p0/m, z18.d, alphaZ + fmla z11.d, p0/m, z19.d, alphaZ + st1d z10.d, p0, [pCRow1] + st1d z11.d, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 +.endm + +.macro INITv2x1 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv2x1_SUB + ld1d z0.d, p0/z, [pA1] + ld1d z1.d, p0/z, [pA1, vec_len, lsl #3] + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 8 * 2 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p0/m, z0.d, z8.d + fmla z17.d, p0/m, z1.d, z8.d +.endm + +.macro SAVEv2x1 + add pCRow1, pCRow0, LDC + ld1d z8.d, p0/z, [pCRow0] + ld1d z9.d, p0/z, [pCRow0, #1, mul vl] + fmla z8.d, p0/m, z16.d, alphaZ + fmla z9.d, p0/m, z17.d, alphaZ + st1d z8.d, p0, [pCRow0] + st1d z9.d, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #4 // pC = pC + vec_len * 8 * 2 + +.endm + +.macro INITv1x8 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 + dup z20.d, #0 + dup z21.d, #0 + dup z22.d, #0 + dup z23.d, #0 +.endm + +.macro KERNELv1x8_I + ld1d z0.d, p1/z, [pA1] + ld1d z1.d, p1/z, [pA1, lanes, lsl #3] // next one + add pA1, pA1, lanes, lsl #4 // pA1 = pA1 + lanes * 2 * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M1 + ld1d z1.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z0.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z0.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z0.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z0.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z0.d, z12.d + ld1rd z12.d, p0/z, [pB, 32] + fmla z21.d, p1/m, z0.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z0.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z0.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_M2 + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + fmla z16.d, p1/m, z1.d, z8.d + ld1rd z8.d, p0/z, [pB] + fmla z17.d, p1/m, z1.d, z9.d + ld1rd z9.d, p0/z, [pB, 8] + fmla z18.d, p1/m, z1.d, z10.d + ld1rd z10.d, p0/z, [pB, 16] + fmla z19.d, p1/m, z1.d, z11.d + ld1rd z11.d, p0/z, [pB, 24] + fmla z20.d, p1/m, z1.d, z12.d + ld1rd z12.d, 
p0/z, [pB, 32] + fmla z21.d, p1/m, z1.d, z13.d + ld1rd z13.d, p0/z, [pB, 40] + fmla z22.d, p1/m, z1.d, z14.d + ld1rd z14.d, p0/z, [pB, 48] + fmla z23.d, p1/m, z1.d, z15.d + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 +.endm + +.macro KERNELv1x8_E + fmla z16.d, p1/m, z1.d, z8.d + fmla z17.d, p1/m, z1.d, z9.d + fmla z18.d, p1/m, z1.d, z10.d + fmla z19.d, p1/m, z1.d, z11.d + fmla z20.d, p1/m, z1.d, z12.d + fmla z21.d, p1/m, z1.d, z13.d + fmla z22.d, p1/m, z1.d, z14.d + fmla z23.d, p1/m, z1.d, z15.d +.endm + +.macro KERNELv1x8_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + ld1rd z12.d, p0/z, [pB, 32] + ld1rd z13.d, p0/z, [pB, 40] + ld1rd z14.d, p0/z, [pB, 48] + ld1rd z15.d, p0/z, [pB, 56] + + add pB, pB, 64 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + fmla z20.d, p1/m, z0.d, z12.d + fmla z21.d, p1/m, z0.d, z13.d + fmla z22.d, p1/m, z0.d, z14.d + fmla z23.d, p1/m, z0.d, z15.d + + +.endm + +.macro SAVEv1x8 + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z28.d, p1/z, [pCRow2] + fmla z28.d, p1/m, z20.d, alphaZ + st1d z28.d, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1d z29.d, p1/z, [pCRow1] + fmla z29.d, p1/m, z21.d, alphaZ + st1d z29.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z30.d, p1/z, [pCRow2] + fmla z30.d, p1/m, z22.d, alphaZ + st1d z30.d, p1, [pCRow2] + + ld1d z31.d, p1/z, [pCRow1] + fmla z31.d, p1/m, z23.d, alphaZ + st1d z31.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.d, #0 + dup z17.d, #0 + dup z18.d, #0 + dup z19.d, #0 +.endm + +.macro KERNELv1x4_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + ld1rd z10.d, p0/z, [pB, 16] + ld1rd z11.d, p0/z, [pB, 24] + + add pB, pB, 32 + + fmla z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + fmla z18.d, p1/m, z0.d, z10.d + fmla z19.d, p1/m, z0.d, z11.d + +.endm + +.macro SAVEv1x4 + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1d z26.d, p1/z, [pCRow2] + fmla z26.d, p1/m, z18.d, alphaZ + st1d z26.d, p1, [pCRow2] + + ld1d z27.d, p1/z, [pCRow1] + fmla z27.d, p1/m, z19.d, alphaZ + st1d z27.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.d, #0 + dup z17.d, #0 +.endm + +.macro KERNELv1x2_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + ld1rd z9.d, p0/z, [pB, 8] + + add pB, pB, 16 + + fmla 
z16.d, p1/m, z0.d, z8.d + fmla z17.d, p1/m, z0.d, z9.d + +.endm + +.macro SAVEv1x2 + add pCRow1, pCRow0, LDC + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + ld1d z25.d, p1/z, [pCRow1] + fmla z25.d, p1/m, z17.d, alphaZ + st1d z25.d, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.d, #0 +.endm + +.macro KERNELv1x1_SUB + ld1d z0.d, p1/z, [pA1] + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 8 + + ld1rd z8.d, p0/z, [pB] + + add pB, pB, 8 + + fmla z16.d, p1/m, z0.d, z8.d + +.endm + +.macro SAVEv1x1 + ld1d z24.d, p1/z, [pCRow0] + fmla z24.d, p1/m, z16.d, alphaZ + st1d z24.d, p1, [pCRow0] + + add pCRow0, pCRow0, lanes, lsl #3 // pC = pC + lanes * 8 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + + fmov alpha, d0 + dup alphaZ, alpha + cntd vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #3 // ldc = ldc * 8 + ptrue p0.d // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Ldgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Ldgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN + blt .Ldgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ + + .align 5 +.Ldgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
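+	// counterL counts blocks of 8 k-iterations: the software-pipelined main loop
+	// (KERNELv2x8_I/_M1/_M2/_E) needs at least two such blocks, smaller counts take
+	// the short path at .Ldgemm_kernel_L8_Mv2_32, and any K % 8 remainder is done
+	// one step at a time with KERNELv2x8_SUB further down.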
+ blt .Ldgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv2_22a + + .align 5 +.Ldgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv2_22 + + .align 5 +.Ldgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Ldgemm_kernel_L8_Mv2_44 + + .align 5 +.Ldgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Ldgemm_kernel_L8_Mv2_44 + +.Ldgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Ldgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv2_100 + + .align 5 +.Ldgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv2_46 + +.Ldgemm_kernel_L8_Mv2_100: + + SAVEv2x8 + +.Ldgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L8_END + +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. +.Ldgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
+ blt .Ldgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Ldgemm_kernel_L8_Mv1_22a + + .align 5 +.Ldgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L8_Mv1_22 + + .align 5 +.Ldgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Ldgemm_kernel_L8_Mv1_44 + + .align 5 +.Ldgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Ldgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Ldgemm_kernel_L8_Mv1_44 + +.Ldgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Ldgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L8_Mv1_100 + + .align 5 +.Ldgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L8_Mv1_46 + +.Ldgemm_kernel_L8_Mv1_100: + + SAVEv1x8 + +.Ldgemm_kernel_L8_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + b.any .Ldgemm_kernel_L8_Mv1_20 + +.Ldgemm_kernel_L8_END: + + lsl temp, origK, #6 + add origPB, origPB, temp // B = B + K * 8 * 8 + + subs counterJ, counterJ , #1 // j-- + bgt .Ldgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Ldgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Ldgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + .align 5 +.Ldgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L4_Mv2_44 + + .align 5 +.Ldgemm_kernel_L4_Mv2_22: + + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv2_22 + +.Ldgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv2_100 + + .align 5 +.Ldgemm_kernel_L4_Mv2_46: + + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv2_46 + +.Ldgemm_kernel_L4_Mv2_100: + + SAVEv2x4 + +.Ldgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L4_END + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. +.Ldgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Ldgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L4_Mv1_44 + + .align 5 +.Ldgemm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L4_Mv1_22 + +.Ldgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L4_Mv1_100 + + .align 5 +.Ldgemm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L4_Mv1_46 + +.Ldgemm_kernel_L4_Mv1_100: + + SAVEv1x4 + +.Ldgemm_kernel_L4_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L4_Mv1_20 + + +.Ldgemm_kernel_L4_END: + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 4 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Ldgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Ldgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + .align 5 +.Ldgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Ldgemm_kernel_L2_Mv2_44 + + .align 5 +.Ldgemm_kernel_L2_Mv2_22: + + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv2_22 + +.Ldgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv2_100 + + .align 5 +.Ldgemm_kernel_L2_Mv2_46: + + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv2_46 + +.Ldgemm_kernel_L2_Mv2_100: + + SAVEv2x2 + +.Ldgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L2_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. +.Ldgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Ldgemm_kernel_L2_Mv1_44 + + .align 5 +.Ldgemm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L2_Mv1_22 + +.Ldgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L2_Mv1_100 + + .align 5 +.Ldgemm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Ldgemm_kernel_L2_Mv1_46 + +.Ldgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Ldgemm_kernel_L2_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L2_Mv1_20 + + +.Ldgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #4 // B = B + K * 2 * 8 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Ldgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Ldgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Ldgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Ldgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + .align 5 +.Ldgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Ldgemm_kernel_L1_Mv2_44 + + .align 5 +.Ldgemm_kernel_L1_Mv2_22: + + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_22 + +.Ldgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv2_100 + + .align 5 +.Ldgemm_kernel_L1_Mv2_46: + + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv2_46 + +.Ldgemm_kernel_L1_Mv2_100: + + SAVEv2x1 + +.Ldgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Ldgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Ldgemm_kernel_L1_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. +.Ldgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + + .align 5 +.Ldgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
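+    // Last column of B for this panel: each K step broadcasts a single B
+    // value and issues one predicated FMA.  (These remainder loops branch
+    // with bgt where the wider cases use bne; both are equivalent here
+    // because counterL starts positive.)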
+ ble .Ldgemm_kernel_L1_Mv1_44 + + .align 5 +.Ldgemm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_22 + +.Ldgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Ldgemm_kernel_L1_Mv1_100 + + .align 5 +.Ldgemm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Ldgemm_kernel_L1_Mv1_46 + +.Ldgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Ldgemm_kernel_L1_Mv1_END: + + incd counterI + whilelt p1.d, counterI, origM //SVE instruction + cntp lanes, p0, p1.d + b.any .Ldgemm_kernel_L1_Mv1_20 + + +.Ldgemm_kernel_L1_END: + +/******************************************************************************/ + +.Ldgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/arm64/gemm_ncopy_sve_v2x8.c b/kernel/arm64/gemm_ncopy_sve_v2x8.c new file mode 100644 index 0000000000..8130889d47 --- /dev/null +++ b/kernel/arm64/gemm_ncopy_sve_v2x8.c @@ -0,0 +1,180 @@ +/*******************************************************************************/ +/* Copyright (c) 2023, 2025, The OpenBLAS Project */ +/* All rights reserved. */ +/* Redistribution and use in source and binary forms, with or without */ +/* modification, are permitted provided that the following conditions are */ +/* met: */ +/* 1. Redistributions of source code must retain the above copyright */ +/* notice, this list of conditions and the following disclaimer. */ +/* 2. Redistributions in binary form must reproduce the above copyright */ +/* notice, this list of conditions and the following disclaimer in */ +/* the documentation and/or other materials provided with the */ +/* distribution. */ +/* 3. Neither the name of the OpenBLAS project nor the names of */ +/* its contributors may be used to endorse or promote products */ +/* derived from this software without specific prior written permission. */ +/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" */ +/* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE */ +/* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE */ +/* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE */ +/* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR */ +/* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF */ +/* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS */ +/* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */ +/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE). */ +/* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/*******************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64_t +#define SV_INDEX svuint64_t +#define SV_INDEXER svindex_u64 +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#define SV_PREFETCH svprfd_gather_index +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32_t +#define SV_INDEX svuint32_t +#define SV_INDEXER svindex_u32 +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#define SV_PREFETCH svprfw_gather_index +#endif + +#define INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec); \ + svst1(pg, b_offset, a_vec); \ + b_offset += active; \ + a_vec = svld1_gather_index(pg, a_offset_inner + sve_size * lda, lda_vec); \ + svst1(pg, b_offset, a_vec); \ + b_offset += active; \ + a_offset_inner++; + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld1_gather_index(pg, a_offset_inner, lda_vec); \ + svst1(pg, b_offset, a_vec); \ + a_offset_inner++; \ + b_offset += active; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) { + uint64_t sve_size; + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_INDEX lda_vec = SV_INDEXER(0LL, lda); + SV_TYPE a_vec; + svbool_t pg_true = SV_TRUE(); + + uint64_t double_sve_size = sve_size * 2; + BLASLONG double_vectors_n = n & -double_sve_size; + for (BLASLONG j = 0; j < double_vectors_n; j += double_sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 3; + while (i_cnt--) { + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 4) { + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + // Next two rows + a_offset += double_sve_size * lda; + } + + BLASLONG single_vectors_n = (n - double_vectors_n) & -sve_size; + if (single_vectors_n) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 3; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, 
active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 4) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size * lda; + } + + BLASLONG remaining_n = n - double_vectors_n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} diff --git a/kernel/arm64/gemm_tcopy_sve_v2x8.c b/kernel/arm64/gemm_tcopy_sve_v2x8.c new file mode 100644 index 0000000000..7f350ed67b --- /dev/null +++ b/kernel/arm64/gemm_tcopy_sve_v2x8.c @@ -0,0 +1,175 @@ +/*******************************************************************************/ +/* Copyright (c) 2023, 2025, The OpenBLAS Project */ +/* All rights reserved. */ +/* Redistribution and use in source and binary forms, with or without */ +/* modification, are permitted provided that the following conditions are */ +/* met: */ +/* 1. Redistributions of source code must retain the above copyright */ +/* notice, this list of conditions and the following disclaimer. */ +/* 2. Redistributions in binary form must reproduce the above copyright */ +/* notice, this list of conditions and the following disclaimer in */ +/* the documentation and/or other materials provided with the */ +/* distribution. */ +/* 3. Neither the name of the OpenBLAS project nor the names of */ +/* its contributors may be used to endorse or promote products */ +/* derived from this software without specific prior written permission. */ +/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" */ +/* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE */ +/* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE */ +/* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE */ +/* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR */ +/* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF */ +/* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS */ +/* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */ +/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE). */ +/* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. 
*/ +/*******************************************************************************/ + +#include +#include +#include + +#include "common.h" + +#ifdef DOUBLE +#define COUNT "cntd" +#define SV_TYPE svfloat64_t +#define SV_TRUE svptrue_b64 +#define SV_WHILE svwhilelt_b64 +#else +#define COUNT "cntw" +#define SV_TYPE svfloat32_t +#define SV_TRUE svptrue_b32 +#define SV_WHILE svwhilelt_b32 +#endif + +#define INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld1(pg, a_offset_inner); \ + svst1(pg, b_offset, a_vec); \ + b_offset += active; \ + a_vec = svld1(pg, a_offset_inner + sve_size); \ + svst1(pg, b_offset, a_vec); \ + a_offset_inner += lda; \ + b_offset += active; + +#define INNER_COPY(pg, a_offset_inner, b_offset, lda, active) \ + a_vec = svld1(pg, a_offset_inner); \ + svst1(pg, b_offset, a_vec); \ + a_offset_inner += lda; \ + b_offset += active; + +int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b){ + uint64_t sve_size = svcntw(); + asm(COUNT" %[SIZE_]" : [SIZE_] "=r" (sve_size) : : ); + + IFLOAT *a_offset, *a_offset_inner, *b_offset; + a_offset = a; + b_offset = b; + + SV_TYPE a_vec; + svbool_t pg_true = SV_TRUE(); + + uint64_t double_sve_size = sve_size * 2; + BLASLONG double_vectors_n = n & -double_sve_size; + for (BLASLONG j = 0; j < double_vectors_n; j += double_sve_size) { + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 3; + while (i_cnt--) { + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 4) { + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_INTERLEAVE_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + // Next two rows + a_offset += double_sve_size; + } + + BLASLONG single_vectors_n = (n - double_vectors_n) & -sve_size; + if (single_vectors_n) { + + a_offset_inner = a_offset; + + svbool_t pg = pg_true; + uint64_t active = sve_size; + uint64_t i_cnt = m >> 3; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 4) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, 
active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + a_offset += sve_size; + } + + BLASLONG remaining_n = n - double_vectors_n - single_vectors_n; + if (remaining_n) { + a_offset_inner = a_offset; + svbool_t pg = SV_WHILE((uint64_t)0L, (uint64_t)remaining_n); + uint64_t active = remaining_n; + uint64_t i_cnt = m >> 2; + while (i_cnt--) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 2) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + + if (m & 1) { + INNER_COPY(pg, a_offset_inner, b_offset, lda, active); + } + } + + return 0; +} + diff --git a/kernel/arm64/sgemm_kernel_inter_sve_v2x8.S b/kernel/arm64/sgemm_kernel_inter_sve_v2x8.S new file mode 100644 index 0000000000..2ea1850fe9 --- /dev/null +++ b/kernel/arm64/sgemm_kernel_inter_sve_v2x8.S @@ -0,0 +1,1519 @@ +/*******************************************************************************/ +/* Copyright (c) 2025, The OpenBLAS Project */ +/* All rights reserved. */ +/* Redistribution and use in source and binary forms, with or without */ +/* modification, are permitted provided that the following conditions are */ +/* met: */ +/* 1. Redistributions of source code must retain the above copyright */ +/* notice, this list of conditions and the following disclaimer. */ +/* 2. Redistributions in binary form must reproduce the above copyright */ +/* notice, this list of conditions and the following disclaimer in */ +/* the documentation and/or other materials provided with the */ +/* distribution. */ +/* 3. Neither the name of the OpenBLAS project nor the names of */ +/* its contributors may be used to endorse or promote products */ +/* derived from this software without specific prior written permission. */ +/* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" */ +/* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE */ +/* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE */ +/* ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE */ +/* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR */ +/* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF */ +/* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS */ +/* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN */ +/* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE). */ +/* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/*******************************************************************************/ + +/* This is an SVE sgemm kernel with size 2*SVE_LEN x 8. +However, the data layout is the same as for the kernel 1*SVE_LEN x 8. +This means that we sweep two panels of packed A when iterating in a loop over K. +With this approach, we can reuse sgemm_n|tcopy_sve_v1.c packing functions. 
*/ + +#define ASSEMBLER +#include "common.h" + +/* X0 X1 X2 s0 X3 x4 x5 x6 */ +/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha0,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc )*/ + +#define origM x0 +#define origN x1 +#define origK x2 +#define origPA x3 +#define origPB x4 +#define pC x5 +#define LDC x6 +#define temp x7 +#define counterL x8 +#define counterI x9 +#define counterJ x10 +#define pB x11 +#define pCRow0 x12 +#define pCRow1 x13 +#define pCRow2 x14 + +#define lanes x15 +#define pA1 x16 +#define pA2 x17 +#define alpha w18 +#define vec_len x19 +#define vec_lenx2 x20 + +#define alpha0 s10 +#define alphaZ z7.s + +#define A_PRE_SIZE 1536 +#define B_PRE_SIZE 512 +#define C_PRE_SIZE 128 + +// 00 origM +// 01 origN +// 02 origK +// 03 origPA +// 04 origPB +// 05 pC +// 06 origLDC -> LDC +// 07 temp +// 08 counterL +// 09 counterI +// 10 counterJ +// 11 pB +// 12 pCRow0 +// 13 pCRow1 +// 14 pCRow2 +// 15 lanes +// 16 pA1 +// 17 pA1 +// 18 must save alpha +// 19 must save vec_len +// 20 must save +// 21 must save +// 22 must save +// 23 must save +// 24 must save +// 25 must save +// 26 must save +// 27 must save +// 28 must save +// 29 frame +// 30 link +// 31 sp + +//v00 ALPHA -> pA10_0 +//v01 pA10_1 +//v02 pA20_0 +//v03 pA20_1 +//v04 +//v05 +//v06 +//v07 ALPHA0 +//v08 must save pB0_0 +//v09 must save pB0_1 +//v10 must save pB0_2 +//v11 must save pB0_3 +//v12 must save pB0_4 +//v13 must save pB0_5 +//v14 must save pB0_6 +//v15 must save pB0_7 +//v16 must save C0 +//v17 must save C1 +//v18 must save C2 +//v19 must save C3 +//v20 must save C4 +//v21 must save C5 +//v22 must save C6 +//v23 must save C7 +//v24 must save C8 +//v25 must save C9 +//v26 must save C10 +//v27 must save C11 +//v28 must save C12 +//v29 must save C13 +//v30 must save C14 +//v31 must save C15 + +/******************************************************************************* +* Macro definitions +*******************************************************************************/ + +.macro INITv2x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 + dup z24.s, #0 + dup z25.s, #0 + dup z26.s, #0 + dup z27.s, #0 + dup z28.s, #0 + dup z29.s, #0 + dup z30.s, #0 + dup z31.s, #0 +.endm + +.macro KERNELv2x8_I + ld1w z0.s, p0/z, [pA1] // start + ld1w z1.s, p0/z, [pA1, #1, MUL VL] // pA1 + vec + ld1w z2.s, p0/z, [pA1, #2, MUL VL] // pA1 + vec * 2 + ld1w z3.s, p0/z, [pA1, #3, MUL VL] // pA1 + vec * 3 + add pA1, pA1, vec_len, lsl #4 // pA1 = pA1 + vec_len * 4 * 2 * 2 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add 
pB, pB, 32 +.endm + +.macro KERNELv2x8_M1 + ld1w z2.s, p0/z, [pA1] + ld1w z3.s, p0/z, [pA1, vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 * 2 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_M2 + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA1, vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 * 2 + + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv2x8_E + fmla z16.s, p0/m, z2.s, z8.s + fmla z17.s, p0/m, z3.s, z8.s + fmla z18.s, p0/m, z2.s, z9.s + fmla z19.s, p0/m, z3.s, z9.s + fmla z20.s, p0/m, z2.s, z10.s + fmla z21.s, p0/m, z3.s, z10.s + fmla z22.s, p0/m, z2.s, z11.s + fmla z23.s, p0/m, z3.s, z11.s + fmla z24.s, p0/m, z2.s, z12.s + fmla z25.s, p0/m, z3.s, z12.s + fmla z26.s, p0/m, z2.s, z13.s + fmla z27.s, p0/m, z3.s, z13.s + fmla z28.s, p0/m, z2.s, z14.s + fmla z29.s, p0/m, z3.s, z14.s + fmla z30.s, p0/m, z2.s, z15.s + fmla z31.s, p0/m, z3.s, z15.s +.endm + +.macro KERNELv2x8_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA1, vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 * 2 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s + fmla z24.s, p0/m, z0.s, z12.s + fmla z25.s, p0/m, z1.s, z12.s + fmla z26.s, p0/m, z0.s, z13.s + fmla z27.s, p0/m, z1.s, z13.s + fmla z28.s, p0/m, z0.s, z14.s + fmla z29.s, p0/m, z1.s, z14.s + fmla z30.s, p0/m, z0.s, z15.s + fmla z31.s, p0/m, z1.s, z15.s +.endm + +.macro SAVEv2x8 + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, 
[pCRow0, #1, mul vl] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + + add pCRow2, pCRow1, LDC + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow1, pCRow2, LDC + ld1w z8.s, p0/z, [pCRow2] + ld1w z9.s, p0/z, [pCRow2, #1, mul vl] + fmla z8.s, p0/m, z24.s, alphaZ + fmla z9.s, p0/m, z25.s, alphaZ + st1w z8.s, p0, [pCRow2] + st1w z9.s, p0, [pCRow2, #1, mul vl] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z26.s, alphaZ + fmla z11.s, p0/m, z27.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z28.s, alphaZ + fmla z13.s, p0/m, z29.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z30.s, alphaZ + fmla z15.s, p0/m, z31.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv2x4_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA1, vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 * 2 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s + fmla z20.s, p0/m, z0.s, z10.s + fmla z21.s, p0/m, z1.s, z10.s + fmla z22.s, p0/m, z0.s, z11.s + fmla z23.s, p0/m, z1.s, z11.s +.endm + +.macro SAVEv2x4 + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + + add pCRow2, pCRow1, LDC + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + + add pCRow1, pCRow2, LDC + ld1w z12.s, p0/z, [pCRow2] + ld1w z13.s, p0/z, [pCRow2, #1, mul vl] + fmla z12.s, p0/m, z20.s, alphaZ + fmla z13.s, p0/m, z21.s, alphaZ + st1w z12.s, p0, [pCRow2] + st1w z13.s, p0, [pCRow2, #1, mul vl] + + ld1w z14.s, p0/z, [pCRow1] + ld1w z15.s, p0/z, [pCRow1, #1, mul vl] + fmla z14.s, p0/m, z22.s, alphaZ + fmla z15.s, p0/m, z23.s, alphaZ + st1w z14.s, p0, [pCRow1] + st1w z15.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv2x2 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv2x2_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA1, 
vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 * 2 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s + fmla z18.s, p0/m, z0.s, z9.s + fmla z19.s, p0/m, z1.s, z9.s +.endm + +.macro SAVEv2x2 + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + + ld1w z10.s, p0/z, [pCRow1] + ld1w z11.s, p0/z, [pCRow1, #1, mul vl] + fmla z10.s, p0/m, z18.s, alphaZ + fmla z11.s, p0/m, z19.s, alphaZ + st1w z10.s, p0, [pCRow1] + st1w z11.s, p0, [pCRow1, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 +.endm + +.macro INITv2x1 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv2x1_SUB + ld1w z0.s, p0/z, [pA1] + ld1w z1.s, p0/z, [pA1, vec_len, lsl #2] + add pA1, pA1, vec_len, lsl #3 // pA1 = pA1 + vec_len * 4 * 2 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p0/m, z0.s, z8.s + fmla z17.s, p0/m, z1.s, z8.s +.endm + +.macro SAVEv2x1 + add pCRow1, pCRow0, LDC + ld1w z8.s, p0/z, [pCRow0] + ld1w z9.s, p0/z, [pCRow0, #1, mul vl] + fmla z8.s, p0/m, z16.s, alphaZ + fmla z9.s, p0/m, z17.s, alphaZ + st1w z8.s, p0, [pCRow0] + st1w z9.s, p0, [pCRow0, #1, mul vl] + + add pCRow0, pCRow0, vec_len, lsl #3 // pC = pC + vec_len * 4 * 2 + +.endm + +.macro INITv1x8 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 + dup z20.s, #0 + dup z21.s, #0 + dup z22.s, #0 + dup z23.s, #0 +.endm + +.macro KERNELv1x8_I + ld1w z0.s, p1/z, [pA1] + ld1w z1.s, p1/z, [pA1, lanes, lsl #2] // next one + add pA1, pA1, lanes, lsl #3 // pA1 = pA1 + lanes * 2 * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M1 + ld1w z1.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z0.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z0.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z0.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z0.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z0.s, z12.s + ld1rw z12.s, p0/z, [pB, 16] + fmla z21.s, p1/m, z0.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z0.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z0.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_M2 + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + fmla z16.s, p1/m, z1.s, z8.s + ld1rw z8.s, p0/z, [pB] + fmla z17.s, p1/m, z1.s, z9.s + ld1rw z9.s, p0/z, [pB, 4] + fmla z18.s, p1/m, z1.s, z10.s + ld1rw z10.s, p0/z, [pB, 8] + fmla z19.s, p1/m, z1.s, z11.s + ld1rw z11.s, p0/z, [pB, 12] + fmla z20.s, p1/m, z1.s, z12.s + ld1rw z12.s, 
p0/z, [pB, 16] + fmla z21.s, p1/m, z1.s, z13.s + ld1rw z13.s, p0/z, [pB, 20] + fmla z22.s, p1/m, z1.s, z14.s + ld1rw z14.s, p0/z, [pB, 24] + fmla z23.s, p1/m, z1.s, z15.s + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 +.endm + +.macro KERNELv1x8_E + fmla z16.s, p1/m, z1.s, z8.s + fmla z17.s, p1/m, z1.s, z9.s + fmla z18.s, p1/m, z1.s, z10.s + fmla z19.s, p1/m, z1.s, z11.s + fmla z20.s, p1/m, z1.s, z12.s + fmla z21.s, p1/m, z1.s, z13.s + fmla z22.s, p1/m, z1.s, z14.s + fmla z23.s, p1/m, z1.s, z15.s +.endm + +.macro KERNELv1x8_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + ld1rw z12.s, p0/z, [pB, 16] + ld1rw z13.s, p0/z, [pB, 20] + ld1rw z14.s, p0/z, [pB, 24] + ld1rw z15.s, p0/z, [pB, 28] + + add pB, pB, 32 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + fmla z20.s, p1/m, z0.s, z12.s + fmla z21.s, p1/m, z0.s, z13.s + fmla z22.s, p1/m, z0.s, z14.s + fmla z23.s, p1/m, z0.s, z15.s + +.endm + +.macro SAVEv1x8 + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1w z28.s, p1/z, [pCRow2] + fmla z28.s, p1/m, z20.s, alphaZ + st1w z28.s, p1, [pCRow2] + + add pCRow2, pCRow1, LDC + ld1w z29.s, p1/z, [pCRow1] + fmla z29.s, p1/m, z21.s, alphaZ + st1w z29.s, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1w z30.s, p1/z, [pCRow2] + fmla z30.s, p1/m, z22.s, alphaZ + st1w z30.s, p1, [pCRow2] + + ld1w z31.s, p1/z, [pCRow1] + fmla z31.s, p1/m, z23.s, alphaZ + st1w z31.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x4 + dup z16.s, #0 + dup z17.s, #0 + dup z18.s, #0 + dup z19.s, #0 +.endm + +.macro KERNELv1x4_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + ld1rw z10.s, p0/z, [pB, 8] + ld1rw z11.s, p0/z, [pB, 12] + + add pB, pB, 16 + + fmla z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + fmla z18.s, p1/m, z0.s, z10.s + fmla z19.s, p1/m, z0.s, z11.s + +.endm + +.macro SAVEv1x4 + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + add pCRow2, pCRow1, LDC + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow1, pCRow2, LDC + ld1w z26.s, p1/z, [pCRow2] + fmla z26.s, p1/m, z18.s, alphaZ + st1w z26.s, p1, [pCRow2] + + ld1w z27.s, p1/z, [pCRow1] + fmla z27.s, p1/m, z19.s, alphaZ + st1w z27.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x2 + dup z16.s, #0 + dup z17.s, #0 +.endm + +.macro KERNELv1x2_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + ld1rw z9.s, p0/z, [pB, 4] + + add pB, pB, 8 + + fmla 
z16.s, p1/m, z0.s, z8.s + fmla z17.s, p1/m, z0.s, z9.s + +.endm + +.macro SAVEv1x2 + add pCRow1, pCRow0, LDC + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + ld1w z25.s, p1/z, [pCRow1] + fmla z25.s, p1/m, z17.s, alphaZ + st1w z25.s, p1, [pCRow1] + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + +/******************************************************************************/ + +.macro INITv1x1 + dup z16.s, #0 +.endm + +.macro KERNELv1x1_SUB + ld1w z0.s, p1/z, [pA1] + add pA1, pA1, lanes, lsl #2 // pA1 = pA1 + lanes * 4 + + ld1rw z8.s, p0/z, [pB] + + add pB, pB, 4 + + fmla z16.s, p1/m, z0.s, z8.s + +.endm + +.macro SAVEv1x1 + ld1w z24.s, p1/z, [pCRow0] + fmla z24.s, p1/m, z16.s, alphaZ + st1w z24.s, p1, [pCRow0] + + + add pCRow0, pCRow0, lanes, lsl #2 // pC = pC + lanes * 4 + +.endm + + +/******************************************************************************* +* End of macro definitions +*******************************************************************************/ + + PROLOGUE + + .align 5 + add sp, sp, #-(11 * 16) + stp d8, d9, [sp, #(0 * 16)] + stp d10, d11, [sp, #(1 * 16)] + stp d12, d13, [sp, #(2 * 16)] + stp d14, d15, [sp, #(3 * 16)] + stp d16, d17, [sp, #(4 * 16)] + stp x18, x19, [sp, #(5 * 16)] + stp x20, x21, [sp, #(6 * 16)] + stp x22, x23, [sp, #(7 * 16)] + stp x24, x25, [sp, #(8 * 16)] + stp x26, x27, [sp, #(9 * 16)] + str x28, [sp, #(10 * 16)] + + fmov alpha, s0 + dup alphaZ, alpha + cntw vec_len + lsl vec_lenx2, vec_len, #1 + + lsl LDC, LDC, #2 // ldc = ldc * 8 + ptrue p0.s // create true predicate + + mov pB, origPB +// Loop over N + mov counterJ, origN + asr counterJ, counterJ, #3 // J = J / 8 + cmp counterJ, #0 + ble .Lsgemm_kernel_L4_BEGIN + +/******************************************************************************/ +/* Repeat this as long as there are 8 left in N */ + + .align 5 +.Lsgemm_kernel_L8_BEGIN: + mov pCRow0, pC + + add pC, pC, LDC, lsl #3 // add 8 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L8_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 // Check if M < 2*SVE_LEN + blt .Lsgemm_kernel_L8_Mv1_BEGIN + + mov counterI, origM + +/* Until we have at least 2*SVE_LEN iters left in M, we do them with V2*8 kernel */ + + .align 5 +.Lsgemm_kernel_L8_Mv2_20: + + mov pB, origPB + INITv2x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
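+    // Illustrative reference (plain C, not assembled) for what one
+    // 2*SVE_LEN x 8 micro-tile below accumulates, assuming packed A holds
+    // 2*VL consecutive elements per K step (the paired ld1w loads) and
+    // packed B holds 8 consecutive values per K step (the eight ld1rw
+    // broadcasts):
+    //
+    //   for (k = 0; k < K; k++)
+    //     for (j = 0; j < 8; j++)
+    //       for (i = 0; i < 2*VL; i++)
+    //         acc[j][i] += packedA[k*2*VL + i] * packedB[k*8 + j];
+    //
+    // SAVEv2x8 then performs C[j][i] += alpha * acc[j][i]; this kernel never
+    // scales C by beta itself (beta is applied elsewhere in the GEMM driver).
+    //
+    // The main loop is software pipelined: KERNELv2x8_I preloads two K steps
+    // of A (z0/z1 and z2/z3), _M1 consumes z0/z1 while loading z2/z3, _M2
+    // consumes z2/z3 while loading z0/z1, and _E drains the last pair
+    // without issuing further loads.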
+ blt .Lsgemm_kernel_L8_Mv2_32 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv2_22a + + .align 5 +.Lsgemm_kernel_L8_Mv2_22: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv2_22 + + .align 5 +.Lsgemm_kernel_L8_Mv2_22a: + + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + b .Lsgemm_kernel_L8_Mv2_44 + + .align 5 +.Lsgemm_kernel_L8_Mv2_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv2_40 + + KERNELv2x8_I + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_M2 + KERNELv2x8_M1 + KERNELv2x8_E + + + b .Lsgemm_kernel_L8_Mv2_44 + +.Lsgemm_kernel_L8_Mv2_40: + + INITv2x8 + +.Lsgemm_kernel_L8_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv2_100 + + .align 5 +.Lsgemm_kernel_L8_Mv2_46: + + KERNELv2x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv2_46 + +.Lsgemm_kernel_L8_Mv2_100: + + SAVEv2x8 + +.Lsgemm_kernel_L8_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L8_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L8_END + +////////////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x8 kernel. +.Lsgemm_kernel_L8_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L8_Mv1_20: + + mov pB, origPB + INITv1x8 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #2 // is there at least 4 to do? 
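+    // Row tail: counterI is the index of the first row not handled by the
+    // 2-vector loop above, so whilelt built p1 to cover the remaining
+    // origM - counterI rows (fewer than 2*SVE_LEN of them, one vector at a
+    // time).  Loads and stores of A and C use p1, and `lanes` (from cntp)
+    // advances pA1 and pCRow0, so no scalar cleanup loop is needed.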
+ blt .Lsgemm_kernel_L8_Mv1_32 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #2 // subtract 2 + ble .Lsgemm_kernel_L8_Mv1_22a + + .align 5 +.Lsgemm_kernel_L8_Mv1_22: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L8_Mv1_22 + + .align 5 +.Lsgemm_kernel_L8_Mv1_22a: + + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + b .Lsgemm_kernel_L8_Mv1_44 + + .align 5 +.Lsgemm_kernel_L8_Mv1_32: + + tst counterL, #1 + ble .Lsgemm_kernel_L8_Mv1_40 + + KERNELv1x8_I + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_M2 + KERNELv1x8_M1 + KERNELv1x8_E + + + b .Lsgemm_kernel_L8_Mv1_44 + +.Lsgemm_kernel_L8_Mv1_40: + + INITv1x8 + +.Lsgemm_kernel_L8_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L8_Mv1_100 + + .align 5 +.Lsgemm_kernel_L8_Mv1_46: + + KERNELv1x8_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L8_Mv1_46 + +.Lsgemm_kernel_L8_Mv1_100: + + SAVEv1x8 + +.Lsgemm_kernel_L8_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + b.any .Lsgemm_kernel_L8_Mv1_20 + +.Lsgemm_kernel_L8_END: + + lsl temp, origK, #5 + add origPB, origPB, temp // B = B + K * 8 * 4 + + subs counterJ, counterJ , #1 // j-- + bgt .Lsgemm_kernel_L8_BEGIN + +/******************************************************************************/ +/* Repeat the same thing if 4 left in N */ + + .align 5 +.Lsgemm_kernel_L4_BEGIN: + + mov counterJ , origN + tst counterJ , #4 + ble .Lsgemm_kernel_L2_BEGIN + + + mov pCRow0, pC + + add pC, pC, LDC, lsl #2 // add 4 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L4_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L4_Mv1_BEGIN + + mov counterI, origM + + .align 5 +.Lsgemm_kernel_L4_Mv2_20: + + mov pB, origPB + INITv2x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L4_Mv2_44 + + .align 5 +.Lsgemm_kernel_L4_Mv2_22: + + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv2_22 + +.Lsgemm_kernel_L4_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv2_100 + + .align 5 +.Lsgemm_kernel_L4_Mv2_46: + + KERNELv2x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv2_46 + +.Lsgemm_kernel_L4_Mv2_100: + + SAVEv2x4 + +.Lsgemm_kernel_L4_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L4_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L4_END + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x4 kernel. +.Lsgemm_kernel_L4_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s // lanes contain number of active SVE lanes in M dimension + + .align 5 +.Lsgemm_kernel_L4_Mv1_20: + + mov pB, origPB + INITv1x4 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Lsgemm_kernel_L4_Mv1_44 + + .align 5 +.Lsgemm_kernel_L4_Mv1_22: + + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L4_Mv1_22 + +.Lsgemm_kernel_L4_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L4_Mv1_100 + + .align 5 +.Lsgemm_kernel_L4_Mv1_46: + + KERNELv1x4_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L4_Mv1_46 + +.Lsgemm_kernel_L4_Mv1_100: + + SAVEv1x4 + +.Lsgemm_kernel_L4_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L4_Mv1_20 + + +.Lsgemm_kernel_L4_END: + lsl temp, origK, #4 + add origPB, origPB, temp // B = B + K * 4 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 2 left in N */ + + .align 5 +.Lsgemm_kernel_L2_BEGIN: + + mov counterJ , origN + tst counterJ , #2 + ble .Lsgemm_kernel_L1_BEGIN + + mov pCRow0, pC + + add pC, pC, LDC, lsl #1 // add 2 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L2_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L2_Mv1_BEGIN + + mov counterI, origM + + .align 5 +.Lsgemm_kernel_L2_Mv2_20: + + mov pB, origPB + INITv2x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? + ble .Lsgemm_kernel_L2_Mv2_44 + + .align 5 +.Lsgemm_kernel_L2_Mv2_22: + + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv2_22 + +.Lsgemm_kernel_L2_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv2_100 + + .align 5 +.Lsgemm_kernel_L2_Mv2_46: + + KERNELv2x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv2_46 + +.Lsgemm_kernel_L2_Mv2_100: + + SAVEv2x2 + +.Lsgemm_kernel_L2_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L2_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L2_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x2 kernel. +.Lsgemm_kernel_L2_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L2_Mv1_20: + + mov pB, origPB + INITv1x2 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 4 to do? 
+ ble .Lsgemm_kernel_L2_Mv1_44 + + .align 5 +.Lsgemm_kernel_L2_Mv1_22: + + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L2_Mv1_22 + +.Lsgemm_kernel_L2_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L2_Mv1_100 + + .align 5 +.Lsgemm_kernel_L2_Mv1_46: + + KERNELv1x2_SUB + + subs counterL, counterL, #1 + bne .Lsgemm_kernel_L2_Mv1_46 + +.Lsgemm_kernel_L2_Mv1_100: + + SAVEv1x2 + +.Lsgemm_kernel_L2_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L2_Mv1_20 + + +.Lsgemm_kernel_L2_END: + add origPB, origPB, origK, lsl #3 // B = B + K * 2 * 4 + +/******************************************************************************/ +/* Repeat the same thing if 1 left in N */ + + .align 5 +.Lsgemm_kernel_L1_BEGIN: + + mov counterJ , origN + tst counterJ , #1 + ble .Lsgemm_kernel_L999 // done + + mov pCRow0, pC + + add pC, pC, LDC // add 1 x LDC + + mov pA1, origPA // pA1 = start of A array + +.Lsgemm_kernel_L1_Mv2_BEGIN: + + mov counterI, #0 + cmp origM, vec_lenx2 + blt .Lsgemm_kernel_L1_Mv1_BEGIN + + mov counterI, origM + + .align 5 +.Lsgemm_kernel_L1_Mv2_20: + + mov pB, origPB + INITv2x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? + ble .Lsgemm_kernel_L1_Mv2_44 + + .align 5 +.Lsgemm_kernel_L1_Mv2_22: + + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_22 + +.Lsgemm_kernel_L1_Mv2_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv2_100 + + .align 5 +.Lsgemm_kernel_L1_Mv2_46: + + KERNELv2x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv2_46 + +.Lsgemm_kernel_L1_Mv2_100: + + SAVEv2x1 + +.Lsgemm_kernel_L1_Mv2_END: + sub counterI, counterI, vec_lenx2 + cmp counterI, vec_lenx2 + bge .Lsgemm_kernel_L1_Mv2_20 + sub counterI, origM, counterI + + cmp counterI, origM + beq .Lsgemm_kernel_L1_END + + +////////////////////////////////// +// We have less than 2*SVE_LEN left. We do this with V1x1 kernel. +.Lsgemm_kernel_L1_Mv1_BEGIN: + + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + + .align 5 +.Lsgemm_kernel_L1_Mv1_20: + + mov pB, origPB + INITv1x1 // fill with zeros + + asr counterL , origK, #3 // L = K / 8 + cmp counterL , #0 // is there at least 8 to do? 
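+    // Final remainder case: one column of B and a partial vector of rows.
+    // When this loop finishes, control falls through .Lsgemm_kernel_L1_END
+    // to .Lsgemm_kernel_L999, which restores the callee-saved registers
+    // from the stack and returns 0.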
+ ble .Lsgemm_kernel_L1_Mv1_44 + + .align 5 +.Lsgemm_kernel_L1_Mv1_22: + + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_22 + +.Lsgemm_kernel_L1_Mv1_44: + + ands counterL , origK, #7 + ble .Lsgemm_kernel_L1_Mv1_100 + + .align 5 +.Lsgemm_kernel_L1_Mv1_46: + + KERNELv1x1_SUB + + subs counterL, counterL, #1 + bgt .Lsgemm_kernel_L1_Mv1_46 + +.Lsgemm_kernel_L1_Mv1_100: + + SAVEv1x1 + +.Lsgemm_kernel_L1_Mv1_END: + + incw counterI + whilelt p1.s, counterI, origM //SVE instruction + cntp lanes, p0, p1.s + b.any .Lsgemm_kernel_L1_Mv1_20 + + +.Lsgemm_kernel_L1_END: + +/******************************************************************************/ + +.Lsgemm_kernel_L999: + mov x0, #0 // set return value + ldp d8, d9, [sp, #(0 * 16)] + ldp d10, d11, [sp, #(1 * 16)] + ldp d12, d13, [sp, #(2 * 16)] + ldp d14, d15, [sp, #(3 * 16)] + ldp d16, d17, [sp, #(4 * 16)] + ldp x18, x19, [sp, #(5 * 16)] + ldp x20, x21, [sp, #(6 * 16)] + ldp x22, x23, [sp, #(7 * 16)] + ldp x24, x25, [sp, #(8 * 16)] + ldp x26, x27, [sp, #(9 * 16)] + ldr x28, [sp, #(10 * 16)] + add sp, sp, #(11*16) + ret + + EPILOGUE + diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index e60a5d65b3..cf4be48970 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -229,7 +229,7 @@ gotoblas_t TABLE_NAME = { ssyrk_direct_alpha_betaLTTS, #endif - sgemm_kernelTS, sgemm_betaTS, + sgemm_kernelTS, sgemm_betaTS, scomm_kernelTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N sgemm_incopyTS, sgemm_itcopyTS, #else @@ -267,6 +267,7 @@ gotoblas_t TABLE_NAME = { strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, #endif + scomm_ncopyTS, scomm_tcopyTS, strmm_ounucopyTS, strmm_ounncopyTS, strmm_outucopyTS, strmm_outncopyTS, strmm_olnucopyTS, strmm_olnncopyTS, strmm_oltucopyTS, strmm_oltncopyTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N @@ -321,7 +322,7 @@ gotoblas_t TABLE_NAME = { #endif #if (BUILD_DOUBLE==1) || (BUILD_COMPLEX16==1) - dgemm_kernelTS, dgemm_betaTS, + dgemm_kernelTS, dgemm_betaTS, dcomm_kernelTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N dgemm_incopyTS, dgemm_itcopyTS, #else @@ -356,6 +357,7 @@ gotoblas_t TABLE_NAME = { dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS, dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS, #endif + dcomm_ncopyTS, dcomm_tcopyTS, dtrmm_ounucopyTS, dtrmm_ounncopyTS, dtrmm_outucopyTS, dtrmm_outncopyTS, dtrmm_olnucopyTS, dtrmm_olnncopyTS, dtrmm_oltucopyTS, dtrmm_oltncopyTS, #if DGEMM_DEFAULT_UNROLL_M != DGEMM_DEFAULT_UNROLL_N @@ -467,6 +469,7 @@ gotoblas_t TABLE_NAME = { #if (BUILD_COMPLEX) cgemm_kernel_nTS, cgemm_kernel_lTS, cgemm_kernel_rTS, cgemm_kernel_bTS, cgemm_betaTS, + ccomm_kernel_nTS, ccomm_kernel_lTS, ccomm_kernel_rTS, ccomm_kernel_bTS, #if CGEMM_DEFAULT_UNROLL_M != CGEMM_DEFAULT_UNROLL_N cgemm_incopyTS, cgemm_itcopyTS, #else @@ -512,6 +515,7 @@ gotoblas_t TABLE_NAME = { ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS, ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS, #endif + ccomm_ncopyTS, ccomm_tcopyTS, ctrmm_ounucopyTS, ctrmm_ounncopyTS, ctrmm_outucopyTS, ctrmm_outncopyTS, ctrmm_olnucopyTS, ctrmm_olnncopyTS, ctrmm_oltucopyTS, ctrmm_oltncopyTS, @@ -623,6 +627,7 @@ gotoblas_t TABLE_NAME = { zgemm_kernel_nTS, zgemm_kernel_lTS, 
zgemm_kernel_rTS, zgemm_kernel_bTS, zgemm_betaTS, + zcomm_kernel_nTS, zcomm_kernel_lTS, zcomm_kernel_rTS, zcomm_kernel_bTS, #if ZGEMM_DEFAULT_UNROLL_M != ZGEMM_DEFAULT_UNROLL_N zgemm_incopyTS, zgemm_itcopyTS, @@ -666,6 +671,7 @@ gotoblas_t TABLE_NAME = { ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS, ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS, #endif + zcomm_ncopyTS, zcomm_tcopyTS, ztrmm_ounucopyTS, ztrmm_ounncopyTS, ztrmm_outucopyTS, ztrmm_outncopyTS, ztrmm_olnucopyTS, ztrmm_olnncopyTS, ztrmm_oltucopyTS, ztrmm_oltncopyTS,