介绍
我有一个相当简单的Cython模块--为了进行特定的测试,我进一步简化了它.
这个模块只有一个类,它只有一个有趣的方法(名为run
):这个方法接受一个Fortran排序的2D NumPy数组和两个1D NumPy数组作为输入,并在这些数组上做一些非常非常简单的事情(参见下面的代码).
为了进行基准测试,我在Windows 10 64位、Python3.9.10 64位、Cython0.29.32上与MSVC、GCC 8、GCC 11、GCC 12和GCC 13编译了完全相同的模块.我从优秀的Brecht Sanders GitHub页面(https://github.com/brechtsanders)获得的所有GCC编译器.
主要问题
这篇很长的帖子的首要问题是:我只是好奇,是否有人能解释为什么GCC12和GCC13比GCC11慢得多(GCC11是最快的).看起来GCC的每一期演出都在下降,而不是变得更好……
基准
在基准测试中,我简单地改变了2D和1D数组的数组维度(m
和n
)以及2D和1D数组中非零条目的数量.我 for each 编译器版本重复了20次run
方法,每组m
和n
条目以及非零条目.
我正在使用的优化设置:
MVSC个
MSVC_EXTRA_COMPILE_ARGS = ['/O2', '/GS-', '/fp:fast', '/Ob2', '/nologo', '/arch:AVX512', '/Ot', '/GL']
GCC个
GCC_EXTRA_COMPILE_ARGS = ['-Ofast', '-funroll-loops', '-flto', '-ftree-vectorize', '-march=native', '-fno-asynchronous-unwind-tables']
GCC_EXTRA_LINK_ARGS = ['-flto'] + GCC_EXTRA_COMPILE_ARGS
我观察到的是以下几点:
- 到目前为止,MSVC在执行基准测试方面是最慢的(为什么会在Windows上?)
- 由于GCC11比GCC8更快,因此GCC8-&GCC11的发展前景看好
- GCC12和GCC13都比GCC11慢得多,其中GCC13最差(比GCC11慢一倍,比GCC12差得多)
Table of Results:个
Runtimes are in milliseconds (ms)个个
Graph (NOTE: Logarithmic Y axis!!):个
Runtimes are in milliseconds (ms)个个
代码
Cython file:个
###############################################################################
import numpy as np
cimport numpy as np
import cython
from cython.view cimport array as cvarray
from libc.float cimport FLT_MAX
DTYPE_float = np.float32
ctypedef np.float32_t DTYPE_float_t
cdef float SMALL = 1e-10
cdef int MAXSIZE = 1000000
cdef extern from "math.h" nogil:
cdef float fabsf(float x)
###############################################################################
@cython.final
cdef class CythonLoops:
cdef int m, n
cdef int [:] col_starts
cdef int [:] row_indices
cdef double [:] x
def __cinit__(self):
self.m = 0
self.n = 0
self.col_starts = cvarray(shape=(MAXSIZE,), itemsize=sizeof(int), format='i')
self.row_indices = cvarray(shape=(MAXSIZE,), itemsize=sizeof(int), format='i')
self.x = cvarray(shape=(MAXSIZE,), itemsize=sizeof(double), format='d')
@cython.boundscheck(False) # turn off bounds-checking for entire function
@cython.wraparound(False) # turn off negative index wrapping for entire function
@cython.nonecheck(False)
@cython.initializedcheck(False)
@cython.cdivision(True)
cpdef run(self, DTYPE_float_t[::1, :] matrix,
DTYPE_float_t[:] ub_values,
DTYPE_float_t[:] priority):
cdef Py_ssize_t i, j, m, n
cdef int nza, collen
cdef double too_large, ok, obj
cdef float ub, element
cdef int [:] col_starts = self.col_starts
cdef int [:] row_indices = self.row_indices
cdef double [:] x = self.x
m = matrix.shape[0]
n = matrix.shape[1]
self.m = m
self.n = n
nza = 0
collen = 0
for i in range(n):
for j in range(m+1):
if j == 0:
element = priority[i]
else:
element = matrix[j-1, i]
if fabsf(element) < SMALL:
continue
if j == 0:
obj = <double>element
# Do action 1 with external library
else:
collen = nza + 1
col_starts[collen] = i+1
row_indices[collen] = j
x[collen] = <double>element
nza += 1
ub = ub_values[i]
if ub > FLT_MAX:
too_large = 0.0
# Do action 2 with external library
elif ub > SMALL:
ok = <double>ub
# Do action 3 with external library
# Use x, row_indices and col_starts in the external library
Setup file:个
我使用以下代码来编译它:
python setup.py build_ext --inplace --compiler=mingw32 gcc13
其中最后一个参数是我要测试的编译器
#!/usr/bin/env python
from setuptools import setup
from setuptools import Extension
from Cython.Build import cythonize
from Cython.Distutils import build_ext
import numpy as np
import os
import shutil
import sys
import getpass
MODULE = 'loop_cython_%s'
GCC_EXTRA_COMPILE_ARGS = ['-Ofast', '-funroll-loops', '-flto', '-ftree-vectorize', '-march=native', '-fno-asynchronous-unwind-tables']
GCC_EXTRA_LINK_ARGS = ['-flto'] + GCC_EXTRA_COMPILE_ARGS
MSVC_EXTRA_COMPILE_ARGS = ['/O2', '/GS-', '/fp:fast', '/Ob2', '/nologo', '/arch:AVX512', '/Ot', '/GL']
MSVC_EXTRA_LINK_ARGS = MSVC_EXTRA_COMPILE_ARGS
def remove_builds(kind):
for folder in ['build', 'bin']:
if os.path.isdir(folder):
if folder == 'bin':
continue
shutil.rmtree(folder, ignore_errors=True)
if os.path.isfile(MODULE + '_%s.c'%kind):
os.remove(MODULE + '_%s.c'%kind)
def setenv(extra_args, doset=True, path=None, kind='gcc8'):
flags = ''
if doset:
flags = ' '.join(extra_args)
for key in ['CFLAGS', 'FFLAGS', 'CPPFLAGS']:
os.environ[key] = flags
user = getpass.getuser()
if doset:
path = os.environ['PATH']
if kind == 'gcc8':
os.environ['PATH'] = r'C:\Users\%s\Tools\MinGW64_8.0\bin;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\%s\WinPython39\WPy64-39100\python-3.9.10.amd64;'%(user, user)
elif kind == 'gcc11':
os.environ['PATH'] = r'C:\Users\%s\Tools\MinGW64\bin;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\%s\WinPython39\WPy64-39100\python-3.9.10.amd64;'%(user, user)
elif kind == 'gcc12':
os.environ['PATH'] = r'C:\Users\%s\Tools\MinGW64_12.2.0\bin;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\%s\WinPython39\WPy64-39100\python-3.9.10.amd64;'%(user, user)
elif kind == 'gcc13':
os.environ['PATH'] = r'C:\Users\%s\Tools\MinGW64_13.2.0\bin;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\%s\WinPython39\WPy64-39100\python-3.9.10.amd64;'%(user, user)
elif kind == 'msvc':
os.environ['PATH'] = r'C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\bin\Hostx64\x64;C:\WINDOWS\system32;C:\WINDOWS;C:\Users\J0514162\WinPython39\WPy64-39100\python-3.9.10.amd64;C:\Program Files (x86)\Windows Kits\10\bin\10.0.22000.0\x64'
os.environ['LIB'] = r'C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\lib\x64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22000.0\um\x64;C:\Program Files (x86)\Windows Kits\10\Lib\10.0.22000.0\ucrt\x64'
os.environ["DISTUTILS_USE_SDK"] = '1'
os.environ["MSSdk"] = '1'
else:
os.environ['PATH'] = path
return path
class CustomBuildExt(build_ext):
def build_extensions(self):
# Override the compiler executables. Importantly, this
# removes the "default" compiler flags that would
# otherwise get passed on to to the compiler, i.e.,
# distutils.sysconfig.get_var("CFLAGS").
self.compiler.set_executable("compiler_so", "gcc -mdll -O -Wall -DMS_WIN64")
self.compiler.set_executable("compiler_cxx", "g++ -O -Wall -DMS_WIN64")
self.compiler.set_executable("linker_so", "gcc -shared -static")
self.compiler.dll_libraries = []
build_ext.build_extensions(self)
if __name__ == '__main__':
os.system('cls')
kind = None
for arg in sys.argv:
if arg.strip() in ['gcc8', 'gcc11', 'gcc12', 'gcc13', 'msvc']:
kind = arg
sys.argv.remove(arg)
break
base_file = os.path.join(os.getcwd(), MODULE[0:-3])
source = base_file + '.pyx'
target = base_file + '_%s.pyx'%kind
shutil.copyfile(source, target)
if kind == 'msvc':
extra_compile_args = MSVC_EXTRA_COMPILE_ARGS[:]
extra_link_args = MSVC_EXTRA_LINK_ARGS[:] + ['/MANIFEST']
else:
extra_compile_args = GCC_EXTRA_COMPILE_ARGS[:]
extra_link_args = GCC_EXTRA_LINK_ARGS[:]
path = setenv(extra_compile_args, kind=kind)
remove_builds(kind)
define_macros = [('WIN32', 1)]
nname = MODULE%kind
include_dirs = [np.get_include()]
if kind == 'msvc':
include_dirs += [r'C:\Program Files (x86)\Windows Kits\10\Include\10.0.22000.0\ucrt',
r'C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.35.32215\include',
r'C:\Program Files (x86)\Windows Kits\10\Include\10.0.22000.0\shared']
extensions = [
Extension(nname, [nname + '.pyx'],
extra_compile_args=extra_compile_args,
extra_link_args=extra_link_args,
include_dirs=include_dirs,
define_macros=define_macros)]
# build the core extension(s)
setup_kwargs = {'ext_modules': cythonize(extensions,
compiler_directives={'embedsignature' : False,
'boundscheck' : False,
'wraparound' : False,
'initializedcheck': False,
'cdivision' : True,
'language_level' : '3str',
'nonecheck' : False},
force=True,
cache=False,
quiet=False)}
if kind != 'msvc':
setup_kwargs['cmdclass'] = {'build_ext': CustomBuildExt}
setup(**setup_kwargs)
setenv([], False, path)
remove_builds(kind)
Test code:个
import os
import numpy
import time
import loop_cython_msvc as msvc
import loop_cython_gcc8 as gcc8
import loop_cython_gcc11 as gcc11
import loop_cython_gcc12 as gcc12
import loop_cython_gcc13 as gcc13
# M N NNZ(matrix) NNZ(priority) NNZ(ub)
DIMENSIONS = [(1661 , 2608 , 3560 , 375 , 2488 ),
(2828 , 3512 , 4333 , 413 , 2973 ),
(780 , 985 , 646 , 23 , 984 ),
(799 , 1558 , 1883 , 301 , 1116 ),
(399 , 540 , 388 , 44 , 517 ),
(10545, 10486, 14799 , 1053 , 10041),
(3369 , 3684 , 3684 , 256 , 3242 ),
(2052 , 5513 , 4772 , 1269 , 3319 ),
(224 , 628 , 1345 , 396 , 594 ),
(553 , 1475 , 1315 , 231 , 705 )]
def RunTest():
print('M N NNZ MSVC GCC 8 GCC 11 GCC 12 GCC 13')
for m, n, nnz_mat, nnz_priority, nnz_ub in DIMENSIONS:
print('%-6d %-6d %-8d'%(m, n, nnz_mat), end='')
for solver, label in zip([msvc, gcc8, gcc11, gcc12, gcc13], ['MSVC', 'GCC 8', 'GCC 11', 'GCC 12', 'GCC 13']):
numpy.random.seed(123456)
size = m*n
idxes = numpy.arange(size)
matrix = numpy.zeros((size, ), dtype=numpy.float32)
idx_mat = numpy.random.choice(idxes, nnz_mat)
matrix[idx_mat] = numpy.random.uniform(0, 1000, size=(nnz_mat, ))
matrix = numpy.asfortranarray(matrix.reshape((m, n)))
idxes = numpy.arange(m)
priority = numpy.zeros((m, ), dtype=numpy.float32)
idx_pri = numpy.random.choice(idxes, nnz_priority)
priority[idx_pri] = numpy.random.uniform(0, 1000, size=(nnz_priority, ))
idxes = numpy.arange(n)
ub_values = numpy.inf*numpy.ones((n, ), dtype=numpy.float32)
idx_ub = numpy.random.choice(idxes, nnz_ub)
ub_values[idx_ub] = numpy.random.uniform(0, 1000, size=(nnz_ub, ))
solver = solver.CythonLoops()
time_tot = []
for i in range(20):
start = time.perf_counter()
solver.run(matrix, ub_values, priority)
elapsed = time.perf_counter() - start
time_tot.append(elapsed*1e3)
print('%-8.4g'%numpy.mean(time_tot), end=' ')
print()
if __name__ == '__main__':
os.system('cls')
RunTest()
EDIT
在@PeterCordes comments 之后,我将优化标志更改为:
MSVC_EXTRA_COMPILE_ARGS = ['/O2', '/GS-', '/fp:fast', '/Ob2', '/nologo', '/arch:AVX512', '/Ot', '/GL', '/QIntel-jcc-erratum']
GCC_EXTRA_COMPILE_ARGS = ['-Ofast', '-funroll-loops', '-flto', '-ftree-vectorize', '-march=native', '-fno-asynchronous-unwind-tables', '-Wa,-mbranches-within-32B-boundaries']
MSVC似乎比以前略快(在5%到10%之间),但GCC12和GCC13比以前慢(在3%到20%之间).下面是一个带有最大2D矩阵结果的图表:
Note:"当前"表示@PeterCordes建议的最新优化标志,"之前"是原始标志集.