我的裸机程序手动调用memset()将一个完整的、对齐的4K页面清零,如下所示(我实际使用的不是uint64_t,而是另一种8字节的类型):

  uint64_t something[512] __attribute__((aligned(4096)));
  memset(something, 0x0, 4096);

我的编译命令大致如下:

%> /path/to/gcc-11.1.0/bin/aarch64-unknown-elf-gcc \
     -O3 \
     -std=gnu99 \
     -nostartfiles \
     --specs=nano.specs \
     -march=armv8.1-a \
     -Wl,--gc-sections \
     -Tlinker_script.lds \
     my_code.c \
     -o my_code.elf

当我反汇编并查看链接的memset()时,它是这个基本/通用/一次一个字节的实现:

000000004010399c <memset>:
      s = (char*)aligned_addr;
    }

#endif /* not PREFER_SIZE_OVER_SPEED */

  while (n--)
    4010399c:   d2800003    mov x3, #0x0                    // #0
    401039a0:   eb03005f    cmp x2, x3
    401039a4:   54000041    b.ne    401039ac <memset+0x10>  // b.any
    *s++ = (char) c;

  return m;
}
    401039a8:   d65f03c0    ret
    *s++ = (char) c;
    401039ac:   38236801    strb    w1, [x0, x3]
    401039b0:   91000463    add x3, x3, #0x1
    401039b4:   17fffffb    b   401039a0 <memset+0x4>

我期待一个使用stp或向量指令的aarch64优化版本.我的编译器有一个/path/to/gcc-11.1.0/newlib-nano子目录.

我已经尝试去掉--specs=nano.specs选项,也试过调整其他各种选项,但我不确定在这里还能做些什么……

How can I get the optimized memset() implementation?

注意,我使用/path/to/gcc-11.1.0/bin/aarch64-unknown-elf-ar x从许多不同的*.a文件中提取了libc_a.memset.o文件,但它们都是空的:/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc.a、/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libc_nano.a、/path/to/gcc-11.1.0/newlib-nano/aarch64-unknown-elf/lib/libg.a、/path/to/gcc-11.1.0/aarch64-unknown-elf/lib/libc.a,等等.提取出来的并不是逐字节的实现,而是空文件.我这样做是为了寻找一个好的实现,但我显然没弄明白这里发生了什么……

推荐答案

您可以在下面的链接中看到代码.您的库是在定义了PREFER_SIZE_OVER_SPEED(或者使用-Os,即定义了__OPTIMIZE_SIZE__)的情况下编译的,因此memset()中只剩下逐字节的循环.要得到优化版本,您需要在不定义这些宏的情况下重新编译您的库.

https://github.com/eblot/newlib/blob/master/newlib/libc/string/memset.c

/*
FUNCTION
    <<memset>>---set an area of memory
INDEX
    memset
ANSI_SYNOPSIS
    #include <string.h>
    void *memset(void *<[dst]>, int <[c]>, size_t <[length]>);
TRAD_SYNOPSIS
    #include <string.h>
    void *memset(<[dst]>, <[c]>, <[length]>)
    void *<[dst]>;
    int <[c]>;
    size_t <[length]>;
DESCRIPTION
    This function converts the argument <[c]> into an unsigned
    char and fills the first <[length]> characters of the array
    pointed to by <[dst]> to the value.
RETURNS
    <<memset>> returns the value of <[dst]>.
PORTABILITY
<<memset>> is ANSI C.
    <<memset>> requires no supporting OS subroutines.
QUICKREF
    memset ansi pure
*/

#include <string.h>

/* Number of bytes written by one word-sized (long) store. */
#define LBLOCKSIZE (sizeof(long))
/* Nonzero when address X is not a multiple of LBLOCKSIZE.  The mask form
   assumes sizeof(long) is a power of two.  */
#define UNALIGNED(X)   ((long)X & (LBLOCKSIZE - 1))
/* True when fewer than LBLOCKSIZE bytes remain, i.e. not even one
   word-sized store fits.  */
#define TOO_SMALL(LEN) ((LEN) < LBLOCKSIZE)

/* Set the first N bytes starting at M to the byte value (char) C and
   return M.

   _PTR, _DEFUN and _AND are not defined in this listing; presumably they
   are the library's prototype-compatibility macros and the definition
   expands to `void *memset (void *m, int c, size_t n)` -- confirm against
   the library's headers.

   The function has two parts:
     - a word-at-a-time fast path, compiled only when neither
       PREFER_SIZE_OVER_SPEED nor __OPTIMIZE_SIZE__ is defined;
     - a byte-at-a-time loop that is always compiled.  It finishes
       whatever the fast path left over, or does all of the work when
       the fast path is compiled out.  */
_PTR
_DEFUN (memset, (m, c, n),
    _PTR m _AND
    int c _AND
    size_t n)
{
  /* Byte cursor into the destination.  */
  char *s = (char *) m;

#if !defined(PREFER_SIZE_OVER_SPEED) && !defined(__OPTIMIZE_SIZE__)
  int i;
  unsigned long buffer;
  unsigned long *aligned_addr;
  unsigned int d = c & 0xff;    /* To avoid sign extension, copy C to an
                   unsigned variable.  */

  /* Store single bytes until S reaches a long-aligned address.  If N runs
     out before that, the whole request has been satisfied, so return.  */
  while (UNALIGNED (s))
    {
      if (n--)
        *s++ = (char) c;
      else
        return m;
    }

  if (!TOO_SMALL (n))
    {
      /* If we get this far, we know that n is large and s is word-aligned. */
      aligned_addr = (unsigned long *) s;

      /* Store D into each char sized location in BUFFER so that
         we can set large blocks quickly.  */
      /* D is replicated into 2 bytes, then into 4.  */
      buffer = (d << 8) | d;
      buffer |= (buffer << 16);
      /* For a long wider than 32 bits, keep doubling the filled width
         (32, 64, ...) until every byte of BUFFER holds D.  The loop body
         never runs when long is 32 bits wide.  */
      for (i = 32; i < LBLOCKSIZE * 8; i <<= 1)
        buffer = (buffer << i) | buffer;

      /* Unroll the loop.  */
      /* Four word stores per iteration while at least four words remain.  */
      while (n >= LBLOCKSIZE*4)
        {
          *aligned_addr++ = buffer;
          *aligned_addr++ = buffer;
          *aligned_addr++ = buffer;
          *aligned_addr++ = buffer;
          n -= 4*LBLOCKSIZE;
        }

      /* Then one word at a time for the remaining whole words.  */
      while (n >= LBLOCKSIZE)
        {
          *aligned_addr++ = buffer;
          n -= LBLOCKSIZE;
        }
      /* Pick up the remainder with a bytewise loop.  */
      s = (char*)aligned_addr;
    }

#endif /* not PREFER_SIZE_OVER_SPEED */

  /* Byte-at-a-time loop: the tail left by the fast path, or the entire
     fill when the fast path above was compiled out.  */
  while (n--)
    *s++ = (char) c;

  return m;
}

C++相关问答推荐

如果实际的syscall是CLONE(),那么为什么strace接受fork()呢?

如何解决C中的严格别名?

警告:C++中数组下标的类型为‘char’[-Wchar-subscripts]

变量 > -1 如何在C中准确求值?

如果我释放其他内容,返回值就会出错

二进制计算器与gmp

为什么该函数不将参数值保存到数据 struct 中?

tick.q中的Kdb+键控表语法

<unistd.h>和<sys/unistd.h>之间有什么区别?

将变量或参数打包到 struct /联合中是否会带来意想不到的性能损失?

用C++构建和使用DLL的困惑

将数字的每一位数平方,并使用C将它们连接为一个数字(程序不能正确处理0)

即使我在C++中空闲,也肯定会丢失内存

OMP并行嵌套循环

Leet代码运行时错误:代码不会在Leet代码上编译,而是在其他编译器中编译,如netbeans和在线编译器

c如何传递对 struct 数组的引用,而不是设置 struct 的副本

(GNU+Linux) 多个线程同时调用malloc()

创建 makefile 来编译位于不同目录中的多个源文件

C 中从 Unix 纪元时间转换的损坏

当 a 是代码块时使用逗号运算符 (a, b)