initial commit

This commit is contained in:
2026-04-12 22:20:18 +08:00
commit 190c2edbb2
155 changed files with 36314 additions and 0 deletions

View File

@@ -0,0 +1,14 @@
# Build settings for the inner_product benchmark; this fragment ends by including bsp/common.mk.
TARGET = inner_product
CFLAGS += -O3 -g
# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation.
CFLAGS += -DSIMU=0
C_SRCS := $(wildcard ./*.c )
OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
include ../../../bsp/common.mk

View File

@@ -0,0 +1,446 @@
/*
Copyright 2008 Adobe Systems Incorporated
Copyright 2018-2019 Chris Cox
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html )
Goal: Test performance of various idioms for calculating the inner product of two sequences.
NOTE: Inner products are common in mathematical and geometry processing applications,
plus some audio and image processing.
Assumptions:
1) The compiler will optimize inner product operations.
2) The compiler may recognize inefficient inner product idioms
and substitute efficient methods when it can.
NOTE: the best method is highly dependent on the data types and CPU architecture
3) std::inner_product will be well optimized for all types and containers.
*/
/******************************************************************************/
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
/******************************************************************************/
/******************************************************************************/
// Global variables required by the BSP (board support package)
unsigned long UART_BASE = 0xbfe001e0; // virtual address of the UART16550
unsigned long CONFREG_UART_BASE = 0xbfafff10; // virtual address of the CONFREG-emulated UART
unsigned long CONFREG_TIMER_BASE = 0xbfafe000; // virtual address of the CONFREG counter
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L; // CONFREG clock frequency
unsigned long CORE_CLOCKS_PER_SEC = 33000000L; // processor core clock frequency
// clock() samples taken immediately before and after each timed region
clock_t start_time, end_time;
// this constant may need to be adjusted to give reasonable minimum times
// For best results, times should be about 1.0 seconds for the minimum test run
int iterations = 5;
// 8000 items, or between 8 and 64k of data
// this is intended to remain within the L2 cache of most common CPUs
const int SIZE = 8000;
// initial values for filling our arrays, one per element width
// (the _f16 value is used for the float test and the _f32 value for the double test)
int32_t init_value_8 = 3;
int32_t init_value_16 = 211;
int32_t init_value_32 = 1065;
float init_value_f16 = 5.0;
double init_value_f32 = 365.0;
/******************************************************************************/
/******************************************************************************/
/* Store `value` into every element of the half-open range [first, last). */
void fill_8(int8_t * first, int8_t * last, int8_t value) {
    for (int8_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/* Store `value` into every element of the half-open range [first, last). */
void fill_u8(uint8_t * first, uint8_t * last, uint8_t value) {
    for (uint8_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/* Store `value` into every element of the half-open range [first, last). */
void fill_16(int16_t * first, int16_t * last, int16_t value) {
    for (int16_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/* Store `value` into every element of the half-open range [first, last). */
void fill_u16(uint16_t * first, uint16_t * last, uint16_t value) {
    for (uint16_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/* Store `value` into every element of the half-open range [first, last). */
void fill_32(int32_t * first, int32_t * last, int32_t value) {
    for (int32_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/* Store `value` into every element of the half-open range [first, last). */
void fill_u32(uint32_t * first, uint32_t * last, uint32_t value) {
    for (uint32_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/* Store `value` into every float of the half-open range [first, last). */
void fill_f16(float * first, float * last, float value) {
    for (float *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/* Store `value` into every double of the half-open range [first, last). */
void fill_f32(double * first, double * last, double value) {
    for (double *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/******************************************************************************/
/******************************************************************************/
// a trivial for loop
/*
 * Times `iterations` passes of a plain-loop int8_t inner product over
 * first[0..count) and second[0..count), checks each pass against the value
 * expected when both arrays hold init_value_8, and prints the elapsed time.
 *
 * first, second: input sequences with at least `count` elements each.
 * count:         number of element pairs to multiply and accumulate.
 * label:         test name used in the failure message and the timing line.
 */
void test_inner_product_8( const int8_t* first, const int8_t* second, const size_t count, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        int8_t sum = 0;
        for (size_t j = 0; j < count; ++j) {
            sum += first[j] * second[j];
        }
        // Expected result, narrowed to int8_t exactly like the accumulator.
        // It is derived from `count` (not the global SIZE) so the check stays
        // correct for whatever length the caller passes.
        const int8_t target = (int8_t)((int8_t)(init_value_8) * (int8_t)(init_value_8) * count);
        if (sum != target)
            printf("test %s failed\n", label);
    }
    end_time = clock();

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    // Explicit casts: the types of CLOCKS_PER_SEC and size_t need not match
    // the %ld / %lu conversions on every target.
    printf("CLOCKS_PER_SEC=%ld\n", (long)(CLOCKS_PER_SEC));
    printf("\"%s, %lu items\" %f sec\n",
           label,
           (unsigned long)count,
           time_cost);
}
/*
 * Times `iterations` passes of a plain-loop uint8_t inner product over
 * first[0..count) and second[0..count), checks each pass against the value
 * expected when both arrays hold init_value_8, and prints the elapsed time.
 *
 * first, second: input sequences with at least `count` elements each.
 * count:         number of element pairs to multiply and accumulate.
 * label:         test name used in the failure message and the timing line.
 */
void test_inner_product_u8( const uint8_t* first, const uint8_t* second, const size_t count, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        uint8_t sum = 0;
        for (size_t j = 0; j < count; ++j) {
            sum += first[j] * second[j];
        }
        // Expected result, reduced modulo 256 exactly like the accumulator.
        const uint8_t target = (uint8_t)((uint8_t)(init_value_8) * (uint8_t)(init_value_8) * count);
        // Compare for inequality: `sum - target` is evaluated in int after
        // promotion, so a "> 0" test would silently accept sum < target.
        if (sum != target)
            printf("test %s failed\n", label);
    }
    end_time = clock();

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    // size_t is not necessarily unsigned long; cast to match %lu.
    printf("\"%s, %lu items\" %f sec\n",
           label,
           (unsigned long)count,
           time_cost);
}
/*
 * Times `iterations` passes of a plain-loop int16_t inner product over
 * first[0..count) and second[0..count), checks each pass against the value
 * expected when both arrays hold init_value_16, and prints the elapsed time.
 *
 * first, second: input sequences with at least `count` elements each.
 * count:         number of element pairs to multiply and accumulate.
 * label:         test name used in the failure message and the timing line.
 */
void test_inner_product_16( const int16_t* first, const int16_t* second, const size_t count, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        int16_t sum = 0;
        for (size_t j = 0; j < count; ++j) {
            sum += first[j] * second[j];
        }
        // Expected result, narrowed to int16_t exactly like the accumulator.
        // Multiplying by the unsigned `count` keeps the intermediate product
        // free of signed overflow and ties the check to the actual length.
        const int16_t target = (int16_t)((int16_t)(init_value_16) * (int16_t)(init_value_16) * count);
        if (sum != target)
            printf("test %s failed\n", label);
    }
    end_time = clock();

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    // size_t is not necessarily unsigned long; cast to match %lu.
    printf("\"%s, %lu items\" %f sec\n",
           label,
           (unsigned long)count,
           time_cost);
}
/*
 * Times `iterations` passes of a plain-loop uint16_t inner product over
 * first[0..count) and second[0..count), checks each pass against the value
 * expected when both arrays hold init_value_16, and prints the elapsed time.
 *
 * first, second: input sequences with at least `count` elements each.
 * count:         number of element pairs to multiply and accumulate.
 * label:         test name used in the failure message and the timing line.
 */
void test_inner_product_u16( const uint16_t* first, const uint16_t* second, const size_t count, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        uint16_t sum = 0;
        for (size_t j = 0; j < count; ++j) {
            sum += first[j] * second[j];
        }
        // Expected result, reduced modulo 65536 exactly like the accumulator.
        const uint16_t target = (uint16_t)((uint16_t)(init_value_16) * (uint16_t)(init_value_16) * count);
        // Compare for inequality: `sum - target` is evaluated in int after
        // promotion, so a "> 0" test would silently accept sum < target.
        if (sum != target)
            printf("test %s failed\n", label);
    }
    end_time = clock();

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    // size_t is not necessarily unsigned long; cast to match %lu.
    printf("\"%s, %lu items\" %f sec\n",
           label,
           (unsigned long)count,
           time_cost);
}
/*
 * Times `iterations` passes of a plain-loop int32_t inner product over
 * first[0..count) and second[0..count), checks each pass against the value
 * expected when both arrays hold init_value_32, and prints the elapsed time.
 *
 * The running sum is expected to exceed the int32_t range (with the default
 * 1065 fill value, 1065*1065*8000 is far above INT32_MAX), and signed overflow
 * is undefined behaviour in C. The accumulation and the expected value are
 * therefore computed in uint32_t, where wraparound is well defined; the
 * comparison is done on the wrapped 32-bit patterns.
 *
 * first, second: input sequences with at least `count` elements each.
 * count:         number of element pairs to multiply and accumulate.
 * label:         test name used in the failure message and the timing line.
 */
void test_inner_product_32( const int32_t* first, const int32_t* second, const size_t count, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        uint32_t sum = 0;
        for (size_t j = 0; j < count; ++j) {
            sum += (uint32_t)first[j] * (uint32_t)second[j];
        }
        // Expected result modulo 2^32, derived from the actual element count.
        const uint32_t fill_value = (uint32_t)(init_value_32);
        const uint32_t target = fill_value * fill_value * (uint32_t)count;
        if (sum != target)
            printf("test %s failed\n", label);
    }
    end_time = clock();

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    // size_t is not necessarily unsigned long; cast to match %lu.
    printf("\"%s, %lu items\" %f sec\n",
           label,
           (unsigned long)count,
           time_cost);
}
/*
 * Times `iterations` passes of a plain-loop uint32_t inner product over
 * first[0..count) and second[0..count), checks each pass against the value
 * expected when both arrays hold init_value_32, and prints the elapsed time.
 *
 * first, second: input sequences with at least `count` elements each.
 * count:         number of element pairs to multiply and accumulate.
 * label:         test name used in the failure message and the timing line.
 */
void test_inner_product_u32( const uint32_t* first, const uint32_t* second, const size_t count, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        uint32_t sum = 0;
        for (size_t j = 0; j < count; ++j) {
            sum += first[j] * second[j];
        }
        // Expected result modulo 2^32, derived from the actual element count
        // rather than the global SIZE.
        const uint32_t fill_value = (uint32_t)(init_value_32);
        const uint32_t target = fill_value * fill_value * (uint32_t)count;
        if (sum != target)
            printf("test %s failed\n", label);
    }
    end_time = clock();

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    // size_t is not necessarily unsigned long; cast to match %lu.
    printf("\"%s, %lu items\" %f sec\n",
           label,
           (unsigned long)count,
           time_cost);
}
/*
 * Times `iterations` passes of a plain-loop float inner product over
 * first[0..count) and second[0..count), checks each pass against the value
 * expected when both arrays hold init_value_f16 (tolerance 1.0e-6), and
 * prints the elapsed time.
 *
 * first, second: input sequences with at least `count` elements each.
 * count:         number of element pairs to multiply and accumulate.
 * label:         test name used in the failure message and the timing line.
 */
void test_inner_product_f16( const float* first, const float* second, const size_t count, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        float sum = 0;
        for (size_t j = 0; j < count; ++j) {
            sum += first[j] * second[j];
        }
        // Expected result for the actual element count (not the global SIZE).
        const float target = (float)(init_value_f16) * (float)(init_value_f16) * (float)count;
        if (fabsf(sum - target) > (float)(1.0e-6))
            printf("test %s failed\n", label);
    }
    end_time = clock();

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    // size_t is not necessarily unsigned long; cast to match %lu.
    printf("\"%s, %lu items\" %f sec\n",
           label,
           (unsigned long)count,
           time_cost);
}
/*
 * Times `iterations` passes of a plain-loop double inner product over
 * first[0..count) and second[0..count), checks each pass against the value
 * expected when both arrays hold init_value_f32 (tolerance 1.0e-6), and
 * prints the elapsed time.
 *
 * first, second: input sequences with at least `count` elements each.
 * count:         number of element pairs to multiply and accumulate.
 * label:         test name used in the failure message and the timing line.
 */
void test_inner_product_f32( const double* first, const double* second, const size_t count, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        double sum = 0;
        for (size_t j = 0; j < count; ++j) {
            sum += first[j] * second[j];
        }
        // Expected result for the actual element count (not the global SIZE).
        const double target = (double)(init_value_f32) * (double)(init_value_f32) * (double)count;
        if (fabs(sum - target) > (double)(1.0e-6))
            printf("test %s failed\n", label);
    }
    end_time = clock();

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    // size_t is not necessarily unsigned long; cast to match %lu.
    printf("\"%s, %lu items\" %f sec\n",
           label,
           (unsigned long)count,
           time_cost);
}
/******************************************************************************/
/******************************************************************************/
// NOTE - can't make generic template template argument without C++17
// I would like to have TestOneFunction to handle all the types and if's, but need to use different types with it inside
// see sum_sequence.cpp
/*
 * Runs the int8_t inner-product benchmark on two SIZE-element local arrays,
 * both filled with init_value_8.
 */
void TestOneType_8()
{
    int8_t lhs[SIZE];
    int8_t rhs[SIZE];
    const int8_t seed = (int8_t)(init_value_8);

    fill_8(lhs, lhs + SIZE, seed);
    fill_8(rhs, rhs + SIZE, seed);
    test_inner_product_8(lhs, rhs, SIZE, "int_8 inner_product1 to int_8");
}
/*
 * Runs the uint8_t inner-product benchmark on two SIZE-element local arrays,
 * both filled with init_value_8.
 */
void TestOneType_u8()
{
    uint8_t lhs[SIZE];
    uint8_t rhs[SIZE];
    const uint8_t seed = (uint8_t)(init_value_8);

    fill_u8(lhs, lhs + SIZE, seed);
    fill_u8(rhs, rhs + SIZE, seed);
    test_inner_product_u8(lhs, rhs, SIZE, "uint_8 inner_product1 to uint_8");
}
/*
 * Runs the int16_t inner-product benchmark on two SIZE-element local arrays,
 * both filled with init_value_16.
 */
void TestOneType_16()
{
    int16_t lhs[SIZE];
    int16_t rhs[SIZE];
    const int16_t seed = (int16_t)(init_value_16);

    fill_16(lhs, lhs + SIZE, seed);
    fill_16(rhs, rhs + SIZE, seed);
    test_inner_product_16(lhs, rhs, SIZE, "int_16 inner_product1 to int_16");
}
/*
 * Runs the uint16_t inner-product benchmark on two SIZE-element local arrays,
 * both filled with init_value_16.
 */
void TestOneType_u16()
{
    uint16_t lhs[SIZE];
    uint16_t rhs[SIZE];
    const uint16_t seed = (uint16_t)(init_value_16);

    fill_u16(lhs, lhs + SIZE, seed);
    fill_u16(rhs, rhs + SIZE, seed);
    test_inner_product_u16(lhs, rhs, SIZE, "uint_16 inner_product1 to uint_16");
}
/*
 * Runs the int32_t inner-product benchmark on two SIZE-element local arrays,
 * both filled with init_value_32.
 */
void TestOneType_32()
{
    int32_t lhs[SIZE];
    int32_t rhs[SIZE];
    const int32_t seed = (int32_t)(init_value_32);

    fill_32(lhs, lhs + SIZE, seed);
    fill_32(rhs, rhs + SIZE, seed);
    test_inner_product_32(lhs, rhs, SIZE, "int_32 inner_product1 to int_32");
}
/*
 * Runs the uint32_t inner-product benchmark on two SIZE-element local arrays,
 * both filled with init_value_32.
 */
void TestOneType_u32()
{
    uint32_t lhs[SIZE];
    uint32_t rhs[SIZE];
    const uint32_t seed = (uint32_t)(init_value_32);

    fill_u32(lhs, lhs + SIZE, seed);
    fill_u32(rhs, rhs + SIZE, seed);
    test_inner_product_u32(lhs, rhs, SIZE, "uint_32 inner_product1 to uint_32");
}
/*
 * Runs the float inner-product benchmark on two SIZE-element local arrays,
 * both filled with init_value_f16.
 */
void TestOneType_f16()
{
    float lhs[SIZE];
    float rhs[SIZE];
    const float seed = (float)(init_value_f16);

    fill_f16(lhs, lhs + SIZE, seed);
    fill_f16(rhs, rhs + SIZE, seed);
    test_inner_product_f16(lhs, rhs, SIZE, "float inner_product1 to float");
}
/*
 * Runs the double inner-product benchmark on two SIZE-element local arrays,
 * both filled with init_value_f32.
 */
void TestOneType_f32()
{
    double lhs[SIZE];
    double rhs[SIZE];
    const double seed = (double)(init_value_f32);

    fill_f32(lhs, lhs + SIZE, seed);
    fill_f32(rhs, rhs + SIZE, seed);
    test_inner_product_f32(lhs, rhs, SIZE, "double inner_product1 to double");
}
/******************************************************************************/
/******************************************************************************/
/*
 * Entry point of the inner-product benchmark.
 *
 * argv[1], when present, replaces the default number of timed passes
 * (`iterations`). Each element type is then benchmarked once, in order of
 * increasing width: 8/16/32-bit integers (signed, then unsigned), float,
 * double. Always returns 0.
 */
int main(int argc, char** argv) {
    if (argc > 1) iterations = atoi(argv[1]);

    TestOneType_8();
    TestOneType_u8();
    TestOneType_16();
    TestOneType_u16();
    TestOneType_32();
    TestOneType_u32();
    TestOneType_f16();
    TestOneType_f32();
    return 0;
}
// the end
/******************************************************************************/
/******************************************************************************/

View File

@@ -0,0 +1,14 @@
# Build settings for the lookup_table benchmark; this fragment ends by including bsp/common.mk.
TARGET = lookup_table
CFLAGS += -O3 -g
# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation.
CFLAGS += -DSIMU=0
C_SRCS := $(wildcard ./*.c )
OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
include ../../../bsp/common.mk

View File

@@ -0,0 +1,315 @@
/*
Copyright 2008-2009 Adobe Systems Incorporated
Copyright 2018 Chris Cox
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html )
Goal: Test performance of various idioms and optimizations for lookup tables.
Assumptions:
1) The compiler will optimize lookup table operations.
Unrolling will usually be needed to hide read latencies.
2) The compiler should recognize inefficient lookup table idioms and substitute efficient methods.
Many different CPU architecture issues will require reading and writing words for best performance.
CPUs with...
cache write-back/write-combine delays.
store forwarding delays.
slow cache access relative to shifts/masks.
slow partial word (byte) access.
fast shift/mask operations.
On some CPUs, a lookup can be handled with vector instructions.
On some CPUs, special cache handling is needed (especially 2way caches).
TODO - lookup and interpolate (int16_t, int32_t, int64_t, float, double)
TODO - 2D and 3D LUTs, simple and interpolated
*/
/******************************************************************************/
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
/******************************************************************************/
/******************************************************************************/
// Global variables required by the BSP (board support package)
unsigned long UART_BASE = 0xbfe001e0; // virtual address of the UART16550
unsigned long CONFREG_UART_BASE = 0xbfafff10; // virtual address of the CONFREG-emulated UART
unsigned long CONFREG_TIMER_BASE = 0xbfafe000; // virtual address of the CONFREG counter
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L; // CONFREG clock frequency
unsigned long CORE_CLOCKS_PER_SEC = 33000000L; // processor core clock frequency
// clock() samples taken immediately before and after each timed region
clock_t start_time, end_time;
// this constant may need to be adjusted to give reasonable minimum times
// For best results, times should be about 1.0 seconds for the minimum test run
// base_iterations is the configured pass count; iterations is the count the
// test functions actually loop over (main rescales it for the large tests)
int base_iterations = 1;
int iterations = 1;
// 2000 items, or about 2..4k of data (uint8_t vs uint16_t elements)
// this is intended to remain within the L1 cache of most common CPUs
#define SIZE_SMALL 2000
// originally about 0.5..1M of data;
// the target does not have that much memory, so 50KB to 100KB is used here
// this is intended to be outside the L2 cache of most common CPUs
#define SIZE 50000
// initial value for filling our arrays, may be changed from the command line
int32_t init_value = 3;
/******************************************************************************/
// our global arrays of numbers
uint8_t inputData8[SIZE];
uint8_t resultData8[SIZE];
uint16_t inputData16[SIZE];
uint16_t resultData16[SIZE];
/******************************************************************************/
/******************************************************************************/
/* Store `value` into every byte of the half-open range [first, last). */
void fill_8(uint8_t * first, uint8_t * last, uint8_t value) {
    for (uint8_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/* Store `value` into every element of the half-open range [first, last). */
void fill_16(uint16_t * first, uint16_t * last, uint16_t value) {
    for (uint16_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/*
 * Reseeds rand() with init_value + 123, then stores successive rand()
 * results, truncated to uint8_t, into [first, last).
 */
void fill_random_8(uint8_t * first, uint8_t * last) {
    const unsigned int seed = (unsigned int)init_value + 123;
    srand(seed);
    for (uint8_t *cursor = first; cursor != last; ++cursor) {
        *cursor = (uint8_t)rand();
    }
}
/*
 * Reseeds rand() with init_value + 123, then stores successive rand()
 * results, truncated to uint16_t, into [first, last).
 */
void fill_random_16(uint16_t * first, uint16_t * last) {
    const unsigned int seed = (unsigned int)init_value + 123;
    srand(seed);
    for (uint16_t *cursor = first; cursor != last; ++cursor) {
        *cursor = (uint16_t)rand();
    }
}
/* Returns the larger of a and b (b when they are equal). */
int max(int a, int b){
    return (a > b) ? a : b;
}
/******************************************************************************/
/******************************************************************************/
// baseline - a trivial loop
/*
 * Times `iterations` passes of result[j] = LUT[input[j]] over `count`
 * uint8_t elements, then checks that every result equals init_value
 * (reporting only the first mismatch) and prints the elapsed time.
 *
 * input:  source values, used as table indices.
 * result: destination; main also calls this with result == input (in place).
 * count:  number of elements to translate.
 * LUT:    table indexed by the uint8_t input values.
 * label:  test name used in the failure message and the timing line.
 */
void test_lut1_u8(const uint8_t* input, uint8_t *result, const int count, const uint8_t* LUT, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        for (int idx = 0; idx < count; ++idx) {
            result[idx] = LUT[ input[idx] ];
        }
    }
    end_time = clock();

    // verification happens outside the timed region
    const uint8_t expected = (uint8_t)(init_value);
    int pos = 0;
    while (pos < count && result[pos] == expected) {
        ++pos;
    }
    if (pos < count) {
        printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[pos]), (unsigned)(init_value));
    }

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    printf("\"%s, %d times\" %f sec\n", label, count, time_cost);
}
/*
 * Times `iterations` passes of result[j] = LUT[input[j]] over `count`
 * int8_t elements, then checks that every result equals init_value
 * (reporting only the first mismatch) and prints the elapsed time.
 *
 * input:  source values, used as table indices; being signed, they can
 *         index below LUT.
 * result: destination; main also calls this with result == input (in place).
 * count:  number of elements to translate.
 * LUT:    table pointer; main passes the midpoint of a 256-entry table so
 *         that negative indices stay inside it.
 * label:  test name used in the failure message and the timing line.
 */
void test_lut1_8(const int8_t* input, int8_t *result, const int count, const int8_t* LUT, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        for (int idx = 0; idx < count; ++idx) {
            result[idx] = LUT[ input[idx] ];
        }
    }
    end_time = clock();

    // verification happens outside the timed region
    const int8_t expected = (int8_t)(init_value);
    int pos = 0;
    while (pos < count && result[pos] == expected) {
        ++pos;
    }
    if (pos < count) {
        printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[pos]), (unsigned)(init_value));
    }

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    printf("\"%s, %d times\" %f sec\n", label, count, time_cost);
}
/*
 * Times `iterations` passes of result[j] = LUT[input[j]] over `count`
 * uint16_t elements, then checks that every result equals init_value
 * (reporting only the first mismatch) and prints the elapsed time.
 *
 * input:  source values, used as table indices.
 * result: destination; main also calls this with result == input (in place).
 * count:  number of elements to translate.
 * LUT:    table indexed by the uint16_t input values.
 * label:  test name used in the failure message and the timing line.
 */
void test_lut1_u16(const uint16_t* input, uint16_t *result, const int count, const uint16_t* LUT, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        for (int idx = 0; idx < count; ++idx) {
            result[idx] = LUT[ input[idx] ];
        }
    }
    end_time = clock();

    // verification happens outside the timed region
    const uint16_t expected = (uint16_t)(init_value);
    int pos = 0;
    while (pos < count && result[pos] == expected) {
        ++pos;
    }
    if (pos < count) {
        printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[pos]), (unsigned)(init_value));
    }

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    printf("\"%s, %d times\" %f sec\n", label, count, time_cost);
}
/*
 * Times `iterations` passes of result[j] = LUT[input[j]] over `count`
 * int16_t elements, then checks that every result equals init_value
 * (reporting only the first mismatch) and prints the elapsed time.
 *
 * input:  source values, used as table indices; being signed, they can
 *         index below LUT.
 * result: destination; main also calls this with result == input (in place).
 * count:  number of elements to translate.
 * LUT:    table pointer; main passes the midpoint of a 65536-entry table so
 *         that negative indices stay inside it.
 * label:  test name used in the failure message and the timing line.
 */
void test_lut1_16(const int16_t* input, int16_t *result, const int count, const int16_t* LUT, const char *label) {
    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        for (int idx = 0; idx < count; ++idx) {
            result[idx] = LUT[ input[idx] ];
        }
    }
    end_time = clock();

    // verification happens outside the timed region
    const int16_t expected = (int16_t)(init_value);
    int pos = 0;
    while (pos < count && result[pos] == expected) {
        ++pos;
    }
    if (pos < count) {
        printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[pos]), (unsigned)(init_value));
    }

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    printf("\"%s, %d times\" %f sec\n", label, count, time_cost);
}
/******************************************************************************/
/******************************************************************************/
int main(int argc, char** argv) {
// output command for documentation:
int i;
// for (i = 0; i < argc; ++i)
// printf("%s ", argv[i] );
// printf("\n");
if (argc > 1) base_iterations = atoi(argv[1]);
if (argc > 2) init_value = (int32_t) atoi(argv[2]);
uint8_t myLUT8[ 256 ];
uint16_t myLUT16[ 65536 ];
fill_8(myLUT8, myLUT8+256, (uint8_t)(init_value));
fill_16(myLUT16, myLUT16+65536, (uint16_t)(init_value));
fill_random_8( inputData8, inputData8+SIZE );
fill_random_16( inputData16, inputData16+SIZE );
// uint8_t
iterations = base_iterations;
test_lut1_u8( inputData8, inputData8, SIZE_SMALL, myLUT8, "uint8_t lookup table1 small inplace");
test_lut1_u8( inputData8, resultData8, SIZE_SMALL, myLUT8, "uint8_t lookup table1 small");
iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
test_lut1_u8( inputData8, inputData8, SIZE, myLUT8, "uint8_t lookup table1 large inplace");
test_lut1_u8( inputData8, resultData8, SIZE, myLUT8, "uint8_t lookup table1 large");
// int8_t
iterations = base_iterations;
test_lut1_8( (int8_t*)inputData8, (int8_t*)inputData8, SIZE_SMALL, (int8_t*)(myLUT8+128), "int8_t lookup table1 small inplace");
test_lut1_8( (int8_t*)inputData8, (int8_t*)resultData8, SIZE_SMALL, (int8_t*)(myLUT8+128), "int8_t lookup table1 small");
iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
test_lut1_8( (int8_t*)inputData8, (int8_t*)inputData8, SIZE, (int8_t*)(myLUT8+128), "int8_t lookup table1 large inplace");
test_lut1_8( (int8_t*)inputData8, (int8_t*)resultData8, SIZE, (int8_t*)(myLUT8+128), "int8_t lookup table1 large");
// uint16_t
iterations = base_iterations;
test_lut1_u16( inputData16, inputData16, SIZE_SMALL, myLUT16, "uint16_t lookup table1 small inplace");
test_lut1_u16( inputData16, resultData16, SIZE_SMALL, myLUT16, "uint16_t lookup table1 small");
iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
test_lut1_u16( inputData16, inputData16, SIZE, myLUT16, "uint16_t lookup table1 large inplace");
test_lut1_u16( inputData16, resultData16, SIZE, myLUT16, "uint16_t lookup table1 large");
// int16_t
iterations = base_iterations;
test_lut1_16( (int16_t*)inputData16, (int16_t*)inputData16, SIZE_SMALL, (int16_t*)(myLUT16+32768), "int16_t lookup table1 small inplace");
test_lut1_16( (int16_t*)inputData16, (int16_t*)resultData16, SIZE_SMALL, (int16_t*)(myLUT16+32768), "int16_t lookup table1 small");
iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
test_lut1_16( (int16_t*)inputData16, (int16_t*)inputData16, SIZE, (int16_t*)(myLUT16+32768), "int16_t lookup table1 large inplace");
test_lut1_16( (int16_t*)inputData16, (int16_t*)resultData16, SIZE, (int16_t*)(myLUT16+32768), "int16_t lookup table1 large");
return 0;
}
// the end
/******************************************************************************/
/******************************************************************************/

View File

@@ -0,0 +1,14 @@
# Build settings for the loop_induction benchmark; this fragment ends by including bsp/common.mk.
TARGET = loop_induction
CFLAGS += -O3 -g
# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation.
CFLAGS += -DSIMU=0
C_SRCS := $(wildcard ./*.c )
OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
include ../../../bsp/common.mk

View File

@@ -0,0 +1,131 @@
/*
Copyright 2018 Chris Cox
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html )
Goal: Examine performance optimizations related to loop induction variables.
Assumptions:
1) The compiler will normalize all loop types and optimize all equally.
(this is a necessary step before doing induction variable analysis)
2) The compiler will remove unused induction variables.
This could happen due to several optimizations.
3) The compiler will recognize induction variables with linear relations (x = a*b + c)
and optimize out redundant variables.
4) The compiler will apply strength reduction to induction variable usage.
5) The compiler will remove bounds checks by recognizing or adjusting loop limits.
(can be an explicit loop optimization, or part of range propagation)
*/
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
/******************************************************************************/
// Global variables required by the BSP (board support package)
unsigned long UART_BASE = 0xbfe001e0; // virtual address of the UART16550
unsigned long CONFREG_UART_BASE = 0xbfafff10; // virtual address of the CONFREG-emulated UART
unsigned long CONFREG_TIMER_BASE = 0xbfafe000; // virtual address of the CONFREG counter
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L; // CONFREG clock frequency
unsigned long CORE_CLOCKS_PER_SEC = 33000000L; // processor core clock frequency
// clock() samples taken immediately before and after each timed region
clock_t start_time, end_time;
/******************************************************************************/
// this constant may need to be adjusted to give reasonable minimum times
// For best results, times should be about 1.0 seconds for the minimum test run
int iterations = 10;
// 32000 items, or about 128k of data
// this is intended to remain within the L2 cache of most common CPUs
const int SIZE = 32000;
// initial value for filling our arrays, may be changed from the command line
// (main uses it, plus 123, as the srand() seed)
int init_value = 3;
/******************************************************************************/
/*
 * Stores successive rand() results into every element of [first, last).
 * The generator is not reseeded here.
 */
void fill_random(int32_t * first, int32_t * last) {
    for (int32_t *cursor = first; cursor != last; ++cursor) {
        *cursor = (int32_t)rand();
    }
}
/******************************************************************************/
/******************************************************************************/
/*
 * Times `iterations` passes of an element-by-element copy from source to
 * dest that is driven by three separate loop counters, then compares the two
 * buffers with memcmp and prints the elapsed time.
 *
 * source: array to copy from, at least `count` elements.
 * dest:   array to copy into, at least `count` elements; overwritten with
 *         rand() values before timing starts.
 * count:  number of int32_t elements to copy.
 * label:  test name used in the failure message and the timing line.
 */
void test_copy(const int32_t *source, int32_t *dest, int count, const char *label) {
    // presumably so that a copy loop that does nothing is caught by the
    // comparison below
    fill_random( dest, dest+count );

    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        // three counters advancing in lock-step: the redundancy is the idiom
        // under test
        int dst_idx = 0;
        int src_idx = 0;
        int done = 0;
        for ( ; done < count; ++dst_idx, ++src_idx, ++done ) {
            dest[dst_idx] = source[src_idx];
        }
    }
    end_time = clock();

    const int mismatch = memcmp(dest, source, count*sizeof(int32_t));
    if (mismatch != 0) {
        printf("test %s failed\n", label);
    }

    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    printf("\"%s, %d items\" %f sec\n", label, count, time_cost);
}
/******************************************************************************/
/******************************************************************************/
/*
 * Entry point of the loop-induction benchmark.
 *
 * argv[1], when present, replaces the default number of timed passes
 * (`iterations`); argv[2], when present, replaces init_value, which seeds the
 * random generator. A SIZE-element source array is filled with random data and
 * copied into a second array by test_copy(). Always returns 0.
 */
int main(int argc, char** argv) {
    if (argc > 1) iterations = atoi(argv[1]);
    if (argc > 2) init_value = (int) atoi(argv[2]);

    // NOTE(review): SIZE is a `const int`, so these are variable-length
    // arrays on the stack (2 x SIZE x 4 bytes); confirm the target's stack is
    // large enough.
    int32_t intSrc[ SIZE ];
    int32_t intDst[ SIZE ];

    srand( (unsigned int)init_value + 123);
    fill_random( intSrc, intSrc+SIZE );
    test_copy( &intSrc[0], &intDst[0], SIZE, "int32_t for induction copy" );
    return 0;
}
// the end
/******************************************************************************/
/******************************************************************************/

View File

@@ -0,0 +1,14 @@
# Build settings for the memcmp benchmark; this fragment ends by including bsp/common.mk.
TARGET = memcmp
CFLAGS += -O3 -g
# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation.
CFLAGS += -DSIMU=0
C_SRCS := $(wildcard ./*.c )
OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
include ../../../bsp/common.mk

View File

@@ -0,0 +1,177 @@
/*
Copyright 2008-2009 Adobe Systems Incorporated
Copyright 2018 Chris Cox
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html )
Goal: Test compiler optimizations related to memcmp and hand coded memcmp loops.
Assumptions:
1) The compiler will recognize memcmp like loops and optimize appropriately.
This could be substitution of calls to memcmp,
or it could be just optimizing the loop to get the best throughput.
On modern systems, cache hinting is usually required for best throughput.
2) The library function memcmp should be optimized for small, medium, and large buffers.
ie: low overhead for smaller buffer, highly hinted for large buffers.
3) The STL functions equal and mismatch should be optimized for small, medium, and large buffers.
ie: low overhead for smaller buffers, highly hinted for large buffers.
NOTE - on some OSes, memcmp calls into the VM system to test for shared pages
thus running faster than the DRAM bandwidth would allow on large arrays
However, on those OSes, calling memcmp can hit mutexes and slow down
significantly when called from threads.
NOTE - Linux memcmp returns 0, +-1 instead of the actual difference
NOTE - and sometimes Linux memcmp returns 0, +-256 instead of the actual difference
TODO - test performance of unaligned buffers
*/
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
/******************************************************************************/
// Global variables required by the BSP (board support package)
unsigned long UART_BASE = 0xbfe001e0; // virtual address of the UART16550
unsigned long CONFREG_UART_BASE = 0xbfafff10; // virtual address of the CONFREG-emulated UART
unsigned long CONFREG_TIMER_BASE = 0xbfafe000; // virtual address of the CONFREG counter
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L; // CONFREG clock frequency
unsigned long CORE_CLOCKS_PER_SEC = 33000000L; // processor core clock frequency
// clock() samples taken immediately before and after each timed region
clock_t start_time, end_time;
// this constant may need to be adjusted to give reasonable minimum times
// For best results, times should be about 1.0 seconds for the minimum test run
int iterations = 1;
// Buffer sizes in bytes.
// The large size was meant to be bigger than the L2 cache of common CPUs
// (the commented-out value below is 3 MB) and needs to be divisible by 8;
// the target does not have that much memory, so 30KB is used instead.
#define SIZE_4K 4096
// #define SIZE_3M 3145728
#define SIZE_3M 30720
// initial value for filling our arrays, may be changed from the command line
uint8_t init_value = 3;
/******************************************************************************/
/******************************************************************************/
/* Store `value` into every byte of the half-open range [first, last). */
void fill(uint8_t * first, uint8_t * last, uint8_t value) {
    for (uint8_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
/*
 * Byte-wise comparison of two buffers written as a plain for loop.
 *
 * first, second: buffers to compare, each at least `bytes` long.
 * bytes:         number of bytes to compare.
 *
 * Returns 0 when the first `bytes` bytes are identical; otherwise the
 * difference (first minus second, as unsigned byte values) at the first
 * position where they differ.
 */
int forloop_memcmp( const void *first, const void *second, size_t bytes ){
    const uint8_t *first_byte = (const uint8_t *)first;
    const uint8_t *second_byte = (const uint8_t *)second;
    // size_t index: matches the type of `bytes`, so there is no signed/unsigned
    // comparison and no int overflow for very large buffers
    for (size_t x = 0; x < bytes; ++x) {
        if (first_byte[x] != second_byte[x]) {
            return (first_byte[x] - second_byte[x]);
        }
    }
    return 0;
}
/******************************************************************************/
/******************************************************************************/
/*
 * Times `iterations` calls of forloop_memcmp() on the first `count` bytes of
 * the two buffers and reports every call whose outcome is unexpected. The
 * timestamps are left in start_time / end_time for the caller to print.
 *
 * first, second:   buffers to compare.
 * count:           number of uint8_t elements to compare.
 * expected_result: true when the buffers are expected to differ.
 */
void test_memcmp(const uint8_t *first, const uint8_t *second, int count, bool expected_result) {
    const int bytes = count * sizeof(uint8_t);

    start_time = clock();
    for (int pass = 0; pass < iterations; ++pass) {
        // sigh, Linux memcmp is wonky - some return 1, some return 256
        // (hence only "zero / non-zero" is looked at)
        const bool differs = forloop_memcmp( first, second, bytes ) != 0;
        // moving this test out of the loop causes unwanted overoptimization
        if ( differs != expected_result ) {
            printf("test %s by %d failed (got %d instead of %d)\n", "for loop compare", count, (int)differs, (int)expected_result );
        }
    }
    end_time = clock();
}
/******************************************************************************/
/*
 * Runs test_memcmp() on the first `max_count` elements of the two buffers and
 * prints one summary line: byte count, whether the buffers were expected to
 * compare equal, and the elapsed time.
 *
 * first, second: buffers to compare.
 * max_count:     number of uint8_t elements to compare.
 * result:        true when the buffers are expected to differ; the summary
 *                line then prints "false", otherwise "true".
 */
void test_memcmp_sizes(const uint8_t *first, const uint8_t *second, int max_count, bool result) {
    test_memcmp( first, second, max_count, result);

    const int byte_total = max_count * sizeof(uint8_t);
    const char *verdict = result ? "false" : "true";
    const double time_cost = (end_time - start_time) / (double)(CLOCKS_PER_SEC);
    printf("\"%s %d bytes\" compare result: %s %f sec\n",
           "for loop compare",
           byte_total,
           verdict,
           time_cost);
}
/******************************************************************************/
/******************************************************************************/
// our global arrays of numbers to be operated upon
uint8_t data8u[SIZE_3M/sizeof(uint8_t)];
// number of extra bytes at the end of data8u_dest; must stay equal to the
// literal 1024 in the array declaration below
int alignment_pad = 1024;
uint8_t data8u_dest[SIZE_3M/sizeof(uint8_t) + 1024]; // leave some room for alignment testing
/******************************************************************************/
/******************************************************************************/
/*
 * Entry point of the memcmp benchmark.
 *
 * argv[1], when present, replaces the default number of timed passes
 * (`iterations`); argv[2], when present, replaces the fill byte init_value.
 * Both global buffers are filled with init_value. For the large size and then
 * the 4 KB size, the buffers are compared once while identical and once more
 * after the last byte of the compared range in data8u has been changed.
 * Always returns 0.
 */
int main(int argc, char** argv) {
    // element counts taken from the arrays themselves, so the fill bounds
    // cannot drift from the declarations
    const size_t src_len = sizeof data8u / sizeof data8u[0];
    const size_t dst_len = sizeof data8u_dest / sizeof data8u_dest[0];

    if (argc > 1) iterations = atoi(argv[1]);
    if (argc > 2) init_value = (uint8_t) atoi(argv[2]);

    fill( data8u, data8u + src_len, (uint8_t)(init_value) );
    fill( data8u_dest, data8u_dest + dst_len, (uint8_t)(init_value) );

    test_memcmp_sizes( data8u, data8u_dest, SIZE_3M/sizeof(uint8_t), false);
    data8u[(SIZE_3M/sizeof(uint8_t))-1] += 1; // last byte in the array
    test_memcmp_sizes( data8u, data8u_dest, SIZE_3M/sizeof(uint8_t), true);

    // the change above lies outside the first 4 KB, so this prefix is still equal
    test_memcmp_sizes( data8u, data8u_dest, SIZE_4K/sizeof(uint8_t), false);
    data8u[(SIZE_4K/sizeof(uint8_t))-1] += 1; // last byte of the 4 KB prefix
    test_memcmp_sizes( data8u, data8u_dest, SIZE_4K/sizeof(uint8_t), true);
    return 0;
}
// the end
/******************************************************************************/
/******************************************************************************/

View File

@@ -0,0 +1,14 @@
# Build settings for the minmax_sequence benchmark; this fragment ends by including bsp/common.mk.
TARGET = minmax_sequence
CFLAGS += -O3 -g
# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation.
CFLAGS += -DSIMU=0
C_SRCS := $(wildcard ./*.c )
OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
include ../../../bsp/common.mk

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,14 @@
# Build settings for the product_sequence benchmark; the rules themselves come
# from the shared bsp/common.mk included at the end.
TARGET = product_sequence
CFLAGS += -O3 -g
# Select the UART baud rate through the SIMU macro: 0 = running on the FPGA board, 1 = simulation
CFLAGS += -DSIMU=0
# Every .c file in this directory is compiled into the target.
C_SRCS := $(wildcard ./*.c )
OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
include ../../../bsp/common.mk

View File

@@ -0,0 +1,169 @@
/*
Copyright 2008 Adobe Systems Incorporated
Copyright 2019 Chris Cox
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
or a copy at http://stlab.adobe.com/licenses.html )
Goal: Test performance of various idioms for calculating the product of a sequence.
Assumptions:
1) The compiler will optimize product operations.
2) The compiler may recognize inefficient product idioms and substitute efficient methods.
*/
/******************************************************************************/
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdbool.h>
/******************************************************************************/
// Global variables required by the BSP (board support package)
unsigned long UART_BASE = 0xbfe001e0; // virtual address of the UART16550
unsigned long CONFREG_UART_BASE = 0xbfafff10; // virtual address of the CONFREG simulated UART
unsigned long CONFREG_TIMER_BASE = 0xbfafe000; // virtual address of the CONFREG counter
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L; // CONFREG clock frequency
unsigned long CORE_CLOCKS_PER_SEC = 33000000L; // processor core clock frequency
// clock() readings taken immediately before and after each timed run
clock_t start_time, end_time;
/******************************************************************************/
// this constant may need to be adjusted to give reasonable minimum times
// For best results, times should be about 1.0 seconds for the minimum test run
int iterations = 10;
// 4000 items, or about 32k of data
// this is intended to remain within the L2 cache of most common CPUs
const int SIZE = 4000;
// initial value for filling our arrays, may be changed from the command line
double init_value = 2.1;
/******************************************************************************/
/******************************************************************************/
/*
 * Store `value` into every float of the half-open range [first, last).
 * An empty range (first == last) writes nothing.
 */
void fill_f16(float * first, float * last, float value) {
float *cursor;
for (cursor = first; cursor != last; ++cursor) {
*cursor = value;
}
}
/*
 * Store `value` into every double of the half-open range [first, last).
 * An empty range (first == last) writes nothing.
 */
void fill_f32(double * first, double * last, double value) {
double *cursor;
for (cursor = first; cursor != last; ++cursor) {
*cursor = value;
}
}
/*
 * Time the plain running product of `count` floats.
 *
 * The product of first[0..count) is recomputed `iterations` times between two
 * clock() readings stored in the file-level start_time/end_time. Each product
 * is compared with pow(init_value, count), i.e. the check expects every
 * element to equal init_value; a mismatch prints "test <label> failed".
 * Afterwards the label, item count and elapsed seconds are printed.
 *
 * first: array of at least `count` floats to multiply.
 * count: number of elements to multiply.
 * label: name printed in the result and failure lines.
 *
 * NOTE(review): with this file's defaults (init_value 2.1, 4000 items) both
 * the product and the reference value appear to overflow to infinity, which
 * would make the difference NaN and the comparison always false - confirm
 * whether the check is meant to be effective.
 */
void testOneFunction_f16(const float* first, const int count, const char * label) {
// The reference value is the same for every iteration, so it is computed
// once and kept out of the timed region. It is derived from `count`, the
// number of factors actually multiplied, rather than from the global SIZE.
const double expected = pow(init_value, (double)count);

start_time = clock();
for (int i = 0; i < iterations; ++i) {
float result = (float)(1);
for (int j = 0; j < count; ++j) {
result = result * first[j];
}
if ( fabs( result - expected ) > 1.0e-6 )
printf("test %s failed\n", label);
}
end_time = clock();

double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
printf("\"%s, %d items\" %f sec\n",
label,
count,
time_cost);
}
/*
 * Time the plain running product of `count` doubles.
 *
 * The product of first[0..count) is recomputed `iterations` times between two
 * clock() readings stored in the file-level start_time/end_time. Each product
 * is compared with pow(init_value, count), i.e. the check expects every
 * element to equal init_value; a mismatch prints "test <label> failed".
 * Afterwards the label, item count and elapsed seconds are printed.
 *
 * first: array of at least `count` doubles to multiply.
 * count: number of elements to multiply.
 * label: name printed in the result and failure lines.
 *
 * NOTE(review): with this file's defaults (init_value 2.1, 4000 items) both
 * the product and the reference value appear to overflow to infinity, which
 * would make the difference NaN and the comparison always false - confirm
 * whether the check is meant to be effective.
 */
void testOneFunction_f32(const double* first, const int count, const char * label) {
// The reference value is the same for every iteration, so it is computed
// once and kept out of the timed region. It is derived from `count`, the
// number of factors actually multiplied, rather than from the global SIZE.
const double expected = pow(init_value, (double)count);

start_time = clock();
for (int i = 0; i < iterations; ++i) {
double result = (double)(1);
for (int j = 0; j < count; ++j) {
result = result * first[j];
}
if ( fabs( result - expected ) > 1.0e-6 )
printf("test %s failed\n", label);
}
end_time = clock();

double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
printf("\"%s, %d items\" %f sec\n",
label,
count,
time_cost);
}
/******************************************************************************/
/*
 * Run the float benchmark: build a local array of SIZE floats, set every
 * element to init_value (narrowed to float), and time its product.
 */
void TestOneType_f16()
{
float values[SIZE];
float * const values_end = values + SIZE;
const float seed = (float)(init_value);

fill_f16(values, values_end, seed);
testOneFunction_f16( values, SIZE, "float product sequence1" );
}
/*
 * Run the double benchmark: build a local array of SIZE doubles, set every
 * element to init_value, and time its product.
 */
void TestOneType_f32()
{
double values[SIZE];
double * const values_end = values + SIZE;
const double seed = (double)(init_value);

fill_f32(values, values_end, seed);
testOneFunction_f32( values, SIZE, "double product sequence1" );
}
/******************************************************************************/
/******************************************************************************/
/*
 * Entry point of the product-sequence benchmark.
 *
 * Optional arguments: argv[1] replaces `iterations` (parsed with atoi) and
 * argv[2] replaces `init_value` (parsed with atof). The float test runs
 * first, then the double test.
 */
int main(int argc, char** argv) {
if (argc >= 2) {
iterations = atoi(argv[1]);
// A third argument can only be present when the second one is.
if (argc >= 3) {
init_value = (double) atof(argv[2]);
}
}

TestOneType_f16();
TestOneType_f32();

return 0;
}
// the end
/******************************************************************************/
/******************************************************************************/