initial commit
This commit is contained in:
14
sdk/software/examples/c_prg/inner_product/Makefile
Normal file
14
sdk/software/examples/c_prg/inner_product/Makefile
Normal file
@@ -0,0 +1,14 @@
|
||||
# Build settings for the inner_product example; the shared rules come from
# common.mk in the BSP directory.
TARGET = inner_product

CFLAGS += -O3 -g

# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
CFLAGS += -DSIMU=0

C_SRCS := $(wildcard ./*.c )

OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
# Reuse COMMON_DIR so the BSP location is spelled out in one place only.
include $(COMMON_DIR)/common.mk
|
||||
446
sdk/software/examples/c_prg/inner_product/inner_product.c
Normal file
446
sdk/software/examples/c_prg/inner_product/inner_product.c
Normal file
@@ -0,0 +1,446 @@
|
||||
/*
|
||||
Copyright 2008 Adobe Systems Incorporated
|
||||
Copyright 2018-2019 Chris Cox
|
||||
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
|
||||
or a copy at http://stlab.adobe.com/licenses.html )
|
||||
|
||||
|
||||
Goal: Test performance of various idioms for calculating the inner product of two sequences.
|
||||
|
||||
NOTE: Inner products are common in mathematical and geometry processing applications,
|
||||
plus some audio and image processing.
|
||||
|
||||
|
||||
Assumptions:
|
||||
1) The compiler will optimize inner product operations.
|
||||
|
||||
2) The compiler may recognize inefficient inner product idioms
|
||||
and substitute efficient methods when it can.
|
||||
NOTE: the best method is highly dependent on the data types and CPU architecture
|
||||
|
||||
3) std::inner_product will be well optimized for all types and containers.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
#include <time.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
// Globals required by the board support package (BSP)
unsigned long UART_BASE              = 0xbfe001e0UL;  // virtual address of the UART16550
unsigned long CONFREG_UART_BASE      = 0xbfafff10UL;  // virtual address of the CONFREG emulated UART
unsigned long CONFREG_TIMER_BASE     = 0xbfafe000UL;  // virtual address of the CONFREG counter
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000UL;   // CONFREG clock frequency
unsigned long CORE_CLOCKS_PER_SEC    = 33000000UL;    // processor core clock frequency

// clock() readings taken immediately before and after each timed loop
clock_t start_time;
clock_t end_time;

// How many times every timed loop is repeated.
// Adjust so that the shortest test runs for roughly 1.0 second.
int iterations = 5;

// Number of elements in every test array: 8000 items, i.e. between 8k
// (1-byte elements) and 64k (8-byte elements) of data per array.
// Intended to remain within the L2 cache of most common CPUs.
const int SIZE = 8000;

// Values used to fill the test arrays, one per element width.
// Note that init_value_f16 is a float and init_value_f32 is a double.
int32_t init_value_8   = 3;
int32_t init_value_16  = 211;
int32_t init_value_32  = 1065;
float   init_value_f16 = 5.0f;
double  init_value_f32 = 365.0;
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
// Stores `value` into every int8_t element of the half-open range [first, last).
void fill_8(int8_t * first, int8_t * last, int8_t value) {
    for (int8_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
// Stores `value` into every uint8_t element of the half-open range [first, last).
void fill_u8(uint8_t * first, uint8_t * last, uint8_t value) {
    for (uint8_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
// Stores `value` into every int16_t element of the half-open range [first, last).
void fill_16(int16_t * first, int16_t * last, int16_t value) {
    for (int16_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
// Stores `value` into every uint16_t element of the half-open range [first, last).
void fill_u16(uint16_t * first, uint16_t * last, uint16_t value) {
    for (uint16_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
// Stores `value` into every int32_t element of the half-open range [first, last).
void fill_32(int32_t * first, int32_t * last, int32_t value) {
    for (int32_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
// Stores `value` into every uint32_t element of the half-open range [first, last).
void fill_u32(uint32_t * first, uint32_t * last, uint32_t value) {
    for (uint32_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
// Stores `value` into every float element of the half-open range [first, last).
void fill_f16(float * first, float * last, float value) {
    for (float *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
// Stores `value` into every double element of the half-open range [first, last).
void fill_f32(double * first, double * last, double value) {
    for (double *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
// a trivial for loop
|
||||
|
||||
void test_inner_product_8( const int8_t* first, const int8_t* second, const size_t count, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
|
||||
int8_t sum = 0 ;
|
||||
for (size_t j = 0; j < count; ++j) {
|
||||
sum += first[j] * second[j];
|
||||
}
|
||||
|
||||
//check_sum( sum, label );
|
||||
int8_t target = (int8_t)(init_value_8)*(int8_t)(init_value_8)*SIZE;
|
||||
if ( abs( sum - target ) > (int8_t)(1.0e-6) )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("CLOCKS_PER_SEC=%d\n",CLOCKS_PER_SEC);
|
||||
printf("\"%s, %lu items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
void test_inner_product_u8( const uint8_t* first, const uint8_t* second, const size_t count, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
|
||||
uint8_t sum = 0 ;
|
||||
for (size_t j = 0; j < count; ++j) {
|
||||
sum += first[j] * second[j];
|
||||
}
|
||||
|
||||
//check_sum( sum, label );
|
||||
uint8_t target = (uint8_t)(init_value_8)*(uint8_t)(init_value_8)*SIZE;
|
||||
if ( ( sum - target ) > (uint8_t)(1.0e-6) )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %lu items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
void test_inner_product_16( const int16_t* first, const int16_t* second, const size_t count, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
|
||||
int16_t sum = 0 ;
|
||||
for (size_t j = 0; j < count; ++j) {
|
||||
sum += first[j] * second[j];
|
||||
}
|
||||
|
||||
//check_sum( sum, label );
|
||||
int16_t target = (int16_t)(init_value_16)*(int16_t)(init_value_16)*SIZE;
|
||||
if ( abs( sum - target ) > (int16_t)(1.0e-6) )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %lu items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
void test_inner_product_u16( const uint16_t* first, const uint16_t* second, const size_t count, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
|
||||
uint16_t sum = 0 ;
|
||||
for (size_t j = 0; j < count; ++j) {
|
||||
sum += first[j] * second[j];
|
||||
}
|
||||
|
||||
//check_sum( sum, label );
|
||||
uint16_t target = (uint16_t)(init_value_16)*(uint16_t)(init_value_16)*SIZE;
|
||||
if ( ( sum - target ) > (uint16_t)(1.0e-6) )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %lu items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
void test_inner_product_32( const int32_t* first, const int32_t* second, const size_t count, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
|
||||
int32_t sum = 0 ;
|
||||
for (size_t j = 0; j < count; ++j) {
|
||||
sum += first[j] * second[j];
|
||||
}
|
||||
|
||||
//check_sum( sum, label );
|
||||
int32_t target = (int32_t)(init_value_32)*(int32_t)(init_value_32)*SIZE;
|
||||
if ( abs( sum - target ) > (int32_t)(1.0e-6) )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %lu items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
void test_inner_product_u32( const uint32_t* first, const uint32_t* second, const size_t count, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
|
||||
uint32_t sum = 0 ;
|
||||
for (size_t j = 0; j < count; ++j) {
|
||||
sum += first[j] * second[j];
|
||||
}
|
||||
|
||||
//check_sum( sum, label );
|
||||
uint32_t target = (uint32_t)(init_value_32)*(uint32_t)(init_value_32)*SIZE;
|
||||
if ( ( sum - target ) > (uint32_t)(1.0e-6) )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %lu items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
void test_inner_product_f16( const float* first, const float* second, const size_t count, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
|
||||
float sum = 0 ;
|
||||
for (size_t j = 0; j < count; ++j) {
|
||||
sum += first[j] * second[j];
|
||||
}
|
||||
|
||||
//check_sum( sum, label );
|
||||
float target = (float)(init_value_f16)*(float)(init_value_f16)*SIZE;
|
||||
if ( fabs( sum - target ) > (float)(1.0e-6) )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %lu items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
void test_inner_product_f32( const double* first, const double* second, const size_t count, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
|
||||
double sum = 0 ;
|
||||
for (size_t j = 0; j < count; ++j) {
|
||||
sum += first[j] * second[j];
|
||||
}
|
||||
|
||||
//check_sum( sum, label );
|
||||
double target = (double)(init_value_f32)*(double)(init_value_f32)*SIZE;
|
||||
if ( fabs( sum - target ) > (double)(1.0e-6) )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %lu items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
// NOTE - can't make generic template template argument without C++17
|
||||
// I would like to have TestOneFunction to handle all the types and if's, but need to use different types with it inside
|
||||
// see sum_sequence.cpp
|
||||
|
||||
|
||||
void TestOneType_8()
|
||||
{
|
||||
int8_t data[SIZE];
|
||||
int8_t dataB[SIZE];
|
||||
|
||||
fill_8(data, data+SIZE, (int8_t)(init_value_8));
|
||||
fill_8(dataB, dataB+SIZE, (int8_t)(init_value_8));
|
||||
|
||||
test_inner_product_8( data, dataB, SIZE, "int_8 inner_product1 to int_8");
|
||||
}
|
||||
|
||||
void TestOneType_u8()
|
||||
{
|
||||
uint8_t data[SIZE];
|
||||
uint8_t dataB[SIZE];
|
||||
|
||||
fill_u8(data, data+SIZE, (uint8_t)(init_value_8));
|
||||
fill_u8(dataB, dataB+SIZE, (uint8_t)(init_value_8));
|
||||
|
||||
test_inner_product_u8( data, dataB, SIZE, "uint_8 inner_product1 to uint_8");
|
||||
}
|
||||
|
||||
|
||||
void TestOneType_16()
|
||||
{
|
||||
int16_t data[SIZE];
|
||||
int16_t dataB[SIZE];
|
||||
|
||||
fill_16(data, data+SIZE, (int16_t)(init_value_16));
|
||||
fill_16(dataB, dataB+SIZE, (int16_t)(init_value_16));
|
||||
|
||||
test_inner_product_16( data, dataB, SIZE, "int_16 inner_product1 to int_16");
|
||||
}
|
||||
|
||||
void TestOneType_u16()
|
||||
{
|
||||
uint16_t data[SIZE];
|
||||
uint16_t dataB[SIZE];
|
||||
|
||||
fill_u16(data, data+SIZE, (uint16_t)(init_value_16));
|
||||
fill_u16(dataB, dataB+SIZE, (uint16_t)(init_value_16));
|
||||
|
||||
test_inner_product_u16( data, dataB, SIZE, "uint_16 inner_product1 to uint_16");
|
||||
}
|
||||
|
||||
void TestOneType_32()
|
||||
{
|
||||
int32_t data[SIZE];
|
||||
int32_t dataB[SIZE];
|
||||
|
||||
fill_32(data, data+SIZE, (int32_t)(init_value_32));
|
||||
fill_32(dataB, dataB+SIZE, (int32_t)(init_value_32));
|
||||
|
||||
test_inner_product_32( data, dataB, SIZE, "int_32 inner_product1 to int_32");
|
||||
}
|
||||
|
||||
void TestOneType_u32()
|
||||
{
|
||||
uint32_t data[SIZE];
|
||||
uint32_t dataB[SIZE];
|
||||
|
||||
fill_u32(data, data+SIZE, (uint32_t)(init_value_32));
|
||||
fill_u32(dataB, dataB+SIZE, (uint32_t)(init_value_32));
|
||||
|
||||
test_inner_product_u32( data, dataB, SIZE, "uint_32 inner_product1 to uint_32");
|
||||
}
|
||||
|
||||
void TestOneType_f16()
|
||||
{
|
||||
float data[SIZE];
|
||||
float dataB[SIZE];
|
||||
|
||||
fill_f16(data, data+SIZE, (float)(init_value_f16));
|
||||
fill_f16(dataB, dataB+SIZE, (float)(init_value_f16));
|
||||
|
||||
test_inner_product_f16( data, dataB, SIZE, "float inner_product1 to float");
|
||||
}
|
||||
|
||||
void TestOneType_f32()
|
||||
{
|
||||
double data[SIZE];
|
||||
double dataB[SIZE];
|
||||
|
||||
fill_f32(data, data+SIZE, (double)(init_value_f32));
|
||||
fill_f32(dataB, dataB+SIZE, (double)(init_value_f32));
|
||||
|
||||
test_inner_product_f32( data, dataB, SIZE, "double inner_product1 to double");
|
||||
}
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
// output command for documentation:
|
||||
int i;
|
||||
// for (i = 0; i < argc; ++i)
|
||||
// printf("%s ", argv[i] );
|
||||
// printf("\n");
|
||||
|
||||
if (argc > 1) iterations = atoi(argv[1]);
|
||||
// if (argc > 2) init_value = (int32_t) atoi(argv[2]);
|
||||
|
||||
|
||||
TestOneType_8();
|
||||
TestOneType_u8();
|
||||
TestOneType_16();
|
||||
TestOneType_u16();
|
||||
TestOneType_32();
|
||||
TestOneType_u32();
|
||||
|
||||
TestOneType_f16();
|
||||
TestOneType_f32();
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// the end
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
14
sdk/software/examples/c_prg/lookup_table/Makefile
Normal file
14
sdk/software/examples/c_prg/lookup_table/Makefile
Normal file
@@ -0,0 +1,14 @@
|
||||
# Build settings for the lookup_table example; the shared rules come from
# common.mk in the BSP directory.
TARGET = lookup_table

CFLAGS += -O3 -g

# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
CFLAGS += -DSIMU=0

C_SRCS := $(wildcard ./*.c )

OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
# Reuse COMMON_DIR so the BSP location is spelled out in one place only.
include $(COMMON_DIR)/common.mk
|
||||
315
sdk/software/examples/c_prg/lookup_table/lookup_table.c
Normal file
315
sdk/software/examples/c_prg/lookup_table/lookup_table.c
Normal file
@@ -0,0 +1,315 @@
|
||||
/*
|
||||
Copyright 2008-2009 Adobe Systems Incorporated
|
||||
Copyright 2018 Chris Cox
|
||||
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
|
||||
or a copy at http://stlab.adobe.com/licenses.html )
|
||||
|
||||
|
||||
Goal: Test performance of various idioms and optimizations for lookup tables.
|
||||
|
||||
|
||||
Assumptions:
|
||||
1) The compiler will optimize lookup table operations.
|
||||
Unrolling will usually be needed to hide read latencies.
|
||||
|
||||
2) The compiler should recognize inefficient lookup table idioms and substitute efficient methods.
|
||||
Many different CPU architecture issues will require reading and writing words for best performance.
|
||||
CPUs with...
|
||||
cache write-back/write-combine delays.
|
||||
store forwarding delays.
|
||||
slow cache access relative to shifts/masks.
|
||||
slow partial word (byte) access.
|
||||
fast shift/mask operations.
|
||||
On some CPUs, a lookup can be handled with vector instructions.
|
||||
On some CPUs, special cache handling is needed (especially 2way caches).
|
||||
|
||||
|
||||
|
||||
|
||||
TODO - lookup and interpolate (int16_t, int32_t, int64_t, float, double)
|
||||
TODO - 2D and 3D LUTs, simple and interpolated
|
||||
|
||||
*/
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
#include <time.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
// Globals required by the board support package (BSP)
unsigned long UART_BASE              = 0xbfe001e0UL;  // virtual address of the UART16550
unsigned long CONFREG_UART_BASE      = 0xbfafff10UL;  // virtual address of the CONFREG emulated UART
unsigned long CONFREG_TIMER_BASE     = 0xbfafe000UL;  // virtual address of the CONFREG counter
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000UL;   // CONFREG clock frequency
unsigned long CORE_CLOCKS_PER_SEC    = 33000000UL;    // processor core clock frequency

// clock() readings taken immediately before and after each timed loop
clock_t start_time;
clock_t end_time;

// Repeat counts for the timed loops.
// Adjust base_iterations so that the shortest test runs for roughly 1.0 second.
int base_iterations = 1;
int iterations = 1;

// Small working set: 2000 items, i.e. 2000 bytes of uint8_t or 4000 bytes of
// uint16_t data. Intended to remain within the L1 cache of most common CPUs.
#define SIZE_SMALL 2000

// Large working set. The original benchmark used about 0.5..1M of data to fall
// outside the L2 cache of most common CPUs; there is not that much memory
// here, so 50 KB to 100 KB is used instead.
#define SIZE 50000

// Value stored in every lookup-table entry; may be changed from the command line
int32_t init_value = 3;

/******************************************************************************/

// Global input and result buffers, one pair per element width
uint8_t  inputData8[SIZE];
uint8_t  resultData8[SIZE];

uint16_t inputData16[SIZE];
uint16_t resultData16[SIZE];
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
// Stores `value` into every uint8_t element of the half-open range [first, last).
void fill_8(uint8_t * first, uint8_t * last, uint8_t value) {
    for (uint8_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
// Stores `value` into every uint16_t element of the half-open range [first, last).
void fill_16(uint16_t * first, uint16_t * last, uint16_t value) {
    for (uint16_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
void fill_random_8(uint8_t * first, uint8_t * last) {
|
||||
srand((unsigned int)init_value + 123 );
|
||||
while (first != last) {
|
||||
*first++ = (uint8_t)rand();
|
||||
}
|
||||
}
|
||||
|
||||
void fill_random_16(uint16_t * first, uint16_t * last) {
|
||||
srand((unsigned int)init_value + 123 );
|
||||
while (first != last) {
|
||||
*first++ = (uint16_t)rand();
|
||||
}
|
||||
}
|
||||
|
||||
// Returns the larger of a and b (b when they are equal).
int max(int a, int b){
    return (a > b) ? a : b;
}
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
|
||||
// baseline - a trivial loop
|
||||
|
||||
void test_lut1_u8(const uint8_t* input, uint8_t *result, const int count, const uint8_t* LUT, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
for (int j = 0; j < count; ++j) {
|
||||
result[j] = LUT[ input[j] ];
|
||||
}
|
||||
}
|
||||
|
||||
end_time = clock();
|
||||
|
||||
int j;
|
||||
|
||||
for (j = 0; j < count; ++j) {
|
||||
if (result[j] != (uint8_t)(init_value)) {
|
||||
printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[j]), (unsigned)(init_value));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %d times\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
|
||||
}
|
||||
|
||||
void test_lut1_8(const int8_t* input, int8_t *result, const int count, const int8_t* LUT, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
for (int j = 0; j < count; ++j) {
|
||||
result[j] = LUT[ input[j] ];
|
||||
}
|
||||
}
|
||||
|
||||
end_time = clock();
|
||||
|
||||
int j;
|
||||
|
||||
for (j = 0; j < count; ++j) {
|
||||
if (result[j] != (int8_t)(init_value)) {
|
||||
printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[j]), (unsigned)(init_value));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %d times\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
|
||||
}
|
||||
|
||||
void test_lut1_u16(const uint16_t* input, uint16_t *result, const int count, const uint16_t* LUT, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
for (int j = 0; j < count; ++j) {
|
||||
result[j] = LUT[ input[j] ];
|
||||
}
|
||||
}
|
||||
|
||||
end_time = clock();
|
||||
|
||||
int j;
|
||||
|
||||
for (j = 0; j < count; ++j) {
|
||||
if (result[j] != (uint16_t)(init_value)) {
|
||||
printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[j]), (unsigned)(init_value));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %d times\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
|
||||
}
|
||||
|
||||
void test_lut1_16(const int16_t* input, int16_t *result, const int count, const int16_t* LUT, const char *label) {
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(int i = 0; i < iterations; ++i) {
|
||||
for (int j = 0; j < count; ++j) {
|
||||
result[j] = LUT[ input[j] ];
|
||||
}
|
||||
}
|
||||
|
||||
end_time = clock();
|
||||
|
||||
int j;
|
||||
|
||||
for (j = 0; j < count; ++j) {
|
||||
if (result[j] != (int16_t)(init_value)) {
|
||||
printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[j]), (unsigned)(init_value));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
printf("\"%s, %d times\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
// output command for documentation:
|
||||
int i;
|
||||
// for (i = 0; i < argc; ++i)
|
||||
// printf("%s ", argv[i] );
|
||||
// printf("\n");
|
||||
|
||||
if (argc > 1) base_iterations = atoi(argv[1]);
|
||||
if (argc > 2) init_value = (int32_t) atoi(argv[2]);
|
||||
|
||||
uint8_t myLUT8[ 256 ];
|
||||
uint16_t myLUT16[ 65536 ];
|
||||
|
||||
|
||||
fill_8(myLUT8, myLUT8+256, (uint8_t)(init_value));
|
||||
fill_16(myLUT16, myLUT16+65536, (uint16_t)(init_value));
|
||||
|
||||
fill_random_8( inputData8, inputData8+SIZE );
|
||||
fill_random_16( inputData16, inputData16+SIZE );
|
||||
|
||||
|
||||
// uint8_t
|
||||
iterations = base_iterations;
|
||||
|
||||
test_lut1_u8( inputData8, inputData8, SIZE_SMALL, myLUT8, "uint8_t lookup table1 small inplace");
|
||||
test_lut1_u8( inputData8, resultData8, SIZE_SMALL, myLUT8, "uint8_t lookup table1 small");
|
||||
|
||||
iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
|
||||
|
||||
test_lut1_u8( inputData8, inputData8, SIZE, myLUT8, "uint8_t lookup table1 large inplace");
|
||||
test_lut1_u8( inputData8, resultData8, SIZE, myLUT8, "uint8_t lookup table1 large");
|
||||
|
||||
|
||||
|
||||
// int8_t
|
||||
iterations = base_iterations;
|
||||
|
||||
test_lut1_8( (int8_t*)inputData8, (int8_t*)inputData8, SIZE_SMALL, (int8_t*)(myLUT8+128), "int8_t lookup table1 small inplace");
|
||||
test_lut1_8( (int8_t*)inputData8, (int8_t*)resultData8, SIZE_SMALL, (int8_t*)(myLUT8+128), "int8_t lookup table1 small");
|
||||
|
||||
iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
|
||||
|
||||
test_lut1_8( (int8_t*)inputData8, (int8_t*)inputData8, SIZE, (int8_t*)(myLUT8+128), "int8_t lookup table1 large inplace");
|
||||
test_lut1_8( (int8_t*)inputData8, (int8_t*)resultData8, SIZE, (int8_t*)(myLUT8+128), "int8_t lookup table1 large");
|
||||
|
||||
|
||||
// uint16_t
|
||||
iterations = base_iterations;
|
||||
|
||||
test_lut1_u16( inputData16, inputData16, SIZE_SMALL, myLUT16, "uint16_t lookup table1 small inplace");
|
||||
test_lut1_u16( inputData16, resultData16, SIZE_SMALL, myLUT16, "uint16_t lookup table1 small");
|
||||
|
||||
iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
|
||||
|
||||
test_lut1_u16( inputData16, inputData16, SIZE, myLUT16, "uint16_t lookup table1 large inplace");
|
||||
test_lut1_u16( inputData16, resultData16, SIZE, myLUT16, "uint16_t lookup table1 large");
|
||||
|
||||
// int16_t
|
||||
iterations = base_iterations;
|
||||
|
||||
test_lut1_16( (int16_t*)inputData16, (int16_t*)inputData16, SIZE_SMALL, (int16_t*)(myLUT16+32768), "int16_t lookup table1 small inplace");
|
||||
test_lut1_16( (int16_t*)inputData16, (int16_t*)resultData16, SIZE_SMALL, (int16_t*)(myLUT16+32768), "int16_t lookup table1 small");
|
||||
|
||||
iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
|
||||
|
||||
test_lut1_16( (int16_t*)inputData16, (int16_t*)inputData16, SIZE, (int16_t*)(myLUT16+32768), "int16_t lookup table1 large inplace");
|
||||
test_lut1_16( (int16_t*)inputData16, (int16_t*)resultData16, SIZE, (int16_t*)(myLUT16+32768), "int16_t lookup table1 large");
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// the end
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
14
sdk/software/examples/c_prg/loop_induction/Makefile
Normal file
14
sdk/software/examples/c_prg/loop_induction/Makefile
Normal file
@@ -0,0 +1,14 @@
|
||||
# Build settings for the loop_induction example; the shared rules come from
# common.mk in the BSP directory.
TARGET = loop_induction

CFLAGS += -O3 -g

# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
CFLAGS += -DSIMU=0

C_SRCS := $(wildcard ./*.c )

OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
# Reuse COMMON_DIR so the BSP location is spelled out in one place only.
include $(COMMON_DIR)/common.mk
|
||||
131
sdk/software/examples/c_prg/loop_induction/loop_induction.c
Normal file
131
sdk/software/examples/c_prg/loop_induction/loop_induction.c
Normal file
@@ -0,0 +1,131 @@
|
||||
/*
|
||||
Copyright 2018 Chris Cox
|
||||
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
|
||||
or a copy at http://stlab.adobe.com/licenses.html )
|
||||
|
||||
|
||||
Goal: Examine performance optimizations related to loop induction variables.
|
||||
|
||||
|
||||
Assumptions:
|
||||
1) The compiler will normalize all loop types and optimize all equally.
|
||||
(this is a necessary step before doing induction variable analysis)
|
||||
|
||||
2) The compiler will remove unused induction variables.
|
||||
This could happen due to several optimizations.
|
||||
|
||||
2) The compiler will recognize induction variables with linear relations (x = a*b + c)
|
||||
and optimize out redundant variables.
|
||||
|
||||
3) The compiler will apply strength reduction to induction variable usage.
|
||||
|
||||
4) The compiler will remove bounds checks by recognizing or adjusting loop limits.
|
||||
(can be an explicit loop optimization, or part of range propagation)
|
||||
|
||||
|
||||
*/
|
||||
|
||||
#include <time.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
// Globals required by the board support package (BSP)
unsigned long UART_BASE              = 0xbfe001e0UL;  // virtual address of the UART16550
unsigned long CONFREG_UART_BASE      = 0xbfafff10UL;  // virtual address of the CONFREG emulated UART
unsigned long CONFREG_TIMER_BASE     = 0xbfafe000UL;  // virtual address of the CONFREG counter
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000UL;   // CONFREG clock frequency
unsigned long CORE_CLOCKS_PER_SEC    = 33000000UL;    // processor core clock frequency

// clock() readings taken immediately before and after the timed loop
clock_t start_time;
clock_t end_time;

/******************************************************************************/

// How many times the timed loop is repeated.
// Adjust so that the shortest test runs for roughly 1.0 second.
int iterations = 10;

// Number of int32_t elements per array: 32000 items, or about 128k of data.
// Intended to remain within the L2 cache of most common CPUs.
const int SIZE = 32000;

// Seed offset for the pseudo-random fill; may be changed from the command line
int init_value = 3;
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
// Stores successive rand() results into every int32_t element of the
// half-open range [first, last). The generator is not reseeded here.
void fill_random(int32_t * first, int32_t * last) {
    for (int32_t *cursor = first; cursor != last; ++cursor) {
        *cursor = (int32_t)rand();
    }
}
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
void test_copy(const int32_t *source, int32_t *dest, int count, const char *label) {
|
||||
int i;
|
||||
|
||||
fill_random( dest, dest+count );
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(i = 0; i < iterations; ++i) {
|
||||
int i, j, k;
|
||||
for ( i=0, j=0, k=0; k < count; ++i, ++j, ++k ) {
|
||||
dest[i] = source[j];
|
||||
}
|
||||
}
|
||||
|
||||
end_time = clock();
|
||||
|
||||
if ( memcmp(dest, source, count*sizeof(int32_t)) != 0 )
|
||||
printf("test %s failed\n", label);
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
|
||||
printf("\"%s, %d items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
// output command for documentation:
|
||||
int i;
|
||||
// for (i = 0; i < argc; ++i)
|
||||
// printf("%s ", argv[i] );
|
||||
// printf("\n");
|
||||
|
||||
if (argc > 1) iterations = atoi(argv[1]);
|
||||
if (argc > 2) init_value = (int) atoi(argv[2]);
|
||||
|
||||
int32_t intSrc[ SIZE ];
|
||||
int32_t intDst[ SIZE ];
|
||||
|
||||
|
||||
srand( (unsigned int)init_value + 123);
|
||||
fill_random( intSrc, intSrc+SIZE );
|
||||
|
||||
|
||||
test_copy( &intSrc[0], &intDst[0], SIZE, "int32_t for induction copy" );
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// the end
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
14
sdk/software/examples/c_prg/memcmp/Makefile
Normal file
14
sdk/software/examples/c_prg/memcmp/Makefile
Normal file
@@ -0,0 +1,14 @@
|
||||
TARGET = memcmp

CFLAGS += -O3 -g

# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
CFLAGS += -DSIMU=0

C_SRCS := $(wildcard ./*.c )

OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
# Reuse COMMON_DIR so the BSP location is spelled out in exactly one place.
include $(COMMON_DIR)/common.mk
|
||||
177
sdk/software/examples/c_prg/memcmp/memcmp.c
Normal file
177
sdk/software/examples/c_prg/memcmp/memcmp.c
Normal file
@@ -0,0 +1,177 @@
|
||||
/*
|
||||
Copyright 2008-2009 Adobe Systems Incorporated
|
||||
Copyright 2018 Chris Cox
|
||||
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
|
||||
or a copy at http://stlab.adobe.com/licenses.html )
|
||||
|
||||
|
||||
Goal: Test compiler optimizations related to memcmp and hand coded memcmp loops.
|
||||
|
||||
|
||||
Assumptions:
|
||||
|
||||
1) The compiler will recognize memcmp like loops and optimize appropriately.
|
||||
This could be substitution of calls to memcmp,
|
||||
or it could be just optimizing the loop to get the best throughput.
|
||||
On modern systems, cache hinting is usually required for best throughput.
|
||||
|
||||
2) The library function memcmp should be optimized for small, medium, and large buffers.
|
||||
ie: low overhead for smaller buffer, highly hinted for large buffers.
|
||||
|
||||
3) The STL functions equal and mismatch should be optimized for small, medium, and large buffers.
|
||||
ie: low overhead for smaller buffers, highly hinted for large buffers.
|
||||
|
||||
|
||||
|
||||
|
||||
NOTE - on some OSes, memcmp calls into the VM system to test for shared pages
|
||||
thus running faster than the DRAM bandwidth would allow on large arrays
|
||||
|
||||
However, on those OSes, calling memcmp can hit mutexes and slow down
|
||||
significantly when called from threads.
|
||||
|
||||
|
||||
NOTE - Linux memcmp returns 0, +-1 instead of the actual difference
|
||||
NOTE - and sometimes Linux memcmp returns 0, +-256 instead of the actual difference
|
||||
|
||||
|
||||
TODO - test performance of unaligned buffers
|
||||
*/
|
||||
|
||||
#include <time.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
/******************************************************************************/
|
||||
// Global variables required by the BSP (board support package)
|
||||
unsigned long UART_BASE = 0xbfe001e0; // Virtual address of the UART16550
|
||||
unsigned long CONFREG_UART_BASE = 0xbfafff10; // Virtual address of the CONFREG-emulated UART
|
||||
unsigned long CONFREG_TIMER_BASE = 0xbfafe000; // Virtual address of the CONFREG counter
|
||||
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L; // CONFREG clock frequency
|
||||
unsigned long CORE_CLOCKS_PER_SEC = 33000000L; // Processor core clock frequency
|
||||
|
||||
clock_t start_time, end_time;
|
||||
|
||||
// this constant may need to be adjusted to give reasonable minimum times
|
||||
// For best results, times should be about 1.0 seconds for the minimum test run
|
||||
int iterations = 1;
|
||||
|
||||
|
||||
// 64 Megabytes, intended to be larger than L2 cache on common CPUs
|
||||
// needs to be divisible by 8
|
||||
// The target does not have that much memory, so SIZE_3M below is reduced to 30 KB (30720 bytes)
|
||||
#define SIZE_4K 4096
|
||||
// #define SIZE_3M 3145728
|
||||
#define SIZE_3M 30720
|
||||
|
||||
// initial value for filling our arrays, may be changed from the command line
|
||||
uint8_t init_value = 3;
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
// Store `value` into every byte of the half-open range [first, last).
// An empty range (first == last) leaves memory untouched.
void fill(uint8_t * first, uint8_t * last, uint8_t value) {
    for (uint8_t *cursor = first; cursor != last; ++cursor) {
        *cursor = value;
    }
}
|
||||
|
||||
|
||||
// Byte-wise comparison of two buffers, written as a plain for loop.
//
//   first, second : buffers to compare; each must hold at least `bytes` bytes
//   bytes         : number of bytes to compare
//
// Returns 0 when the first `bytes` bytes are identical (including bytes == 0).
// Otherwise returns first[x] - second[x] for the lowest differing index x,
// i.e. the actual byte difference rather than a normalised -1/+1.
int forloop_memcmp( const void *first, const void *second, size_t bytes ){
    const uint8_t *first_byte = (const uint8_t *)first;
    const uint8_t *second_byte = (const uint8_t *)second;

    // The index has the same type as the length: an int index would be
    // compared against size_t after an implicit conversion and would overflow
    // for lengths above INT_MAX.
    for (size_t x = 0; x < bytes; ++x) {
        if (first_byte[x] != second_byte[x]) {
            return (first_byte[x] - second_byte[x]);
        }
    }

    return 0;
}
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
void test_memcmp(const uint8_t *first, const uint8_t *second, int count, bool expected_result) {
|
||||
int i;
|
||||
int bytes = count * sizeof(uint8_t);
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(i = 0; i < iterations; ++i) {
|
||||
// sigh, Linux memcmp is wonky - some return 1, some return 256
|
||||
bool result = (forloop_memcmp( first, second, bytes ) != 0) ;
|
||||
|
||||
// moving this test out of the loop causes unwanted overoptimization
|
||||
if ( result != expected_result )
|
||||
printf("test %s by %d failed (got %d instead of %d)\n", "for loop compare", count, (int)result, (int)expected_result );
|
||||
}
|
||||
|
||||
end_time = clock();
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
void test_memcmp_sizes(const uint8_t *first, const uint8_t *second, int max_count, bool result) {
|
||||
int i = max_count * sizeof(uint8_t);
|
||||
|
||||
test_memcmp( first, second, max_count, result);
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
|
||||
printf("\"%s %d bytes\" compare result: %s %f sec\n",
|
||||
"for loop compare",
|
||||
i,
|
||||
result ? "false" : "true",
|
||||
time_cost);
|
||||
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
// our global arrays of numbers to be operated upon
|
||||
|
||||
uint8_t data8u[SIZE_3M/sizeof(uint8_t)];
|
||||
// Number of extra bytes at the end of data8u_dest. Must stay equal to the
// "+ 1024" in that array's size, because main() fills
// SIZE_3M + alignment_pad bytes of data8u_dest.
int alignment_pad = 1024;
|
||||
uint8_t data8u_dest[SIZE_3M/sizeof(uint8_t) + 1024]; // leave some room for alignment testing
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
// output command for documentation:
|
||||
int i;
|
||||
|
||||
if (argc > 1) iterations = atoi(argv[1]);
|
||||
if (argc > 2) init_value = (int32_t) atoi(argv[2]);
|
||||
|
||||
|
||||
fill( data8u, data8u+(SIZE_3M/sizeof(uint8_t)), (uint8_t)(init_value) );
|
||||
fill( data8u_dest, data8u_dest+(SIZE_3M/sizeof(uint8_t) + alignment_pad), (uint8_t)(init_value) );
|
||||
test_memcmp_sizes( data8u, data8u_dest, SIZE_3M/sizeof(uint8_t), false);
|
||||
data8u[(SIZE_3M/sizeof(uint8_t))-1] += 1; // last byte in the array
|
||||
test_memcmp_sizes( data8u, data8u_dest, SIZE_3M/sizeof(uint8_t), true);
|
||||
/*
|
||||
test_memcmp_sizes( data8u, data8u_dest, SIZE_1M/sizeof(uint8_t), false);
|
||||
data8u[(SIZE_1M/sizeof(uint8_t))-1] += 1; // last byte in the array
|
||||
test_memcmp_sizes( data8u, data8u_dest, SIZE_1M/sizeof(uint8_t), true);
|
||||
*/
|
||||
test_memcmp_sizes( data8u, data8u_dest, SIZE_4K/sizeof(uint8_t), false);
|
||||
data8u[(SIZE_4K/sizeof(uint8_t))-1] += 1; // last byte in the array
|
||||
test_memcmp_sizes( data8u, data8u_dest, SIZE_4K/sizeof(uint8_t), true);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// the end
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
14
sdk/software/examples/c_prg/minmax_sequence/Makefile
Normal file
14
sdk/software/examples/c_prg/minmax_sequence/Makefile
Normal file
@@ -0,0 +1,14 @@
|
||||
TARGET = minmax_sequence

CFLAGS += -O3 -g

# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
CFLAGS += -DSIMU=0

C_SRCS := $(wildcard ./*.c )

OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
# Reuse COMMON_DIR so the BSP location is spelled out in exactly one place.
include $(COMMON_DIR)/common.mk
|
||||
1509
sdk/software/examples/c_prg/minmax_sequence/minmax_sequence.c
Normal file
1509
sdk/software/examples/c_prg/minmax_sequence/minmax_sequence.c
Normal file
File diff suppressed because it is too large
Load Diff
14
sdk/software/examples/c_prg/product_sequence/Makefile
Normal file
14
sdk/software/examples/c_prg/product_sequence/Makefile
Normal file
@@ -0,0 +1,14 @@
|
||||
TARGET = product_sequence

CFLAGS += -O3 -g

# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
CFLAGS += -DSIMU=0

C_SRCS := $(wildcard ./*.c )

OBJDIR = obj
COMMON_DIR = ../../../bsp
GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
PICOLIBC_DIR=../../../../toolchains/picolibc
# Reuse COMMON_DIR so the BSP location is spelled out in exactly one place.
include $(COMMON_DIR)/common.mk
|
||||
169
sdk/software/examples/c_prg/product_sequence/product_sequence.c
Normal file
169
sdk/software/examples/c_prg/product_sequence/product_sequence.c
Normal file
@@ -0,0 +1,169 @@
|
||||
/*
|
||||
Copyright 2008 Adobe Systems Incorporated
|
||||
Copyright 2019 Chris Cox
|
||||
Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
|
||||
or a copy at http://stlab.adobe.com/licenses.html )
|
||||
|
||||
|
||||
Goal: Test performance of various idioms for calculating the product of a sequence.
|
||||
|
||||
|
||||
Assumptions:
|
||||
1) The compiler will optimize product operations.
|
||||
|
||||
2) The compiler may recognize inefficient product idioms and substitute efficient methods.
|
||||
|
||||
|
||||
*/
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
#include <time.h>
|
||||
#include <stdlib.h>
|
||||
#include <math.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <stdint.h>
|
||||
#include <stdbool.h>
|
||||
|
||||
|
||||
/******************************************************************************/
|
||||
// Global variables required by the BSP (board support package)
|
||||
unsigned long UART_BASE = 0xbfe001e0; // Virtual address of the UART16550
|
||||
unsigned long CONFREG_UART_BASE = 0xbfafff10; // Virtual address of the CONFREG-emulated UART
|
||||
unsigned long CONFREG_TIMER_BASE = 0xbfafe000; // Virtual address of the CONFREG counter
|
||||
unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L; // CONFREG clock frequency
|
||||
unsigned long CORE_CLOCKS_PER_SEC = 33000000L; // Processor core clock frequency
|
||||
|
||||
clock_t start_time, end_time;
|
||||
|
||||
/******************************************************************************/
|
||||
|
||||
// this constant may need to be adjusted to give reasonable minimum times
|
||||
// For best results, times should be about 1.0 seconds for the minimum test run
|
||||
int iterations = 10;
|
||||
|
||||
|
||||
// 4000 items, or about 32k of data
|
||||
// this is intended to remain within the L2 cache of most common CPUs
|
||||
const int SIZE = 4000;
|
||||
|
||||
|
||||
// initial value for filling our arrays, may be changed from the command line
|
||||
double init_value = 2.1;
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
// Store `value` into every element of the half-open range [first, last).
// Despite the "f16" suffix, the element type is plain C `float`.
void fill_f16(float * first, float * last, float value) {
    for (float *slot = first; slot != last; ++slot) {
        *slot = (float)(value);
    }
}
|
||||
|
||||
// Store `value` into every element of the half-open range [first, last).
// Despite the "f32" suffix, the element type is C `double`.
void fill_f32(double * first, double * last, double value) {
    for (double *slot = first; slot != last; ++slot) {
        *slot = (double)(value);
    }
}
|
||||
|
||||
|
||||
void testOneFunction_f16(const float* first, const int count, const char * label) {
|
||||
int i;
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(i = 0; i < iterations; ++i) {
|
||||
|
||||
float result = (float)(1);
|
||||
for (int j = 0; j < count; ++j) {
|
||||
result = result * first[j];
|
||||
}
|
||||
|
||||
if ( fabs( result - pow(init_value,(double)SIZE) ) > 1.0e-6 )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
|
||||
printf("\"%s, %d items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
void testOneFunction_f32(const double* first, const int count, const char * label) {
|
||||
int i;
|
||||
|
||||
start_time = clock();
|
||||
|
||||
for(i = 0; i < iterations; ++i) {
|
||||
|
||||
double result = (double)(1);
|
||||
for (int j = 0; j < count; ++j) {
|
||||
result = result * first[j];
|
||||
}
|
||||
|
||||
if ( fabs( result - pow(init_value,(double)SIZE) ) > 1.0e-6 )
|
||||
printf("test %s failed\n", label);
|
||||
}
|
||||
|
||||
// need the labels to remain valid until we print the summary
|
||||
end_time = clock();
|
||||
|
||||
double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
|
||||
|
||||
printf("\"%s, %d items\" %f sec\n",
|
||||
label,
|
||||
count,
|
||||
time_cost);
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
void TestOneType_f16()
|
||||
{
|
||||
|
||||
float data[SIZE];
|
||||
|
||||
fill_f16(data, data+SIZE, (float)(init_value));
|
||||
|
||||
testOneFunction_f16( data, SIZE, "float product sequence1" );
|
||||
|
||||
}
|
||||
|
||||
void TestOneType_f32()
|
||||
{
|
||||
|
||||
double data[SIZE];
|
||||
|
||||
fill_f32(data, data+SIZE, (double)(init_value));
|
||||
|
||||
testOneFunction_f32( data, SIZE, "double product sequence1" );
|
||||
|
||||
}
|
||||
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
|
||||
// output command for documentation:
|
||||
int i;
|
||||
// for (i = 0; i < argc; ++i)
|
||||
// printf("%s ", argv[i] );
|
||||
// printf("\n");
|
||||
|
||||
if (argc > 1) iterations = atoi(argv[1]);
|
||||
if (argc > 2) init_value = (double) atof(argv[2]);
|
||||
|
||||
|
||||
TestOneType_f16();
|
||||
|
||||
TestOneType_f32();
|
||||
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
// the end
|
||||
/******************************************************************************/
|
||||
/******************************************************************************/
|
||||
Reference in New Issue
Block a user