initial commit

2026-04-12 22:20:18 +08:00
commit 190c2edbb2
155 changed files with 36314 additions and 0 deletions
--- a/sdk/software/examples/c_prg/inner_product/Makefile
+++ b/sdk/software/examples/c_prg/inner_product/Makefile
@@ -0,0 +1,14 @@
+# Build settings for the inner_product example; the shared rules are pulled in from bsp/common.mk
+TARGET = inner_product
+
+CFLAGS += -O3 -g
+
+# SIMU selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
+CFLAGS += -DSIMU=0
+
+C_SRCS := $(wildcard ./*.c )
+
+OBJDIR = obj
+COMMON_DIR = ../../../bsp
+GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
+PICOLIBC_DIR=../../../../toolchains/picolibc
+include ../../../bsp/common.mk
--- a/sdk/software/examples/c_prg/inner_product/inner_product.c
+++ b/sdk/software/examples/c_prg/inner_product/inner_product.c
@@ -0,0 +1,446 @@
+/*
+    Copyright 2008 Adobe Systems Incorporated
+    Copyright 2018-2019 Chris Cox
+    Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
+    or a copy at http://stlab.adobe.com/licenses.html )
+
+
+Goal:  Test performance of various idioms for calculating the inner product of two sequences.
+
+NOTE:  Inner products are common in mathematical and geometry processing applications,
+        plus some audio and image processing.
+
+
+Assumptions:
+    1) The compiler will optimize inner product operations.
+
+    2) The compiler may recognize inefficient inner product idioms
+        and substitute efficient methods when it can.
+        NOTE: the best method is highly dependent on the data types and CPU architecture
+
+    3) std::inner_product will be well optimized for all types and containers.
+
+
+*/
+
+/******************************************************************************/
+
+#include <time.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+/******************************************************************************/
+/******************************************************************************/
+// Global variables required by the BSP (board support package)
+unsigned long UART_BASE = 0xbfe001e0;					// virtual address of the UART16550
+unsigned long CONFREG_UART_BASE = 0xbfafff10;			// virtual address of the CONFREG-emulated UART
+unsigned long CONFREG_TIMER_BASE = 0xbfafe000;			// virtual address of the CONFREG counter
+unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L;		// CONFREG clock frequency
+unsigned long CORE_CLOCKS_PER_SEC = 33000000L;			// processor core clock frequency
+
+// timestamps taken with clock() immediately before and after each timed test
+clock_t start_time, end_time;
+
+// this constant may need to be adjusted to give reasonable minimum times
+// For best results, times should be about 1.0 seconds for the minimum test run
+int iterations = 5;
+
+
+// 8000 items, or between 8 and 64k of data
+// this is intended to remain within the L2 cache of most common CPUs
+const int SIZE = 8000;
+
+
+// initial values for filling our arrays, one per element width
+// (the command-line override for these is currently commented out in main)
+// NOTE: init_value_f16 is a float and init_value_f32 is a double, despite the suffixes
+int32_t init_value_8 = 3;
+int32_t init_value_16 = 211;
+int32_t init_value_32 = 1065;
+float init_value_f16 = 5.0;
+double init_value_f32 = 365.0;
+
+/******************************************************************************/
+/******************************************************************************/
+
+/* Store `value` into every int8_t element of the half-open range [first, last). */
+void fill_8(int8_t * first, int8_t * last, int8_t value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/* Store `value` into every uint8_t element of the half-open range [first, last). */
+void fill_u8(uint8_t * first, uint8_t * last, uint8_t value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/* Store `value` into every int16_t element of the half-open range [first, last). */
+void fill_16(int16_t * first, int16_t * last, int16_t value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/* Store `value` into every uint16_t element of the half-open range [first, last). */
+void fill_u16(uint16_t * first, uint16_t * last, uint16_t value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/* Store `value` into every int32_t element of the half-open range [first, last). */
+void fill_32(int32_t * first, int32_t * last, int32_t value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/* Store `value` into every uint32_t element of the half-open range [first, last). */
+void fill_u32(uint32_t * first, uint32_t * last, uint32_t value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/* Store `value` into every float element of the half-open range [first, last).
+   Note: despite the _f16 suffix, the element type is float. */
+void fill_f16(float * first, float * last, float value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/* Store `value` into every double element of the half-open range [first, last).
+   Note: despite the _f32 suffix, the element type is double. */
+void fill_f32(double * first, double * last, double value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/******************************************************************************/
+/******************************************************************************/
+// a trivial for loop
+
+/*
+ * Time `iterations` passes of a plain indexed inner-product loop over two
+ * int8_t sequences, check each result, and print the elapsed time.
+ *
+ *   first, second - input sequences, each holding at least `count` elements
+ *   count         - number of element pairs to multiply and accumulate
+ *   label         - test name used in the failure and timing messages
+ *
+ * The accumulator is only 8 bits wide, so the sum wraps; the expected value
+ * is reduced to int8_t the same way before it is compared.
+ * The check assumes every element of both inputs equals init_value_8.
+ */
+void test_inner_product_8( const int8_t* first, const int8_t* second, const size_t count, const char *label) {
+
+    // Expected result, derived from `count` (not the global SIZE) so the check
+    // stays valid for whatever length the caller passes. The product is done
+    // in unsigned arithmetic so it cannot overflow a signed int.
+    const int8_t target = (int8_t)((uint32_t)((int8_t)(init_value_8)*(int8_t)(init_value_8)) * (uint32_t)count);
+
+    start_time = clock();
+
+    for(int i = 0; i < iterations; ++i) {
+
+        int8_t sum = 0 ;
+        for (size_t j = 0; j < count; ++j) {
+            sum += first[j] * second[j];
+        }
+
+        // integer results are exact, so any difference is a failure
+        if ( sum != target )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+    // CLOCKS_PER_SEC is not guaranteed to be an int, so cast it to match the format
+    printf("CLOCKS_PER_SEC=%lu\n", (unsigned long)(CLOCKS_PER_SEC));
+    // size_t is not guaranteed to be unsigned long, so cast it to match %lu
+    printf("\"%s, %lu items\"  %f sec\n",
+        label,
+        (unsigned long)count,
+        time_cost);
+}
+
+/*
+ * Time `iterations` passes of a plain indexed inner-product loop over two
+ * uint8_t sequences, check each result, and print the elapsed time.
+ *
+ *   first, second - input sequences, each holding at least `count` elements
+ *   count         - number of element pairs to multiply and accumulate
+ *   label         - test name used in the failure and timing messages
+ *
+ * The accumulator is only 8 bits wide, so the sum wraps modulo 256; the
+ * expected value is reduced the same way before it is compared.
+ * The check assumes every element of both inputs equals init_value_8.
+ */
+void test_inner_product_u8( const uint8_t* first, const uint8_t* second, const size_t count, const char *label) {
+
+    // Expected result, derived from `count` (not the global SIZE) so the check
+    // stays valid for whatever length the caller passes.
+    const uint8_t target = (uint8_t)((uint32_t)((uint8_t)(init_value_8)*(uint8_t)(init_value_8)) * (uint32_t)count);
+
+    start_time = clock();
+
+    for(int i = 0; i < iterations; ++i) {
+
+        uint8_t sum = 0 ;
+        for (size_t j = 0; j < count; ++j) {
+            sum += first[j] * second[j];
+        }
+
+        // Compare directly: uint8_t operands are promoted to int, so a
+        // "(sum - target) > 0" test would miss every case where sum < target.
+        if ( sum != target )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+    // size_t is not guaranteed to be unsigned long, so cast it to match %lu
+    printf("\"%s, %lu items\"  %f sec\n",
+        label,
+        (unsigned long)count,
+        time_cost);
+}
+
+/*
+ * Time `iterations` passes of a plain indexed inner-product loop over two
+ * int16_t sequences, check each result, and print the elapsed time.
+ *
+ *   first, second - input sequences, each holding at least `count` elements
+ *   count         - number of element pairs to multiply and accumulate
+ *   label         - test name used in the failure and timing messages
+ *
+ * The accumulator is only 16 bits wide, so the sum wraps; the expected value
+ * is reduced to int16_t the same way before it is compared.
+ * The check assumes every element of both inputs equals init_value_16.
+ */
+void test_inner_product_16( const int16_t* first, const int16_t* second, const size_t count, const char *label) {
+
+    // Expected result, derived from `count` (not the global SIZE) so the check
+    // stays valid for whatever length the caller passes. The product is done
+    // in unsigned arithmetic so it cannot overflow a signed int.
+    const int16_t target = (int16_t)((uint32_t)((int16_t)(init_value_16)*(int16_t)(init_value_16)) * (uint32_t)count);
+
+    start_time = clock();
+
+    for(int i = 0; i < iterations; ++i) {
+
+        int16_t sum = 0 ;
+        for (size_t j = 0; j < count; ++j) {
+            sum += first[j] * second[j];
+        }
+
+        // integer results are exact, so any difference is a failure
+        if ( sum != target )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+    // size_t is not guaranteed to be unsigned long, so cast it to match %lu
+    printf("\"%s, %lu items\"  %f sec\n",
+        label,
+        (unsigned long)count,
+        time_cost);
+}
+
+/*
+ * Time `iterations` passes of a plain indexed inner-product loop over two
+ * uint16_t sequences, check each result, and print the elapsed time.
+ *
+ *   first, second - input sequences, each holding at least `count` elements
+ *   count         - number of element pairs to multiply and accumulate
+ *   label         - test name used in the failure and timing messages
+ *
+ * The accumulator is only 16 bits wide, so the sum wraps modulo 65536; the
+ * expected value is reduced the same way before it is compared.
+ * The check assumes every element of both inputs equals init_value_16.
+ */
+void test_inner_product_u16( const uint16_t* first, const uint16_t* second, const size_t count, const char *label) {
+
+    // Expected result, derived from `count` (not the global SIZE). All factors
+    // are widened to unsigned first: uint16_t operands would otherwise be
+    // promoted to signed int, where a large product overflows.
+    const uint16_t target = (uint16_t)((uint32_t)(uint16_t)(init_value_16) * (uint32_t)(uint16_t)(init_value_16) * (uint32_t)count);
+
+    start_time = clock();
+
+    for(int i = 0; i < iterations; ++i) {
+
+        uint16_t sum = 0 ;
+        for (size_t j = 0; j < count; ++j) {
+            // multiply as unsigned to avoid signed-int overflow for large elements
+            sum += (uint32_t)first[j] * second[j];
+        }
+
+        // Compare directly: uint16_t operands are promoted to int, so a
+        // "(sum - target) > 0" test would miss every case where sum < target.
+        if ( sum != target )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+    // size_t is not guaranteed to be unsigned long, so cast it to match %lu
+    printf("\"%s, %lu items\"  %f sec\n",
+        label,
+        (unsigned long)count,
+        time_cost);
+}
+
+/*
+ * Time `iterations` passes of a plain indexed inner-product loop over two
+ * int32_t sequences, check each result, and print the elapsed time.
+ *
+ *   first, second - input sequences, each holding at least `count` elements
+ *   count         - number of element pairs to multiply and accumulate
+ *   label         - test name used in the failure and timing messages
+ *
+ * With the default data (1065 * 1065 * 8000) the true sum does not fit in
+ * 32 bits. Signed overflow is undefined behaviour, so both the sum and the
+ * expected value are computed in uint32_t, where wrap-around modulo 2^32 is
+ * well defined, and compared as 32-bit patterns.
+ * The check assumes every element of both inputs equals init_value_32.
+ */
+void test_inner_product_32( const int32_t* first, const int32_t* second, const size_t count, const char *label) {
+
+    // Expected result modulo 2^32, derived from `count` (not the global SIZE)
+    const uint32_t target = (uint32_t)(init_value_32) * (uint32_t)(init_value_32) * (uint32_t)count;
+
+    start_time = clock();
+
+    for(int i = 0; i < iterations; ++i) {
+
+        uint32_t sum = 0 ;
+        for (size_t j = 0; j < count; ++j) {
+            // unsigned multiply-accumulate: same low 32 bits as the signed
+            // computation, without the undefined overflow
+            sum += (uint32_t)first[j] * (uint32_t)second[j];
+        }
+
+        // integer results are exact, so any difference is a failure
+        if ( sum != target )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+    // size_t is not guaranteed to be unsigned long, so cast it to match %lu
+    printf("\"%s, %lu items\"  %f sec\n",
+        label,
+        (unsigned long)count,
+        time_cost);
+}
+
+/*
+ * Time `iterations` passes of a plain indexed inner-product loop over two
+ * uint32_t sequences, check each result, and print the elapsed time.
+ *
+ *   first, second - input sequences, each holding at least `count` elements
+ *   count         - number of element pairs to multiply and accumulate
+ *   label         - test name used in the failure and timing messages
+ *
+ * Sum and expected value both wrap modulo 2^32.
+ * The check assumes every element of both inputs equals init_value_32.
+ */
+void test_inner_product_u32( const uint32_t* first, const uint32_t* second, const size_t count, const char *label) {
+
+    // Expected result, derived from `count` (not the global SIZE) so the check
+    // stays valid for whatever length the caller passes.
+    const uint32_t target = (uint32_t)(init_value_32) * (uint32_t)(init_value_32) * (uint32_t)count;
+
+    start_time = clock();
+
+    for(int i = 0; i < iterations; ++i) {
+
+        uint32_t sum = 0 ;
+        for (size_t j = 0; j < count; ++j) {
+            sum += first[j] * second[j];
+        }
+
+        // integer results are exact, so any difference is a failure
+        if ( sum != target )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+    // size_t is not guaranteed to be unsigned long, so cast it to match %lu
+    printf("\"%s, %lu items\"  %f sec\n",
+        label,
+        (unsigned long)count,
+        time_cost);
+}
+
+/*
+ * Time `iterations` passes of a plain indexed inner-product loop over two
+ * float sequences, check each result, and print the elapsed time.
+ * (Despite the _f16 suffix, the element type is float.)
+ *
+ *   first, second - input sequences, each holding at least `count` elements
+ *   count         - number of element pairs to multiply and accumulate
+ *   label         - test name used in the failure and timing messages
+ *
+ * The check assumes every element of both inputs equals init_value_f16 and
+ * uses an absolute tolerance of 1.0e-6.
+ */
+void test_inner_product_f16( const float* first, const float* second, const size_t count, const char *label) {
+
+    // Expected result, derived from `count` (not the global SIZE) so the check
+    // stays valid for whatever length the caller passes.
+    const float target = (float)(init_value_f16)*(float)(init_value_f16)*(float)count;
+
+    start_time = clock();
+
+    for(int i = 0; i < iterations; ++i) {
+
+        float sum = 0 ;
+        for (size_t j = 0; j < count; ++j) {
+            sum += first[j] * second[j];
+        }
+
+        if ( fabs( sum - target ) > (float)(1.0e-6) )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+    // size_t is not guaranteed to be unsigned long, so cast it to match %lu
+    printf("\"%s, %lu items\"  %f sec\n",
+        label,
+        (unsigned long)count,
+        time_cost);
+}
+
+/*
+ * Time `iterations` passes of a plain indexed inner-product loop over two
+ * double sequences, check each result, and print the elapsed time.
+ * (Despite the _f32 suffix, the element type is double.)
+ *
+ *   first, second - input sequences, each holding at least `count` elements
+ *   count         - number of element pairs to multiply and accumulate
+ *   label         - test name used in the failure and timing messages
+ *
+ * The check assumes every element of both inputs equals init_value_f32 and
+ * uses an absolute tolerance of 1.0e-6.
+ */
+void test_inner_product_f32( const double* first, const double* second, const size_t count, const char *label) {
+
+    // Expected result, derived from `count` (not the global SIZE) so the check
+    // stays valid for whatever length the caller passes.
+    const double target = (double)(init_value_f32)*(double)(init_value_f32)*(double)count;
+
+    start_time = clock();
+
+    for(int i = 0; i < iterations; ++i) {
+
+        double sum = 0 ;
+        for (size_t j = 0; j < count; ++j) {
+            sum += first[j] * second[j];
+        }
+
+        if ( fabs( sum - target ) > (double)(1.0e-6) )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+    // size_t is not guaranteed to be unsigned long, so cast it to match %lu
+    printf("\"%s, %lu items\"  %f sec\n",
+        label,
+        (unsigned long)count,
+        time_cost);
+}
+
+/******************************************************************************/
+/******************************************************************************/
+
+// NOTE - can't make generic template template argument without C++17
+// I would like to have TestOneFunction to handle all the types and if's, but need to use different types with it inside
+// see sum_sequence.cpp
+
+
+/*
+ * Run the int8_t inner-product test: build two SIZE-element arrays, set every
+ * element to init_value_8 and pass them to test_inner_product_8().
+ *
+ * NOTE(review): SIZE is a `const int`, so in C both arrays are variable-length
+ * arrays on the stack - confirm the target stack is large enough.
+ */
+void TestOneType_8()
+{
+    const int8_t seed = (int8_t)(init_value_8);
+    int8_t data[SIZE];
+    int8_t dataB[SIZE];
+
+    fill_8(&data[0], &data[0] + SIZE, seed);
+    fill_8(&dataB[0], &dataB[0] + SIZE, seed);
+
+    test_inner_product_8( data, dataB, SIZE, "int_8 inner_product1 to int_8");
+}
+
+/*
+ * Run the uint8_t inner-product test: build two SIZE-element arrays, set every
+ * element to init_value_8 and pass them to test_inner_product_u8().
+ *
+ * NOTE(review): SIZE is a `const int`, so in C both arrays are variable-length
+ * arrays on the stack - confirm the target stack is large enough.
+ */
+void TestOneType_u8()
+{
+    const uint8_t seed = (uint8_t)(init_value_8);
+    uint8_t data[SIZE];
+    uint8_t dataB[SIZE];
+
+    fill_u8(&data[0], &data[0] + SIZE, seed);
+    fill_u8(&dataB[0], &dataB[0] + SIZE, seed);
+
+    test_inner_product_u8( data, dataB, SIZE, "uint_8 inner_product1 to uint_8");
+}
+
+
+/*
+ * Run the int16_t inner-product test: build two SIZE-element arrays, set every
+ * element to init_value_16 and pass them to test_inner_product_16().
+ *
+ * NOTE(review): SIZE is a `const int`, so in C both arrays are variable-length
+ * arrays on the stack - confirm the target stack is large enough.
+ */
+void TestOneType_16()
+{
+    const int16_t seed = (int16_t)(init_value_16);
+    int16_t data[SIZE];
+    int16_t dataB[SIZE];
+
+    fill_16(&data[0], &data[0] + SIZE, seed);
+    fill_16(&dataB[0], &dataB[0] + SIZE, seed);
+
+    test_inner_product_16( data, dataB, SIZE, "int_16 inner_product1 to int_16");
+}
+
+/*
+ * Run the uint16_t inner-product test: build two SIZE-element arrays, set every
+ * element to init_value_16 and pass them to test_inner_product_u16().
+ *
+ * NOTE(review): SIZE is a `const int`, so in C both arrays are variable-length
+ * arrays on the stack - confirm the target stack is large enough.
+ */
+void TestOneType_u16()
+{
+    const uint16_t seed = (uint16_t)(init_value_16);
+    uint16_t data[SIZE];
+    uint16_t dataB[SIZE];
+
+    fill_u16(&data[0], &data[0] + SIZE, seed);
+    fill_u16(&dataB[0], &dataB[0] + SIZE, seed);
+
+    test_inner_product_u16( data, dataB, SIZE, "uint_16 inner_product1 to uint_16");
+}
+
+/*
+ * Run the int32_t inner-product test: build two SIZE-element arrays, set every
+ * element to init_value_32 and pass them to test_inner_product_32().
+ *
+ * NOTE(review): SIZE is a `const int`, so in C both arrays are variable-length
+ * arrays on the stack - confirm the target stack is large enough.
+ */
+void TestOneType_32()
+{
+    const int32_t seed = (int32_t)(init_value_32);
+    int32_t data[SIZE];
+    int32_t dataB[SIZE];
+
+    fill_32(&data[0], &data[0] + SIZE, seed);
+    fill_32(&dataB[0], &dataB[0] + SIZE, seed);
+
+    test_inner_product_32( data, dataB, SIZE, "int_32 inner_product1 to int_32");
+}
+
+/*
+ * Run the uint32_t inner-product test: build two SIZE-element arrays, set every
+ * element to init_value_32 and pass them to test_inner_product_u32().
+ *
+ * NOTE(review): SIZE is a `const int`, so in C both arrays are variable-length
+ * arrays on the stack - confirm the target stack is large enough.
+ */
+void TestOneType_u32()
+{
+    const uint32_t seed = (uint32_t)(init_value_32);
+    uint32_t data[SIZE];
+    uint32_t dataB[SIZE];
+
+    fill_u32(&data[0], &data[0] + SIZE, seed);
+    fill_u32(&dataB[0], &dataB[0] + SIZE, seed);
+
+    test_inner_product_u32( data, dataB, SIZE, "uint_32 inner_product1 to uint_32");
+}
+
+/*
+ * Run the float inner-product test: build two SIZE-element arrays, set every
+ * element to init_value_f16 and pass them to test_inner_product_f16().
+ *
+ * NOTE(review): SIZE is a `const int`, so in C both arrays are variable-length
+ * arrays on the stack - confirm the target stack is large enough.
+ */
+void TestOneType_f16()
+{
+    const float seed = (float)(init_value_f16);
+    float data[SIZE];
+    float dataB[SIZE];
+
+    fill_f16(&data[0], &data[0] + SIZE, seed);
+    fill_f16(&dataB[0], &dataB[0] + SIZE, seed);
+
+    test_inner_product_f16( data, dataB, SIZE, "float inner_product1 to float");
+}
+
+/*
+ * Run the double inner-product test: build two SIZE-element arrays, set every
+ * element to init_value_f32 and pass them to test_inner_product_f32().
+ *
+ * NOTE(review): SIZE is a `const int`, so in C both arrays are variable-length
+ * arrays on the stack (2 x SIZE doubles) - confirm the target stack is large enough.
+ */
+void TestOneType_f32()
+{
+    const double seed = (double)(init_value_f32);
+    double data[SIZE];
+    double dataB[SIZE];
+
+    fill_f32(&data[0], &data[0] + SIZE, seed);
+    fill_f32(&dataB[0], &dataB[0] + SIZE, seed);
+
+    test_inner_product_f32( data, dataB, SIZE, "double inner_product1 to double");
+}
+/******************************************************************************/
+/******************************************************************************/
+
+/*
+ * Program entry: optionally take the iteration count from argv[1], then run
+ * the inner-product test once for every element type.
+ *
+ *   argv[1] (optional) - positive iteration count; anything else is ignored
+ *
+ * Always returns 0; test failures are only reported on the console.
+ */
+int main(int argc, char** argv) {
+
+    if (argc > 1) {
+        // atoi() yields 0 for non-numeric text. A zero or negative count would
+        // skip every timed loop together with its result check, so keep the
+        // built-in default in that case.
+        const int requested = atoi(argv[1]);
+        if (requested > 0) iterations = requested;
+    }
+
+    TestOneType_8();
+    TestOneType_u8();
+    TestOneType_16();
+    TestOneType_u16();
+    TestOneType_32();
+    TestOneType_u32();
+
+    TestOneType_f16();
+    TestOneType_f32();
+
+    return 0;
+}
+
+// the end
+/******************************************************************************/
+/******************************************************************************/
--- a/sdk/software/examples/c_prg/lookup_table/Makefile
+++ b/sdk/software/examples/c_prg/lookup_table/Makefile
@@ -0,0 +1,14 @@
+# Build settings for the lookup_table example; the shared rules are pulled in from bsp/common.mk
+TARGET = lookup_table
+
+CFLAGS += -O3 -g
+
+# SIMU selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
+CFLAGS += -DSIMU=0
+
+C_SRCS := $(wildcard ./*.c )
+
+OBJDIR = obj
+COMMON_DIR = ../../../bsp
+GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
+PICOLIBC_DIR=../../../../toolchains/picolibc
+include ../../../bsp/common.mk
--- a/sdk/software/examples/c_prg/lookup_table/lookup_table.c
+++ b/sdk/software/examples/c_prg/lookup_table/lookup_table.c
@@ -0,0 +1,315 @@
+/*
+    Copyright 2008-2009 Adobe Systems Incorporated
+    Copyright 2018 Chris Cox
+    Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
+    or a copy at http://stlab.adobe.com/licenses.html )
+
+
+Goal: Test performance of various idioms and optimizations for lookup tables.
+
+
+Assumptions:
+    1) The compiler will optimize lookup table operations.
+        Unrolling will usually be needed to hide read latencies.
+
+    2) The compiler should recognize inefficient lookup table idioms and substitute efficient methods.
+        Many different CPU architecture issues will require reading and writing words for best performance.
+            CPUs with...
+                    cache write-back/write-combine delays.
+                    store forwarding delays.
+                    slow cache access relative to shifts/masks.
+                    slow partial word (byte) access.
+                    fast shift/mask operations.
+        On some CPUs, a lookup can be handled with vector instructions.
+        On some CPUs, special cache handling is needed (especially 2way caches).
+
+
+
+
+TODO - lookup and interpolate (int16_t, int32_t, int64_t, float, double)
+TODO - 2D and 3D LUTs, simple and interpolated
+
+*/
+
+/******************************************************************************/
+
+#include <time.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+/******************************************************************************/
+/******************************************************************************/
+// Global variables required by the BSP (board support package)
+unsigned long UART_BASE = 0xbfe001e0;					// virtual address of the UART16550
+unsigned long CONFREG_UART_BASE = 0xbfafff10;			// virtual address of the CONFREG-emulated UART
+unsigned long CONFREG_TIMER_BASE = 0xbfafe000;			// virtual address of the CONFREG counter
+unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L;		// CONFREG clock frequency
+unsigned long CORE_CLOCKS_PER_SEC = 33000000L;			// processor core clock frequency
+
+// timestamps taken with clock() immediately before and after each timed test
+clock_t start_time, end_time;
+
+// this constant may need to be adjusted to give reasonable minimum times
+// For best results, times should be about 1.0 seconds for the minimum test run
+// base_iterations is the user-facing setting; main derives iterations from it per test
+int base_iterations = 1;
+int iterations = 1;
+
+// 2000 items, or about 2..4k of data
+// this is intended to remain within the L1 cache of most common CPUs
+#define SIZE_SMALL 2000
+
+// originally about 0.5..1M of data;
+// there is not that much memory here, so this uses 50KB to 100KB instead
+// this is intended to be outside the L2 cache of most common CPUs
+#define SIZE 50000
+
+// initial value for filling our arrays, may be changed from the command line
+int32_t init_value = 3;
+
+/******************************************************************************/
+
+// our global arrays of numbers
+
+uint8_t inputData8[SIZE];
+uint8_t resultData8[SIZE];
+
+uint16_t inputData16[SIZE];
+uint16_t resultData16[SIZE];
+
+/******************************************************************************/
+/******************************************************************************/
+
+
+/* Store `value` into every uint8_t element of the half-open range [first, last). */
+void fill_8(uint8_t * first, uint8_t * last, uint8_t value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/* Store `value` into every uint16_t element of the half-open range [first, last). */
+void fill_16(uint16_t * first, uint16_t * last, uint16_t value) {
+    for (; first != last; ++first) {
+        *first = value;
+    }
+}
+
+/*
+ * Reseed rand() with init_value + 123, then fill [first, last) with
+ * pseudo-random bytes (each rand() result truncated to uint8_t).
+ */
+void fill_random_8(uint8_t * first, uint8_t * last) {
+    srand((unsigned int)init_value + 123 );
+    for (; first != last; ++first) {
+        *first = (uint8_t)rand();
+    }
+}
+
+/*
+ * Reseed rand() with init_value + 123, then fill [first, last) with
+ * pseudo-random values (each rand() result truncated to uint16_t).
+ */
+void fill_random_16(uint16_t * first, uint16_t * last) {
+    srand((unsigned int)init_value + 123 );
+    for (; first != last; ++first) {
+        *first = (uint16_t)rand();
+    }
+}
+
+/* Return the larger of the two ints (b when they are equal). */
+int max(int a, int b){
+    return (a > b) ? a : b;
+}
+
+/******************************************************************************/
+/******************************************************************************/
+
+
+
+// baseline - a trivial loop
+
+/*
+ * Baseline lookup-table test for uint8_t data: result[k] = LUT[input[k]],
+ * repeated `iterations` times and timed with clock().
+ *
+ *   input  - `count` table indices
+ *   result - receives `count` looked-up values; callers may pass the same
+ *            buffer as `input` (in-place variant)
+ *   count  - number of elements to process
+ *   LUT    - table indexed by every possible uint8_t value
+ *   label  - test name used in the failure and timing messages
+ *
+ * Afterwards every result must equal (uint8_t)init_value; only the first
+ * mismatch is reported.
+ */
+void test_lut1_u8(const uint8_t* input, uint8_t *result, const int count, const uint8_t* LUT, const char *label) {
+    int pass;
+    int idx;
+
+    start_time = clock();
+
+    for (pass = 0; pass < iterations; ++pass) {
+        for (idx = 0; idx < count; ++idx) {
+            result[idx] = LUT[ input[idx] ];
+        }
+    }
+
+    end_time = clock();
+
+    // verification happens outside the timed region
+    idx = 0;
+    while (idx < count) {
+        if (result[idx] != (uint8_t)(init_value)) {
+            printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[idx]), (unsigned)(init_value));
+            break;
+        }
+        ++idx;
+    }
+
+    const clock_t elapsed = end_time - start_time;
+    double time_cost = elapsed / (double)(CLOCKS_PER_SEC);
+    // NOTE(review): the message says "times" but the number printed is the element count
+    printf("\"%s, %d times\"  %f sec\n", label, count, time_cost);
+
+}
+
+/*
+ * Baseline lookup-table test for int8_t data: result[k] = LUT[input[k]],
+ * repeated `iterations` times and timed with clock().
+ *
+ *   input  - `count` table indices (may be negative)
+ *   result - receives `count` looked-up values; callers may pass the same
+ *            buffer as `input` (in-place variant)
+ *   count  - number of elements to process
+ *   LUT    - table indexed by the signed input value, so it must be valid for
+ *            negative indices too (main passes a pointer to the table middle)
+ *   label  - test name used in the failure and timing messages
+ *
+ * Afterwards every result must equal (int8_t)init_value; only the first
+ * mismatch is reported.
+ */
+void test_lut1_8(const int8_t* input, int8_t *result, const int count, const int8_t* LUT, const char *label) {
+    int pass;
+    int idx;
+
+    start_time = clock();
+
+    for (pass = 0; pass < iterations; ++pass) {
+        for (idx = 0; idx < count; ++idx) {
+            result[idx] = LUT[ input[idx] ];
+        }
+    }
+
+    end_time = clock();
+
+    // verification happens outside the timed region
+    idx = 0;
+    while (idx < count) {
+        if (result[idx] != (int8_t)(init_value)) {
+            printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[idx]), (unsigned)(init_value));
+            break;
+        }
+        ++idx;
+    }
+
+    const clock_t elapsed = end_time - start_time;
+    double time_cost = elapsed / (double)(CLOCKS_PER_SEC);
+    // NOTE(review): the message says "times" but the number printed is the element count
+    printf("\"%s, %d times\"  %f sec\n", label, count, time_cost);
+
+}
+
+/*
+ * Baseline lookup-table test for uint16_t data: result[k] = LUT[input[k]],
+ * repeated `iterations` times and timed with clock().
+ *
+ *   input  - `count` table indices
+ *   result - receives `count` looked-up values; callers may pass the same
+ *            buffer as `input` (in-place variant)
+ *   count  - number of elements to process
+ *   LUT    - table indexed by every possible uint16_t value
+ *   label  - test name used in the failure and timing messages
+ *
+ * Afterwards every result must equal (uint16_t)init_value; only the first
+ * mismatch is reported.
+ */
+void test_lut1_u16(const uint16_t* input, uint16_t *result, const int count, const uint16_t* LUT, const char *label) {
+    int pass;
+    int idx;
+
+    start_time = clock();
+
+    for (pass = 0; pass < iterations; ++pass) {
+        for (idx = 0; idx < count; ++idx) {
+            result[idx] = LUT[ input[idx] ];
+        }
+    }
+
+    end_time = clock();
+
+    // verification happens outside the timed region
+    idx = 0;
+    while (idx < count) {
+        if (result[idx] != (uint16_t)(init_value)) {
+            printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[idx]), (unsigned)(init_value));
+            break;
+        }
+        ++idx;
+    }
+
+    const clock_t elapsed = end_time - start_time;
+    double time_cost = elapsed / (double)(CLOCKS_PER_SEC);
+    // NOTE(review): the message says "times" but the number printed is the element count
+    printf("\"%s, %d times\"  %f sec\n", label, count, time_cost);
+
+}
+
+/*
+ * Baseline lookup-table test for int16_t data: result[k] = LUT[input[k]],
+ * repeated `iterations` times and timed with clock().
+ *
+ *   input  - `count` table indices (may be negative)
+ *   result - receives `count` looked-up values; callers may pass the same
+ *            buffer as `input` (in-place variant)
+ *   count  - number of elements to process
+ *   LUT    - table indexed by the signed input value, so it must be valid for
+ *            negative indices too (main passes a pointer to the table middle)
+ *   label  - test name used in the failure and timing messages
+ *
+ * Afterwards every result must equal (int16_t)init_value; only the first
+ * mismatch is reported.
+ */
+void test_lut1_16(const int16_t* input, int16_t *result, const int count, const int16_t* LUT, const char *label) {
+    int pass;
+    int idx;
+
+    start_time = clock();
+
+    for (pass = 0; pass < iterations; ++pass) {
+        for (idx = 0; idx < count; ++idx) {
+            result[idx] = LUT[ input[idx] ];
+        }
+    }
+
+    end_time = clock();
+
+    // verification happens outside the timed region
+    idx = 0;
+    while (idx < count) {
+        if (result[idx] != (int16_t)(init_value)) {
+            printf("test %s failed (got %u, expected %u)\n", label, (unsigned)(result[idx]), (unsigned)(init_value));
+            break;
+        }
+        ++idx;
+    }
+
+    const clock_t elapsed = end_time - start_time;
+    double time_cost = elapsed / (double)(CLOCKS_PER_SEC);
+    // NOTE(review): the message says "times" but the number printed is the element count
+    printf("\"%s, %d times\"  %f sec\n", label, count, time_cost);
+
+}
+
+/******************************************************************************/
+/******************************************************************************/
+
+/*
+ * Program entry: build constant lookup tables, fill the input buffers with
+ * pseudo-random indices, then run the small and large variants of every
+ * lookup test, in place and out of place.
+ *
+ *   argv[1] (optional) - positive base iteration count; anything else is ignored
+ *   argv[2] (optional) - value stored in every table entry (init_value)
+ *
+ * Always returns 0; test failures are only reported on the console.
+ */
+int main(int argc, char** argv) {
+
+    if (argc > 1) {
+        // atoi() yields 0 for non-numeric text; a non-positive count would
+        // skip the timed loops, so keep the built-in default in that case
+        const int requested = atoi(argv[1]);
+        if (requested > 0) base_iterations = requested;
+    }
+    if (argc > 2) init_value = (int32_t) atoi(argv[2]);
+
+    // Static storage: the 16-bit table alone is 128 KB, which is too much to
+    // place on the stack of a memory-constrained target.
+    static uint8_t myLUT8[ 256 ];
+    static uint16_t myLUT16[ 65536 ];
+
+
+    fill_8(myLUT8, myLUT8+256, (uint8_t)(init_value));
+    fill_16(myLUT16, myLUT16+65536, (uint16_t)(init_value));
+
+    fill_random_8( inputData8, inputData8+SIZE );
+    fill_random_16( inputData16, inputData16+SIZE );
+
+
+// uint8_t
+    iterations = base_iterations;
+
+    test_lut1_u8( inputData8, inputData8, SIZE_SMALL, myLUT8, "uint8_t lookup table1 small inplace");
+    test_lut1_u8( inputData8, resultData8, SIZE_SMALL, myLUT8, "uint8_t lookup table1 small");
+
+    // scale the pass count down so the large runs do about as much work as the small ones
+    iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
+
+    test_lut1_u8( inputData8, inputData8, SIZE, myLUT8, "uint8_t lookup table1 large inplace");
+    test_lut1_u8( inputData8, resultData8, SIZE, myLUT8, "uint8_t lookup table1 large");
+
+
+
+// int8_t
+    // the table pointer is offset to the middle so negative indices stay in range
+    iterations = base_iterations;
+
+    test_lut1_8( (int8_t*)inputData8, (int8_t*)inputData8, SIZE_SMALL, (int8_t*)(myLUT8+128), "int8_t lookup table1 small inplace");
+    test_lut1_8( (int8_t*)inputData8, (int8_t*)resultData8, SIZE_SMALL, (int8_t*)(myLUT8+128), "int8_t lookup table1 small");
+
+    iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
+
+    test_lut1_8( (int8_t*)inputData8, (int8_t*)inputData8, SIZE, (int8_t*)(myLUT8+128), "int8_t lookup table1 large inplace");
+    test_lut1_8( (int8_t*)inputData8, (int8_t*)resultData8, SIZE, (int8_t*)(myLUT8+128), "int8_t lookup table1 large");
+
+
+// uint16_t
+    iterations = base_iterations;
+
+    test_lut1_u16( inputData16, inputData16, SIZE_SMALL, myLUT16, "uint16_t lookup table1 small inplace");
+    test_lut1_u16( inputData16, resultData16, SIZE_SMALL, myLUT16, "uint16_t lookup table1 small");
+
+    iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
+
+    test_lut1_u16( inputData16, inputData16, SIZE, myLUT16, "uint16_t lookup table1 large inplace");
+    test_lut1_u16( inputData16, resultData16, SIZE, myLUT16, "uint16_t lookup table1 large");
+
+// int16_t
+    // the table pointer is offset to the middle so negative indices stay in range
+    iterations = base_iterations;
+
+    test_lut1_16( (int16_t*)inputData16, (int16_t*)inputData16, SIZE_SMALL, (int16_t*)(myLUT16+32768), "int16_t lookup table1 small inplace");
+    test_lut1_16( (int16_t*)inputData16, (int16_t*)resultData16, SIZE_SMALL, (int16_t*)(myLUT16+32768), "int16_t lookup table1 small");
+
+    iterations = max( 1, (int)(((uint64_t)base_iterations * SIZE_SMALL) / SIZE) );
+
+    test_lut1_16( (int16_t*)inputData16, (int16_t*)inputData16, SIZE, (int16_t*)(myLUT16+32768), "int16_t lookup table1 large inplace");
+    test_lut1_16( (int16_t*)inputData16, (int16_t*)resultData16, SIZE, (int16_t*)(myLUT16+32768), "int16_t lookup table1 large");
+
+    return 0;
+}
+
+// the end
+/******************************************************************************/
+/******************************************************************************/
--- a/sdk/software/examples/c_prg/loop_induction/Makefile
+++ b/sdk/software/examples/c_prg/loop_induction/Makefile
@@ -0,0 +1,14 @@
+# Build settings for the loop_induction example; the shared rules are pulled in from bsp/common.mk
+TARGET = loop_induction
+
+CFLAGS += -O3 -g
+
+# SIMU selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
+CFLAGS += -DSIMU=0
+
+C_SRCS := $(wildcard ./*.c )
+
+OBJDIR = obj
+COMMON_DIR = ../../../bsp
+GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
+PICOLIBC_DIR=../../../../toolchains/picolibc
+include ../../../bsp/common.mk
--- a/sdk/software/examples/c_prg/loop_induction/loop_induction.c
+++ b/sdk/software/examples/c_prg/loop_induction/loop_induction.c
@@ -0,0 +1,131 @@
+/*
+    Copyright 2018 Chris Cox
+    Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
+    or a copy at http://stlab.adobe.com/licenses.html )
+
+
+Goal:  Examine performance optimizations related to loop induction variables.
+
+
+Assumptions:
+    1) The compiler will normalize all loop types and optimize all equally.
+        (this is a necessary step before doing induction variable analysis)
+        
+    2) The compiler will remove unused induction variables.
+        This could happen due to several optimizations.
+
+    3) The compiler will recognize induction variables with linear relations (x = a*b + c)
+        and optimize out redundant variables.
+
+    4) The compiler will apply strength reduction to induction variable usage.
+
+    5) The compiler will remove bounds checks by recognizing or adjusting loop limits.
+        (can be an explicit loop optimization, or part of range propagation)
+
+
+*/
+
+#include <time.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+
+/******************************************************************************/
+// Global variables required by the BSP (board support package)
+unsigned long UART_BASE = 0xbfe001e0;					// virtual address of the UART16550
+unsigned long CONFREG_UART_BASE = 0xbfafff10;			// virtual address of the CONFREG-emulated UART
+unsigned long CONFREG_TIMER_BASE = 0xbfafe000;			// virtual address of the CONFREG counter
+unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L;		// CONFREG clock frequency
+unsigned long CORE_CLOCKS_PER_SEC = 33000000L;			// processor core clock frequency
+
+// timestamps taken with clock() immediately before and after the timed test
+clock_t start_time, end_time;
+
+/******************************************************************************/
+
+// this constant may need to be adjusted to give reasonable minimum times
+// For best results, times should be about 1.0 seconds for the minimum test run
+int iterations = 10;
+
+
+// 32000 items, or about 128k of data
+// this is intended to remain within the L2 cache of most common CPUs
+const int SIZE = 32000;
+
+
+// seed offset for the random fill (main seeds rand() with init_value + 123),
+// may be changed from the command line
+int init_value = 3;
+
+/******************************************************************************/
+
+/* Fill the half-open range [first, last) with successive rand() values.
+   The caller is responsible for seeding rand() beforehand. */
+void fill_random(int32_t * first, int32_t * last) {
+    for (; first != last; ++first) {
+        *first = (int32_t)rand();
+    }
+}
+
+/******************************************************************************/
+/******************************************************************************/
+
+
+/*
+ * Time `iterations` passes of an element-by-element copy whose loop carries
+ * three lock-step counters, then verify the copy and print the elapsed time.
+ *
+ *   source - `count` values to copy from
+ *   dest   - receives the copy; overwritten with rand() data first
+ *   count  - number of int32_t elements
+ *   label  - test name used in the failure and timing messages
+ */
+void test_copy(const int32_t *source, int32_t *dest, int count, const char *label) {
+    int pass;
+
+    // overwrite dest with random values before timing (presumably so the
+    // final memcmp cannot pass on stale data - confirm)
+    fill_random( dest, dest+count );
+
+    start_time = clock();
+
+    pass = 0;
+    while (pass < iterations) {
+        // Three separate counters are kept on purpose: the file header states
+        // that handling of redundant induction variables is what is examined.
+        int dst_idx = 0;
+        int src_idx = 0;
+        int done = 0;
+        while (done < count) {
+            dest[dst_idx] = source[src_idx];
+            ++dst_idx;
+            ++src_idx;
+            ++done;
+        }
+        ++pass;
+    }
+
+    end_time = clock();
+
+    const int mismatch = memcmp(dest, source, count*sizeof(int32_t));
+    if ( mismatch != 0 )
+        printf("test %s failed\n", label);
+
+    const clock_t elapsed = end_time - start_time;
+    double time_cost = elapsed / (double)(CLOCKS_PER_SEC);
+
+    printf("\"%s, %d items\"  %f sec\n", label, count, time_cost);
+}
+
+/******************************************************************************/
+/******************************************************************************/
+
+/*
+ * Program entry: fill a source array with seeded pseudo-random values and run
+ * the induction-variable copy test on it.
+ *
+ *   argv[1] (optional) - positive iteration count; anything else is ignored
+ *   argv[2] (optional) - init_value, added to 123 to form the rand() seed
+ *
+ * Always returns 0; test failures are only reported on the console.
+ */
+int main(int argc, char** argv) {
+
+    if (argc > 1) {
+        // atoi() yields 0 for non-numeric text. With a non-positive count the
+        // copy loop never runs and test_copy reports a failure, so keep the
+        // built-in default in that case.
+        const int requested = atoi(argv[1]);
+        if (requested > 0) iterations = requested;
+    }
+    if (argc > 2) init_value = (int) atoi(argv[2]);
+
+    // NOTE(review): SIZE is a `const int`, so in C these are variable-length
+    // arrays on the stack (2 x SIZE int32_t) - confirm the target stack is
+    // large enough.
+    int32_t intSrc[ SIZE ];
+    int32_t intDst[ SIZE ];
+
+
+    srand( (unsigned int)init_value + 123);
+    fill_random( intSrc, intSrc+SIZE );
+
+
+    test_copy( &intSrc[0], &intDst[0], SIZE, "int32_t for induction copy" );
+
+
+    return 0;
+}
+
+// the end
+/******************************************************************************/
+/******************************************************************************/
--- a/sdk/software/examples/c_prg/memcmp/Makefile
+++ b/sdk/software/examples/c_prg/memcmp/Makefile
@@ -0,0 +1,14 @@
+# Build settings for the memcmp example; the shared rules are pulled in from bsp/common.mk
+TARGET = memcmp
+
+CFLAGS += -O3 -g
+
+# SIMU selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation
+CFLAGS += -DSIMU=0
+
+C_SRCS := $(wildcard ./*.c )
+
+OBJDIR = obj
+COMMON_DIR = ../../../bsp
+GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
+PICOLIBC_DIR=../../../../toolchains/picolibc
+include ../../../bsp/common.mk
--- a/sdk/software/examples/c_prg/memcmp/memcmp.c
+++ b/sdk/software/examples/c_prg/memcmp/memcmp.c
@@ -0,0 +1,177 @@
+/*
+    Copyright 2008-2009 Adobe Systems Incorporated
+    Copyright 2018 Chris Cox
+    Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
+    or a copy at http://stlab.adobe.com/licenses.html )
+
+
+Goal:  Test compiler optimizations related to memcmp and hand coded memcmp loops.
+
+
+Assumptions:
+
+    1) The compiler will recognize memcmp like loops and optimize appropriately.
+        This could be substitution of calls to memcmp,
+         or it could be just optimizing the loop to get the best throughput.
+        On modern systems, cache hinting is usually required for best throughput.
+
+    2) The library function memcmp should be optimized for small, medium, and large buffers.
+        ie: low overhead for smaller buffer, highly hinted for large buffers.
+
+    3) The STL functions equal and mismatch should be optimized for small, medium, and large buffers.
+        ie: low overhead for smaller buffers, highly hinted for large buffers.
+
+
+
+
+NOTE - on some OSes, memcmp calls into the VM system to test for shared pages
+        thus running faster than the DRAM bandwidth would allow on large arrays
+        
+        However, on those OSes, calling memcmp can hit mutexes and slow down
+        significantly when called from threads.
+
+
+NOTE - Linux memcmp returns 0, +-1 instead of the actual difference
+NOTE - and sometimes Linux memcmp returns 0, +-256 instead of the actual difference
+
+
+TODO - test performance of unaligned buffers
+*/
+
+#include <time.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+/******************************************************************************/
+// Global variables required by the BSP (board support package)
+unsigned long UART_BASE = 0xbfe001e0;					// virtual address of the UART16550
+unsigned long CONFREG_UART_BASE = 0xbfafff10;			// virtual address of the CONFREG simulated UART
+unsigned long CONFREG_TIMER_BASE = 0xbfafe000;			// virtual address of the CONFREG counter
+unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L;		// CONFREG clock frequency
+unsigned long CORE_CLOCKS_PER_SEC = 33000000L;			// processor core clock frequency
+
+// clock() readings taken immediately before and after the timed loop of a test
+clock_t start_time, end_time;
+
+// this constant may need to be adjusted to give reasonable minimum times
+// For best results, times should be about 1.0 seconds for the minimum test run
+int iterations = 1;
+
+
+// 64 Megabytes, intended to be larger than L2 cache on common CPUs
+// needs to be divisible by 8
+// This target does not have that much memory, so the large buffer is 30 KB instead.
+// NOTE: SIZE_3M keeps its name but is 30720 bytes here; the commented-out value is 3 MiB.
+#define SIZE_4K  4096
+// #define SIZE_3M  3145728
+#define SIZE_3M  30720
+
+// initial value for filling our arrays, may be changed from the command line
+uint8_t init_value = 3;
+
+/******************************************************************************/
+/******************************************************************************/
+
+/* Store `value` into every byte of the half-open range [first, last). */
+void fill(uint8_t * first, uint8_t * last, uint8_t value) {
+    uint8_t *cursor;
+
+    for (cursor = first; cursor != last; ++cursor) {
+        *cursor = value;
+    }
+}
+
+
+/*
+ * Byte-wise comparison of two buffers using a plain for loop (a hand-written
+ * memcmp equivalent).
+ *
+ * first, second: buffers to compare; each must be readable for `bytes` bytes.
+ * bytes:         number of bytes to compare (0 compares nothing).
+ *
+ * Returns 0 when the first `bytes` bytes are equal; otherwise the difference
+ * first[x] - second[x] of the first mismatching pair, with both bytes taken
+ * as unsigned values.
+ */
+int forloop_memcmp( const void *first, const void *second, size_t bytes ){
+    const uint8_t *first_byte = (const uint8_t *)first;
+    const uint8_t *second_byte = (const uint8_t *)second;
+    size_t x;   // same type as `bytes`: no signed/unsigned comparison, no int overflow
+
+    for (x = 0; x < bytes; ++x) {
+        if (first_byte[x] != second_byte[x]) {
+            return (first_byte[x] - second_byte[x]);
+        }
+    }
+
+    return 0;
+}
+
+
+/******************************************************************************/
+/******************************************************************************/
+
+
+/*
+ * Run the for-loop comparison `iterations` times over the first `count` bytes
+ * of the two buffers, recording clock() before and after in the globals
+ * start_time / end_time.
+ *
+ * expected_result: true when the buffers are expected to differ.  A pass whose
+ * outcome disagrees with it prints a failure line.
+ */
+void test_memcmp(const uint8_t *first, const uint8_t *second, int count, bool expected_result) {
+    const int byte_count = count * sizeof(uint8_t);
+    int pass = 0;
+
+    start_time = clock();
+
+    while (pass < iterations) {
+        // Reduce to "equal vs. different": the magnitude of the returned
+        // difference is not checked.
+        const bool differs = (forloop_memcmp( first, second, byte_count ) != 0);
+
+        // The check stays inside the loop; hoisting it lets the optimizer
+        // remove too much of the work being measured.
+        if ( differs != expected_result ) {
+            printf("test %s by %d failed (got %d instead of %d)\n", "for loop compare", count, (int)differs, (int)expected_result );
+        }
+
+        ++pass;
+    }
+
+    end_time = clock();
+}
+
+/******************************************************************************/
+
+/*
+ * Run test_memcmp() on the first `max_count` bytes of the two buffers and
+ * print one summary line with the byte count, the expected comparison outcome
+ * and the elapsed time in seconds.
+ *
+ * result: expected "buffers differ" flag, forwarded to test_memcmp().  The
+ * summary prints "false" when a difference is expected and "true" otherwise.
+ */
+void test_memcmp_sizes(const uint8_t *first, const uint8_t *second, int max_count, bool result) {
+    const int byte_total = max_count * sizeof(uint8_t);
+    const char *outcome = "true";
+    double time_cost;
+
+    if (result) {
+        outcome = "false";
+    }
+
+    test_memcmp( first, second, max_count, result);
+
+    // start_time / end_time were written by test_memcmp() just above.
+    time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+
+    printf("\"%s %d bytes\"  compare result: %s  %f sec\n",
+            "for loop compare", byte_total, outcome, time_cost);
+}
+
+/******************************************************************************/
+/******************************************************************************/
+
+// our global arrays of numbers to be operated upon
+
+// First comparison buffer: SIZE_3M bytes.
+uint8_t data8u[SIZE_3M/sizeof(uint8_t)];
+// Number of extra bytes at the end of data8u_dest; it must match the literal
+// 1024 in the declaration below, because main() fills SIZE_3M + alignment_pad bytes.
+int alignment_pad = 1024;
+uint8_t data8u_dest[SIZE_3M/sizeof(uint8_t) + 1024]; // leave some room for alignment testing
+
+/******************************************************************************/
+/******************************************************************************/
+
+
+/*
+ * Benchmark entry point.
+ *
+ * argv[1] (optional): overrides the global `iterations`.
+ * argv[2] (optional): overrides the global `init_value` (a uint8_t, so the
+ *                     parsed number is reduced modulo 256).
+ *
+ * Fills both global buffers with init_value, then for each buffer size runs
+ * the comparison once on identical data and once after changing the last
+ * byte of that size in data8u.  Always returns 0.
+ */
+int main(int argc, char** argv) {
+
+    if (argc > 1) iterations = atoi(argv[1]);
+    if (argc > 2) init_value = (int32_t) atoi(argv[2]);
+
+    // Start with both buffers (including the padding of data8u_dest) holding init_value.
+    fill( data8u, data8u+(SIZE_3M/sizeof(uint8_t)), (uint8_t)(init_value) );
+    fill( data8u_dest, data8u_dest+(SIZE_3M/sizeof(uint8_t) + alignment_pad), (uint8_t)(init_value) );
+
+    // Large buffer: identical contents first, then differing only in the last byte.
+    test_memcmp_sizes( data8u, data8u_dest, SIZE_3M/sizeof(uint8_t), false);
+    data8u[(SIZE_3M/sizeof(uint8_t))-1] += 1;    // last byte in the array
+    test_memcmp_sizes( data8u, data8u_dest, SIZE_3M/sizeof(uint8_t), true);
+
+    // 4 KB prefix: still identical at first, because the byte changed above
+    // lies beyond the first SIZE_4K bytes; then change the last byte of the prefix.
+    test_memcmp_sizes( data8u, data8u_dest, SIZE_4K/sizeof(uint8_t), false);
+    data8u[(SIZE_4K/sizeof(uint8_t))-1] += 1;    // last byte of the 4 KB prefix
+    test_memcmp_sizes( data8u, data8u_dest, SIZE_4K/sizeof(uint8_t), true);
+
+    return 0;
+}
+
+// the end
+/******************************************************************************/
+/******************************************************************************/
--- a/sdk/software/examples/c_prg/minmax_sequence/Makefile
+++ b/sdk/software/examples/c_prg/minmax_sequence/Makefile
@@ -0,0 +1,14 @@
+# Name of the example program built by this Makefile.
+TARGET = minmax_sequence
+
+# Optimization level 3, with debug information.
+CFLAGS += -O3 -g
+
+# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation.
+CFLAGS += -DSIMU=0
+
+# Collect every .c file in this directory as a source.
+C_SRCS := $(wildcard ./*.c )
+
+# Paths and settings below are presumably consumed by the shared bsp/common.mk
+# included on the last line (its contents are not shown here).
+OBJDIR = obj
+COMMON_DIR = ../../../bsp
+GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
+PICOLIBC_DIR=../../../../toolchains/picolibc
+include ../../../bsp/common.mk
--- a/sdk/software/examples/c_prg/minmax_sequence/minmax_sequence.c
+++ b/sdk/software/examples/c_prg/minmax_sequence/minmax_sequence.c
--- a/sdk/software/examples/c_prg/product_sequence/Makefile
+++ b/sdk/software/examples/c_prg/product_sequence/Makefile
@@ -0,0 +1,14 @@
+# Name of the example program built by this Makefile.
+TARGET = product_sequence
+
+# Optimization level 3, with debug information.
+CFLAGS += -O3 -g
+
+# The SIMU macro selects the UART baud rate: 0 = running on the FPGA board, 1 = simulation.
+CFLAGS += -DSIMU=0
+
+# Collect every .c file in this directory as a source.
+C_SRCS := $(wildcard ./*.c )
+
+# Paths and settings below are presumably consumed by the shared bsp/common.mk
+# included on the last line (its contents are not shown here).
+OBJDIR = obj
+COMMON_DIR = ../../../bsp
+GCC_DIR=../../../../toolchains/loongson-gnu-toolchain-8.3-x86_64-loongarch32r-linux-gnusf-v2.0
+PICOLIBC_DIR=../../../../toolchains/picolibc
+include ../../../bsp/common.mk
--- a/sdk/software/examples/c_prg/product_sequence/product_sequence.c
+++ b/sdk/software/examples/c_prg/product_sequence/product_sequence.c
@@ -0,0 +1,169 @@
+/*
+    Copyright 2008 Adobe Systems Incorporated
+    Copyright 2019 Chris Cox
+    Distributed under the MIT License (see accompanying file LICENSE_1_0_0.txt
+    or a copy at http://stlab.adobe.com/licenses.html )
+
+
+Goal: Test performance of various idioms for calculating the product of a sequence.
+
+
+Assumptions:
+    1) The compiler will optimize product operations.
+    
+    2) The compiler may recognize inefficient product idioms and substitute efficient methods.
+
+
+*/
+
+/******************************************************************************/
+
+#include <time.h>
+#include <stdlib.h>
+#include <math.h>
+#include <stdio.h>
+#include <string.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+
+/******************************************************************************/
+// Global variables required by the BSP (board support package)
+unsigned long UART_BASE = 0xbfe001e0;					// virtual address of the UART16550
+unsigned long CONFREG_UART_BASE = 0xbfafff10;			// virtual address of the CONFREG simulated UART
+unsigned long CONFREG_TIMER_BASE = 0xbfafe000;			// virtual address of the CONFREG counter
+unsigned long CONFREG_CLOCKS_PER_SEC = 100000000L;		// CONFREG clock frequency
+unsigned long CORE_CLOCKS_PER_SEC = 33000000L;			// processor core clock frequency
+
+// clock() readings taken immediately before and after the timed loop of a test
+clock_t start_time, end_time;
+
+/******************************************************************************/
+
+// this constant may need to be adjusted to give reasonable minimum times
+// For best results, times should be about 1.0 seconds for the minimum test run
+int iterations = 10;
+
+
+// 4000 items, or about 32k of data
+// (that figure assumes 8-byte doubles; the float array is half that size)
+// this is intended to remain within the L2 cache of most common CPUs
+const int SIZE = 4000;
+
+
+// initial value for filling our arrays, may be changed from the command line
+double init_value = 2.1;
+
+/******************************************************************************/
+/******************************************************************************/
+
+/*
+ * Store `value` into every float of the half-open range [first, last).
+ * (Despite the _f16 suffix, the element type is float.)
+ */
+void fill_f16(float * first, float * last, float value) {
+    float *cursor;
+
+    for (cursor = first; cursor != last; ++cursor) {
+        *cursor = (float)(value);
+    }
+}
+
+/*
+ * Store `value` into every double of the half-open range [first, last).
+ * (Despite the _f32 suffix, the element type is double.)
+ */
+void fill_f32(double * first, double * last, double value) {
+    double *cursor;
+
+    for (cursor = first; cursor != last; ++cursor) {
+        *cursor = (double)(value);
+    }
+}
+
+
+/*
+ * Time `iterations` passes of a plain multiply loop over `count` floats and
+ * print the elapsed time in seconds.
+ *
+ * first: array of at least `count` floats; the check below expects every
+ *        element to hold init_value.
+ * count: number of elements multiplied together in each pass.
+ * label: text printed in the result line and in any failure message.
+ *
+ * Each pass is compared against pow(init_value, count); a difference larger
+ * than 1.0e-6 prints "test <label> failed".  Writes the globals
+ * start_time / end_time.
+ */
+void testOneFunction_f16(const float* first, const int count, const char * label) {
+    int i;
+
+    // Reference value for the elements actually multiplied (count, not the
+    // global SIZE).  It is loop-invariant, so it is computed once before the
+    // timed region to keep pow() out of the measurement.
+    const double expected = pow(init_value, (double)count);
+
+    start_time = clock();
+
+    for(i = 0; i < iterations; ++i) {
+
+        float result = (float)(1);
+        for (int j = 0; j < count; ++j) {
+            result = result * first[j];
+        }
+
+        // The check stays inside the loop so `result` is used on every pass.
+        // If both values overflow to +infinity the difference is NaN, the
+        // comparison is false, and the pass counts as matching.
+        if ( fabs( result - expected ) > 1.0e-6 )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+
+    printf("\"%s, %d items\"  %f sec\n",
+        label,
+        count,
+        time_cost);
+}
+
+/*
+ * Time `iterations` passes of a plain multiply loop over `count` doubles and
+ * print the elapsed time in seconds.
+ *
+ * first: array of at least `count` doubles; the check below expects every
+ *        element to hold init_value.
+ * count: number of elements multiplied together in each pass.
+ * label: text printed in the result line and in any failure message.
+ *
+ * Each pass is compared against pow(init_value, count); a difference larger
+ * than 1.0e-6 prints "test <label> failed".  Writes the globals
+ * start_time / end_time.
+ */
+void testOneFunction_f32(const double* first, const int count, const char * label) {
+    int i;
+
+    // Reference value for the elements actually multiplied (count, not the
+    // global SIZE).  It is loop-invariant, so it is computed once before the
+    // timed region to keep pow() out of the measurement.
+    const double expected = pow(init_value, (double)count);
+
+    start_time = clock();
+
+    for(i = 0; i < iterations; ++i) {
+
+        double result = (double)(1);
+        for (int j = 0; j < count; ++j) {
+            result = result * first[j];
+        }
+
+        // The check stays inside the loop so `result` is used on every pass.
+        // If both values overflow to +infinity the difference is NaN, the
+        // comparison is false, and the pass counts as matching.
+        if ( fabs( result - expected ) > 1.0e-6 )
+            printf("test %s failed\n", label);
+    }
+
+    end_time = clock();
+
+    double time_cost = (end_time - start_time)/ (double)(CLOCKS_PER_SEC);
+
+    printf("\"%s, %d items\"  %f sec\n",
+        label,
+        count,
+        time_cost);
+}
+
+/******************************************************************************/
+/*
+ * Build a SIZE-element float array filled with init_value and run the float
+ * product benchmark on it.  Takes no arguments and returns nothing.
+ */
+void TestOneType_f16(void)
+{
+    // SIZE is a `const int` object rather than a constant expression, so in C
+    // this is a variable-length array allocated on the stack.
+    float data[SIZE];
+
+    fill_f16(data, data+SIZE, (float)(init_value));
+
+    testOneFunction_f16( data, SIZE, "float product sequence1" );
+}
+
+/*
+ * Build a SIZE-element double array filled with init_value and run the double
+ * product benchmark on it.  Takes no arguments and returns nothing.
+ */
+void TestOneType_f32(void)
+{
+    // SIZE is a `const int` object rather than a constant expression, so in C
+    // this is a variable-length array allocated on the stack.
+    double data[SIZE];
+
+    fill_f32(data, data+SIZE, (double)(init_value));
+
+    testOneFunction_f32( data, SIZE, "double product sequence1" );
+}
+
+/******************************************************************************/
+/******************************************************************************/
+
+/*
+ * Benchmark entry point.
+ *
+ * argv[1] (optional): overrides the global `iterations`.
+ * argv[2] (optional): overrides the global `init_value` used to fill the arrays.
+ *
+ * Runs the float product benchmark, then the double one.  Always returns 0.
+ */
+int main(int argc, char** argv) {
+
+    if (argc > 1) iterations = atoi(argv[1]);
+    if (argc > 2) init_value = (double) atof(argv[2]);
+
+    TestOneType_f16();
+
+    TestOneType_f32();
+
+    return 0;
+}
+
+// the end
+/******************************************************************************/
+/******************************************************************************/