Files
2026-04-12 22:20:18 +08:00

179 lines
6.8 KiB
C
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#include "AccelConvFuncs.h"
// Run a stride-1 2-D convolution on the accelerator, splitting the input into
// tiles of at most MAX_BUF_HEIGHT rows by MAX_BUF_WIDTH bytes.
//
// Layouts, as implied by the index arithmetic below:
//   d_i    : [channel][height][width] elements, addressed in bytes; every
//            element takes 4, 2 or 1 byte(s) for di_type 0b10, 0b01 or
//            anything else respectively.
//   weight : [channel][out_channel][kernel_height][kernel_width] bytes.
//   bias   : one 32-bit value per output channel.
//   d_o    : [out_channel][height_o][width_o] 32-bit words, where
//            height_o = height - kernel_height + 2 * padding + 1 and
//            width_o  = width  - kernel_width  + 2 * padding + 1.
//
// Parameters:
//   di_type       input element size selector (also written to buf_size).
//   height, width input plane size in elements.
//   padding       padding amount, forwarded to the accelerator configuration.
//   channel       number of input channels.
//   out_channel   number of output channels (processed MAX_KERNEL_NUM at a time).
//   kernel_width, kernel_height  kernel size in elements.
//   with_relu     when non-zero, the pass over the last input channel is
//                 issued with inst_conv_relu instead of inst_conv.
//
// The function returns without touching the accelerator when a dimension needs
// more than one tile but the kernel overlap leaves no room to advance.
void accel_conv_func(int8_t* d_i, int8_t* weight, int32_t* bias, int32_t* d_o, int8_t di_type,
                     int height, int width, int padding, int channel, int out_channel,
                     int kernel_width, int kernel_height, int with_relu) {
    accel_conf_t accel_conf;
    accel_conf.buf_size = di_type;
    accel_conf.weight_size = 0;
    accel_conf.padding = padding;
    accel_conf.stride = 1;
    accel_conf.kernel_height = kernel_height;
    accel_conf.kernel_width = kernel_width;

    // Bytes per input element and the matching shift (4 -> 2, 2 -> 1, 1 -> 0).
    int size_mul = di_type == 0b10 ? 4 :
                   di_type == 0b01 ? 2 : 1;
    int size_sft = size_mul >> 1;

    // Rows / bytes that consecutive tiles must share so that outputs whose
    // window straddles a tile boundary are still produced.
    int del_h = kernel_height - 1;
    int del_w = (kernel_width - 1) * size_mul;

    int width_i = width * size_mul;  // input row length in bytes
    int height_o = (height - kernel_height + padding * 2) + 1;
    int width_o = (width - kernel_width + padding * 2) + 1;

    // Every tile after the first advances by MAX_BUF_* minus the overlap. If
    // that step is not positive the tiling loops below would never finish, so
    // refuse such a configuration up front.
    if ((height > MAX_BUF_HEIGHT && del_h >= MAX_BUF_HEIGHT) ||
        (width_i > MAX_BUF_WIDTH && del_w >= MAX_BUF_WIDTH)) {
        return;
    }

    int h_append = 0;
    int res_h = 0;  // first output row produced by the current tile
    for (int h = 0; h < height; h += h_append) {
        int h_eq_0 = h == 0;
        h_append = h_eq_0 ? MAX_BUF_HEIGHT : MAX_BUF_HEIGHT - del_h;
        int h_remain = height - h;
        int buf_h_end = h_remain <= h_append;  // this is the last tile in height
        int begin_h = h_eq_0 ? 0 : h - del_h;  // re-read the overlap rows
        int buf_h;
        if (buf_h_end) {
            buf_h = h_eq_0 ? h_remain : h_remain + del_h;
        } else {
            buf_h = MAX_BUF_HEIGHT;
        }
        accel_conf.buf_height = buf_h - 1;
        // Padding only contributes output rows on the outer edges of the image.
        int res_h_append = (buf_h - kernel_height + (h_eq_0 + buf_h_end) * padding) + 1;

        int w_append = 0;
        int res_w = 0;  // first output column produced by the current tile
        for (int w = 0; w < width_i; w += w_append) {
            int w_eq_0 = w == 0;
            w_append = w_eq_0 ? MAX_BUF_WIDTH : MAX_BUF_WIDTH - del_w;
            int w_remain = width_i - w;
            int buf_w_end = w_remain <= w_append;  // this is the last tile in width
            int begin_w = w_eq_0 ? 0 : w - del_w;  // re-read the overlap bytes
            int buf_w;
            if (buf_w_end) {
                buf_w = w_eq_0 ? w_remain : w_remain + del_w;
            } else {
                buf_w = MAX_BUF_WIDTH;
            }

            // bit0: first tile in width, bit1: last tile in width,
            // bit2: first tile in height, bit3: last tile in height.
            int padding_valid = (w_eq_0) |
                                (buf_w_end << 1) |
                                (h_eq_0 << 2) |
                                (buf_h_end << 3);

            // Bytes between the end of one tile row and the start of the next.
            int buf_offset = width_i - buf_w;
            int res_w_append = ((buf_w >> size_sft) - kernel_width + (w_eq_0 + buf_w_end) * padding) + 1;
            // NOTE(review): this offset is scaled by the input element size,
            // while accel_pool_func scales its result offset by 4 bytes;
            // confirm which unit the hardware expects here.
            int res_offset = (width_o - res_w_append) << size_sft;
            // Upper half: output columns per row minus one; lower half: offset.
            int res_buf_info = ((res_w_append - 1) << 16) | res_offset;
            inst_conf_offset(buf_offset, res_buf_info);

            accel_conf.buf_width = buf_w - 1;
            accel_conf.padding_valid = padding_valid;

            for (int k = 0; k < channel; k++) {
                // buf_refresh is set for the first kernel group of every input
                // channel and cleared afterwards; presumably the input tile is
                // then reused for the remaining groups -- TODO confirm.
                accel_conf.buf_refresh = 1;
                // wadd is set for every input channel but the first; presumably
                // it makes the accelerator add onto earlier results -- TODO confirm.
                accel_conf.wadd = k != 0;
                for (int c = 0; c < out_channel; c += MAX_KERNEL_NUM) {
                    int valid_kernel_num = c + MAX_KERNEL_NUM > out_channel ? out_channel - c : MAX_KERNEL_NUM;
                    accel_conf.kernel_num = valid_kernel_num - 1;
                    inst_conf_buf((uint32_t)&d_i[(k * height + begin_h) * width_i + begin_w], accel_conf.val);
                    for (int vc = 0; vc < valid_kernel_num; vc++) {
                        int ca = c + vc;
                        inst_conf_res_addr((uint32_t)&d_o[(ca * height_o + res_h) * width_o + res_w], vc);
                        if (k == 0) {
                            // The bias is only supplied with the first input channel.
                            inst_conf_res_bias(bias[ca], vc);
                        }
                    }
                    if (k == channel - 1 && with_relu) {
                        inst_conv_relu(&weight[(k * out_channel + c) * kernel_height * kernel_width]);
                    } else {
                        inst_conv(&weight[(k * out_channel + c) * kernel_height * kernel_width]);
                    }
                    accel_conf.buf_refresh = 0;
                }
            }
            res_w += res_w_append;
        }
        res_h += res_h_append;
    }
}
// Run 2x2, stride-2 pooling on the accelerator over every channel of d_i.
//
// d_i is indexed as [channel][height][width] 32-bit words and d_o as
// [channel][height / 2][width / 2] 32-bit words. The input is processed in
// tiles of at most MAX_BUF_HEIGHT rows by MAX_BUF_WIDTH / 4 words.
//
// height and width must be multiples of 2. Each tile yields half its rows and
// half its columns (rounded down), so the tile limits presumably need to be
// even as well -- TODO confirm against the MAX_BUF_* definitions.
//
// pool_mode is forwarded unchanged to inst_pool.
void accel_pool_func(int32_t* d_i, int32_t* d_o, int pool_mode, int channel, int height, int width) {
    accel_conf_t conf;
    conf.kernel_width = 2;
    conf.kernel_height = 2;
    conf.stride = 2;
    conf.padding = 0;
    conf.padding_valid = 0;
    conf.kernel_num = 0;
    conf.buf_size = 2;
    conf.weight_size = 2;
    conf.wadd = 0;
    conf.buf_refresh = 1;

    // Widest tile, in 32-bit words, that fits one buffer row.
    const int tile_cols_max = MAX_BUF_WIDTH >> 2;
    const int out_height = height >> 1;
    const int out_width = width >> 1;

    for (int c = 0; c < channel; c++) {
        int out_row = 0;
        int row = 0;
        while (row < height) {
            int tile_rows = height - row;
            if (tile_rows >= MAX_BUF_HEIGHT) {
                tile_rows = MAX_BUF_HEIGHT;
            }
            conf.buf_height = tile_rows - 1;

            int out_col = 0;
            int col = 0;
            while (col < width) {
                int tile_cols = width - col;
                if (tile_cols >= tile_cols_max) {
                    tile_cols = tile_cols_max;
                }
                int out_cols = tile_cols >> 1;

                // Byte distances from the end of one tile row to the start of
                // the next, for the input and for the result.
                int in_skip = (width - tile_cols) << 2;
                int out_skip = (out_width - out_cols) << 2;
                // Upper half: output columns per row minus one; lower half: offset.
                int out_info = ((out_cols - 1) << 16) | out_skip;

                // Tile width is configured in bytes, minus one.
                conf.buf_width = (tile_cols << 2) - 1;

                inst_conf_buf((uint32_t)&d_i[(c * height + row) * width + col], conf.val);
                inst_conf_res_addr((uint32_t)&d_o[(c * out_height + out_row) * out_width + out_col], 0);
                inst_conf_offset(in_skip, out_info);
                inst_pool(pool_mode);

                out_col += out_cols;
                col += tile_cols;
            }

            out_row += tile_rows >> 1;
            row += tile_rows;
        }
    }
}
// Fully connected (linear) layer, computed on the CPU:
//
//   output[i] = bias[i] + sum over j of d_i[j] * weight[i * input_size + j]
//
// d_i holds input_size values, weight is a row-major output_size x input_size
// matrix, and bias and output hold output_size values each. The output buffer
// must not overlap any of the inputs.
void accel_linear_func(int32_t* d_i, int8_t* weight, int32_t* bias,
                       int32_t* output, int input_size, int output_size) {
    // The linear layer is bound by memory access, so the CPU can do the
    // arithmetic itself.
    for (int i = 0; i < output_size; i++) {
        int row_base = i * input_size;
        // Accumulate in a local so the running sum is not loaded from and
        // stored to output[i] on every step of the inner loop.
        int32_t acc = bias[i];
        for (int j = 0; j < input_size; j++) {
            acc += d_i[j] * weight[row_base + j];
        }
        output[i] = acc;
    }
}