179 lines
6.8 KiB
C
179 lines
6.8 KiB
C
#include "AccelConvFuncs.h"
|
||
|
||
void accel_conv_func(int8_t* d_i, int8_t* weight, int32_t* bias, int32_t* d_o, int8_t di_type,
|
||
int height, int width, int padding, int channel, int out_channel,
|
||
int kernel_width, int kernel_height, int with_relu) {
|
||
accel_conf_t accel_conf;
|
||
accel_conf.buf_size = di_type;
|
||
accel_conf.weight_size = 0;
|
||
accel_conf.padding = padding;
|
||
accel_conf.stride = 1;
|
||
accel_conf.kernel_height = kernel_height;
|
||
accel_conf.kernel_width = kernel_width;
|
||
|
||
int size_mul = di_type == 0b10 ? 4 :
|
||
di_type == 0b01 ? 2 : 1;
|
||
int buf_mul = di_type == 0b10 ? 1 :
|
||
di_type == 0b01 ? 2 : 4;
|
||
|
||
int del_h = kernel_height - 1;
|
||
int del_w = (kernel_width - 1) * size_mul;
|
||
|
||
int size_sft = size_mul >> 1;
|
||
int buf_sft = buf_mul >> 1;
|
||
int width_i = width * size_mul;
|
||
int height_o = (height - kernel_height + padding * 2) + 1;
|
||
int width_o = ((width - kernel_width + padding * 2) + 1);
|
||
|
||
|
||
int h_append = 0;
|
||
int res_h = 0;
|
||
for (int h = 0; h < height; h += h_append) {
|
||
int h_eq_0 = h == 0;
|
||
h_append = h_eq_0 ? MAX_BUF_HEIGHT : MAX_BUF_HEIGHT - del_h;
|
||
int h_remain = height - h;
|
||
int buf_h_end = h_remain <= h_append;
|
||
int buf_h;
|
||
int begin_h = h_eq_0 ? 0 : h - del_h;
|
||
if (buf_h_end) {
|
||
if (h_eq_0) buf_h = h_remain;
|
||
else buf_h = h_remain + del_h;
|
||
} else {
|
||
buf_h = MAX_BUF_HEIGHT;
|
||
}
|
||
accel_conf.buf_height = buf_h - 1;
|
||
int res_h_append = (buf_h - kernel_height + (h_eq_0 + buf_h_end) * padding) + 1;
|
||
|
||
int w_append = 0;
|
||
int res_w = 0;
|
||
for (int w = 0; w < width_i; w += w_append) {
|
||
int w_eq_0 = w == 0;
|
||
w_append = w_eq_0 ? MAX_BUF_WIDTH : MAX_BUF_WIDTH - del_w;
|
||
int w_remain = width_i - w;
|
||
int buf_w_end = w_remain <= w_append;
|
||
int buf_w;
|
||
int begin_w = w_eq_0 ? 0 : w - del_w;
|
||
if (buf_w_end) {
|
||
if (w_eq_0) {
|
||
buf_w = w_remain;
|
||
} else {
|
||
buf_w = (w_remain + del_w);
|
||
}
|
||
} else {
|
||
buf_w = MAX_BUF_WIDTH;
|
||
}
|
||
int padding_valid = (w_eq_0) |
|
||
(buf_w_end << 1) |
|
||
(h_eq_0 << 2) |
|
||
(buf_h_end << 3);
|
||
int buf_offset = width_i - buf_w;
|
||
int res_w_append = ((buf_w >> size_sft) - kernel_width + (w_eq_0 + buf_w_end) * padding) + 1;
|
||
int res_offset = (width_o - res_w_append) << size_sft;
|
||
int res_buf_info = ((res_w_append - 1) << 16) | res_offset;
|
||
inst_conf_offset(buf_offset, res_buf_info);
|
||
|
||
accel_conf.buf_width = buf_w - 1;
|
||
accel_conf.padding_valid = padding_valid;
|
||
for (int k = 0; k < channel; k++) {
|
||
accel_conf.buf_refresh = 1;
|
||
accel_conf.wadd = k != 0;
|
||
for (int c = 0; c < out_channel; c += MAX_KERNEL_NUM) {
|
||
int valid_kernel_num = c + MAX_KERNEL_NUM > out_channel ? out_channel - c : MAX_KERNEL_NUM;
|
||
accel_conf.kernel_num = valid_kernel_num - 1;
|
||
inst_conf_buf((uint32_t)&d_i[(k * height + begin_h) * width_i + begin_w], accel_conf.val);
|
||
for (int vc = 0; vc < valid_kernel_num; vc++) {
|
||
int ca = c + vc;
|
||
inst_conf_res_addr(&d_o[(ca * height_o + res_h) * width_o + res_w], vc);
|
||
|
||
if (k == 0) {
|
||
inst_conf_res_bias(bias[ca], vc);
|
||
// for (int m = 0; m < res_h_append; m++) {
|
||
// for (int n = 0; n < res_w_append; n++) {
|
||
// d_o[(ca * height_o + res_h + m) * width_o + res_w + n] = bias[ca];
|
||
// }
|
||
// }
|
||
}
|
||
}
|
||
if (k == channel - 1 && with_relu) {
|
||
inst_conv_relu(&weight[(k * out_channel + c) * kernel_height * kernel_width]);
|
||
} else {
|
||
inst_conv(&weight[(k * out_channel + c) * kernel_height * kernel_width]);
|
||
}
|
||
accel_conf.buf_refresh = 0;
|
||
}
|
||
}
|
||
res_w += res_w_append;
|
||
}
|
||
res_h += res_h_append;
|
||
}
|
||
}
|
||
|
||
// height and width must align 2
|
||
void accel_pool_func(int32_t* d_i, int32_t* d_o, int pool_mode, int channel, int height, int width) {
|
||
accel_conf_t conf;
|
||
conf.buf_refresh = 1;
|
||
conf.wadd = 0;
|
||
conf.padding_valid = 0;
|
||
conf.weight_size = 2;
|
||
conf.buf_size = 2;
|
||
conf.kernel_width = 2;
|
||
conf.kernel_height = 2;
|
||
conf.stride = 2;
|
||
conf.kernel_num = 0;
|
||
conf.padding = 0;
|
||
|
||
int max_width_word = MAX_BUF_WIDTH >> 2;
|
||
|
||
// for (int i = 0; i < height; i++) {
|
||
// for(int j = 0; j < width; j++) {
|
||
// printf("%x ", d_i[i * width + j]);
|
||
// }
|
||
// printf("\n");
|
||
// }
|
||
|
||
int width_o = width >> 1;
|
||
int height_o = height >> 1;
|
||
|
||
for (int c = 0; c < channel; c++) {
|
||
int h_append;
|
||
int res_h = 0;
|
||
for (int h = 0; h < height; h += h_append) {
|
||
h_append = height - h < MAX_BUF_HEIGHT ? height - h : MAX_BUF_HEIGHT;
|
||
int buf_h = h_append;
|
||
conf.buf_height = buf_h - 1;
|
||
int res_h_append = buf_h >> 1;
|
||
|
||
int w_append;
|
||
int res_w = 0;
|
||
for (int w = 0; w < width; w += w_append) {
|
||
int w_remain = width - w;
|
||
w_append = w_remain < max_width_word ? w_remain : max_width_word;
|
||
int buf_w = w_append;
|
||
conf.buf_width = (buf_w << 2) - 1;
|
||
|
||
inst_conf_buf((uint32_t)&d_i[(c * height + h) * width + w], conf.val);
|
||
inst_conf_res_addr((uint32_t)&d_o[(c * height_o + res_h) * width_o + res_w], 0);
|
||
int buf_offset = (width - buf_w) << 2;
|
||
int res_w_append = (buf_w >> 1);
|
||
int res_offset = (width_o - res_w_append) << 2;
|
||
int res_buf_info = ((res_w_append - 1) << 16) | res_offset;
|
||
inst_conf_offset(buf_offset, res_buf_info);
|
||
inst_pool(pool_mode);
|
||
|
||
res_w += res_w_append;
|
||
}
|
||
res_h += res_h_append;
|
||
}
|
||
}
|
||
}
|
||
|
||
/*
 * Fully connected layer computed on the CPU:
 *   output[i] = bias[i] + sum over j of d_i[j] * weight[i * input_size + j]
 *
 * d_i          input vector, input_size elements
 * weight       row-major matrix, output_size rows of input_size elements
 * bias         output_size elements
 * output       receives output_size results
 */
void accel_linear_func(int32_t* d_i, int8_t* weight, int32_t* bias,
                       int32_t* output, int input_size, int output_size) {
    // The linear layer is bottlenecked by memory access, so the CPU can do the computation.
    int w_idx = 0;  // running index into weight; equals row * input_size at each row start
    for (int row = 0; row < output_size; row++) {
        int32_t *acc = &output[row];
        *acc = bias[row];
        for (int col = 0; col < input_size; col++) {
            *acc += d_i[col] * weight[w_idx];
            w_idx++;
        }
    }
}