0%

linux cryptoapi 适配

主要介绍怎样将 crypto hardware accelerator (硬件加密加速器) 移植到 linux crypto api 中
参考项主要是 stm32的实现

crypto中移植aes 引擎

linux中 hardware ip 是以驱动的方式加载到kernel中的
先看一下驱动的定义

1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Device-tree match table: each entry pairs a "compatible" string with the
 * per-SoC data attached to it. The empty entry terminates the table.
 */
static const struct of_device_id stm32_dt_ids[] = {
	{
		.compatible = "st,stm32f756-cryp",
		.data = &f7_data,
	},
	{
		.compatible = "st,stm32mp1-cryp",
		.data = &mp1_data,
	},
	{ /* sentinel */ },
};
/*
 * Platform driver description: names the driver, points at the device-tree
 * match table and the PM operations, and supplies the probe/remove callbacks.
 */
static struct platform_driver stm32_cryp_driver = {
    .driver = {
        .of_match_table = stm32_dt_ids,
        .name = DRIVER_NAME,
        .pm = &stm32_cryp_pm_ops,
    },
    .remove = stm32_cryp_remove,
    .probe = stm32_cryp_probe,
};

stm32的加密引擎是作为platform driver.


1
2
3
4
5
6
7
13	crypto@50060000 {
14 compatible = "st,stm32f756-cryp";
15 reg = <0x50060000 0x400>;
16 interrupts = <79>;
17 clocks = <&rcc 0 STM32F7_AHB2_CLOCK(CRYP)>;
18 resets = <&rcc STM32F7_AHB2_RESET(CRYP)>;
19 };

通过platform_driver_register 或者 module_init 注册驱动时, 会将driver 和设备绑定, 自动回调probe接口
一个driver可以同多个设备绑定, 每次绑定都会回调probe接口.

寻找设备的过程, 一般就是在device tree 中通过 of_match_table 寻找匹配的设备的过程. 匹配到后, 根据device tree中的设备信息初始化 driver中dev相关的结构体信息

probe 注册过程

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
/*
 * Probe callback of the STM32 CRYP platform driver.
 *
 * Excerpt: the runtime-PM setup and the error-unwind labels are elided
 * ("pm_runtime..." and "..."), so this block is not compilable as shown.
 *
 * Visible steps: allocate the per-device state, fetch the match data, map the
 * registers, request the threaded IRQ, get and enable the clock, pulse the
 * reset line, publish the device on cryp_list, then allocate and start the
 * crypto engine and register the skcipher and AEAD algorithms.
 *
 * Returns 0 on success or a negative errno.
 */
static int stm32_cryp_probe(struct platform_device *pdev)
{
struct device *dev = &pdev->dev;
struct stm32_cryp *cryp;
struct reset_control *rst;
int irq, ret;

cryp = devm_kzalloc(dev, sizeof(*cryp), GFP_KERNEL);
if (!cryp)
return -ENOMEM;

cryp->caps = of_device_get_match_data(dev);
if (!cryp->caps)
return -ENODEV;

cryp->dev = dev;
// Map the register window (resource 0) described by the device-tree node
cryp->regs = devm_platform_ioremap_resource(pdev, 0);
if (IS_ERR(cryp->regs))
return PTR_ERR(cryp->regs);
// Look up the interrupt described by the device-tree node
irq = platform_get_irq(pdev, 0);
if (irq < 0)
return irq;
// Install the handlers: stm32_cryp_irq is the hard-IRQ handler, stm32_cryp_irq_thread the threaded one
ret = devm_request_threaded_irq(dev, irq, stm32_cryp_irq,
stm32_cryp_irq_thread, IRQF_ONESHOT,
dev_name(dev), cryp);
if (ret) {
dev_err(dev, "Cannot grab IRQ\n");
return ret;
}
// Get the clock described by the device-tree node (author's note: effectively the AHB clock bit offset)
cryp->clk = devm_clk_get(dev, NULL);
if (IS_ERR(cryp->clk)) {
dev_err(dev, "Could not get clock\n");
return PTR_ERR(cryp->clk);
}
// Enable the CRYP clock
ret = clk_prepare_enable(cryp->clk);
if (ret) {
dev_err(cryp->dev, "Failed to enable clock\n");
return ret;
}
//... power-management setup omitted
pm_runtime...

// Get the reset control described by the device-tree node
rst = devm_reset_control_get(dev, NULL);
// Pulse the reset line: assert, wait 2 us, deassert (skipped when no reset control was obtained)
if (!IS_ERR(rst)) {
reset_control_assert(rst);
udelay(2);
reset_control_deassert(rst);
}
// cryp was allocated and filled in above; store it as the platform_device's driver data
platform_set_drvdata(pdev, cryp);

spin_lock(&cryp_list.lock);
// Add to cryp_list.dev_list; this list is used later to find the cryp structure
list_add(&cryp->list, &cryp_list.dev_list);
spin_unlock(&cryp_list.lock);

/* Initialize crypto engine */
// Allocates and fills the engine and creates its worker thread; analysed in section 1.1
cryp->engine = crypto_engine_alloc_init(dev, 1);
if (!cryp->engine) {
dev_err(dev, "Could not init crypto engine\n");
ret = -ENOMEM;
goto err_engine1;
}
// See section 1.2
ret = crypto_engine_start(cryp->engine);
if (ret) {
dev_err(dev, "Could not start crypto engine\n");
goto err_engine2;
}
// See section 1.3
ret = crypto_register_skciphers(crypto_algs, ARRAY_SIZE(crypto_algs));
if (ret) {
dev_err(dev, "Could not register algs\n");
goto err_algs;
}

// See section 1.4
ret = crypto_register_aeads(aead_algs, ARRAY_SIZE(aead_algs));
if (ret)
goto err_aead_algs;

dev_info(dev, "Initialized\n");

return 0;
...

return ret;
}

1.1 crypto_engine_alloc_init

–> crypto_engine_alloc_init

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
/**
 * crypto_engine_alloc_init - allocate crypto hardware engine structure and
 * initialize it.
 * @dev: the device attached with one hardware engine
 * @rt: whether this queue is set to run as a realtime task
 *
 * This must be called from context that can sleep.
 * Return: the crypto engine structure on success, else NULL.
 */
struct crypto_engine *crypto_engine_alloc_init(struct device *dev, bool rt)
{
	/* Scheduling priority applied to the worker thread when @rt is set */
	struct sched_param param = { .sched_priority = MAX_RT_PRIO / 2 };
	struct crypto_engine *engine;

	if (!dev)
		return NULL;

	/*
	 * Device-managed allocation: the memory is tied to @dev and is freed
	 * automatically when the device is detached or the driver is unloaded.
	 */
	engine = devm_kzalloc(dev, sizeof(*engine), GFP_KERNEL);
	if (!engine)
		return NULL;

	engine->dev = dev;			/* the device this engine is bound to */
	engine->rt = rt;			/* run the request pump as a realtime task? */
	engine->running = false;		/* the engine is on working */
	engine->busy = false;			/* request pump is busy */
	engine->idling = false;			/* the engine is entering idle state */
	engine->cur_req_prepared = false;	/* current request is prepared */
	engine->priv_data = dev;		/* the engine private data */
	snprintf(engine->name, sizeof(engine->name),
		 "%s-engine", dev_name(dev));	/* the engine name */

	/* the crypto queue of the engine and the lock protecting it */
	crypto_init_queue(&engine->queue, CRYPTO_ENGINE_MAX_QLEN);
	spin_lock_init(&engine->queue_lock);

	/*
	 * Create the kthread worker that runs the request pump. The first
	 * argument is a flags word (0 = no flags), not a CPU number. The
	 * call returns an ERR_PTR on failure, which must be checked before
	 * engine->kworker->task is dereferenced below.
	 */
	engine->kworker = kthread_create_worker(0, "%s", engine->name);
	if (IS_ERR(engine->kworker)) {
		dev_err(dev, "failed to create crypto request pump task\n");
		return NULL;
	}
	/* work struct for scheduling work to the request pump */
	kthread_init_work(&engine->pump_requests, crypto_pump_work);

	if (engine->rt) {
		dev_info(dev, "will run requests pump with realtime priority\n");
		/* switch the worker task to the SCHED_FIFO realtime policy */
		sched_setscheduler(engine->kworker->task, SCHED_FIFO, &param);
	}

	return engine;
}

kthread_worker 与 kthread_work

内核线程创建函数创建一个内核线程,它判断属于这个线程的kthread_worker中是否有要处理的kthread_work,如果有,就取出这个kthread_work,然后调用kthread_work上面指定的处理函数,如果没有这个线程就进行休眠,当有新的kthread_work添加到kthread_worker上时,会再次唤醒kthread_worker的处理线程重复上述工作。

engine->pump_requests 是 kthread_work, crypto_pump_work 则是该work上的指定的处理函数
engine->kworker 是kthread_worker

上面crypto_engine_alloc_init的过程只是初始化了crypto engine相关的数据, 分配了内存, 创建了worker与work, 但worker上还没有work, worker与work并没有关联起来

1.2 crypto_engine_start(cryp->engine)

–> 1.2 crypto_engine_start

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
int crypto_engine_start(struct crypto_engine *engine)
{
unsigned long flags;
// 持锁并禁用中断
spin_lock_irqsave(&engine->queue_lock, flags);

if (engine->running || engine->busy) {
spin_unlock_irqrestore(&engine->queue_lock, flags);
return -EBUSY;
}
// 持锁的目的是为了原子性修改 running变量, 此时标记为true, 表示engine start
engine->running = true;
spin_unlock_irqrestore(&engine->queue_lock, flags);
// 将work给到worker, worker执行 crypto_pump_work 处理函数
kthread_queue_work(engine->kworker, &engine->pump_requests);
return 0;
}

crypto_pump_work

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
/*
 * kthread_work handler for the request pump.
 *
 * @work is the pump_requests member embedded in a struct crypto_engine;
 * container_of() recovers the enclosing engine by subtracting the member's
 * offset from the member's address.
 */
static void crypto_pump_work(struct kthread_work *work)
{
	crypto_pump_requests(container_of(work, struct crypto_engine,
					  pump_requests),
			     true);
}

/*
 * Request pump: processes the requests waiting on engine->queue.
 *
 * Excerpt: only the first half is shown here; the remainder, including the
 * "out" label, is elided with "...". `flags` is used below but its
 * declaration is not part of this excerpt.
 */
static void crypto_pump_requests(struct crypto_engine *engine,
bool in_kthread)
{
struct crypto_async_request *async_req, *backlog;
struct crypto_engine_ctx *enginectx;

// Take the queue lock and disable interrupts on the local CPU
spin_lock_irqsave(&engine->queue_lock, flags);

/* If another context is idling then defer */
// Another context already moved the engine into the idling state: re-queue the work and run later
if (engine->idling) {
kthread_queue_work(engine->kworker, &engine->pump_requests);
goto out;
}

/* Check if the engine queue is idle */
if (!crypto_queue_len(&engine->queue) || !engine->running) {
// ... elided: bail out, since work is only expected when the queue holds a request and the engine is running
}
... // omitted for now because the queue holds no request yet; this part is covered later
}

这里先知道当work被推给worker后, worker 会执行work上指定的处理函数, 该处理函数主要的作用就是处理engine->queue上的消息队列即可, 至于消息是怎样推到queue上的以及queue的结构是什么样的, 后面等到有消息处理时再分解

1.3 crypto_register_skciphers

–> 1.3 crypto_register_skciphers

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
static struct skcipher_alg crypto_algs[] = {
{
.base.cra_name = "ecb(aes)",
.base.cra_driver_name = "stm32-ecb-aes",
.base.cra_priority = 200,
.base.cra_flags = CRYPTO_ALG_ASYNC,
.base.cra_blocksize = AES_BLOCK_SIZE,
.base.cra_ctxsize = sizeof(struct stm32_cryp_ctx),
.base.cra_alignmask = 0xf,
.base.cra_module = THIS_MODULE,

.init = stm32_cryp_init_tfm,
.min_keysize = AES_MIN_KEY_SIZE,
.max_keysize = AES_MAX_KEY_SIZE,
.setkey = stm32_cryp_aes_setkey,
.encrypt = stm32_cryp_aes_ecb_encrypt,
.decrypt = stm32_cryp_aes_ecb_decrypt,
},

crypto_register_skcipher(&algs[i]);
skcipher_prepare_alg(alg);
crypto_register_alg(alg->base);
...

遍历 crypto_algs数组 注册crypto 算法, 这里用了linux crypto api的通用结构, 目的估计是向系统中注册对应的算法, kernel中的其他模块可以通过名字找到该算法, 调用其对应的init setkey encrypt等函数指针
crypto_skcipher_init_tfm crypto_alloc_skcipher crypto_skcipher_setkey crypto_skcipher_encrypt crypto_skcipher_decrypt

crypto_alloc_skcipher 通过 alg_name 查找到注册的算法, 最后会调用其init的函数指针.

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Allocate a symmetric-cipher transform for the algorithm named @alg_name.
 * Delegates to crypto_alloc_tfm() with the skcipher frontend type and passes
 * @type and @mask through unchanged.
 */
struct crypto_skcipher *crypto_alloc_skcipher(const char *alg_name,
					      u32 type, u32 mask)
{
	struct crypto_skcipher *tfm;

	tfm = crypto_alloc_tfm(alg_name, &crypto_skcipher_type, type, mask);
	return tfm;
}
+ --+ crypto_alloc_tfm
\ -- + crypto_find_alg(alg_name, frontend, type, mask)
\ -- crypto_alg_mod_lookup(alg_name, type, mask)
| -- + crypto_create_tfm(alg, frontend)
\ -- + frontend->init_tfm(tfm) === crypto_skcipher_type--> init_tfm === crypto_skcipher_init_tfm
\ -- alg->init(skcipher) === stm32_cryp_init_tfm
| -- alg->cra_init(tfm) // 这里stm cryp的cra_init 应该是没有定义

这里需要顺带看下 init 做了什么

stm32_cryp_init_tfm

1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Per-transform init hook for the stm32 skcipher algorithms.
 *
 * Installs the crypto-engine callbacks in the transform context (used later
 * when a request is processed) and sets the per-request context size.
 * Always returns 0.
 */
static int stm32_cryp_init_tfm(struct crypto_skcipher *tfm)
{
	/* operational context of the transformation */
	struct stm32_cryp_ctx *tfm_ctx = crypto_skcipher_ctx(tfm);

	tfm_ctx->enginectx.op.unprepare_request = NULL;
	tfm_ctx->enginectx.op.prepare_request = stm32_cryp_prepare_cipher_req;
	tfm_ctx->enginectx.op.do_one_request = stm32_cryp_cipher_one_req;

	crypto_skcipher_set_reqsize(tfm, sizeof(struct stm32_cryp_reqctx));

	return 0;
}

1.4 crypto_register_aeads

–> 1.4 crypto_register_aeads(aead_algs, ARRAY_SIZE(aead_algs));
注册aead_algs(带关联认证的加密算法), 这里不展开了, 同 crypto_register_skciphers 过程差不多, 只是要注意为什么区分成了 aeads, type由 crypto_skcipher_type 变成了crypto_aead_type
有需要再看, 这里CCM GCM 需要通过这个接口实现

request 处理

crypto_skcipher_encrypt

跟一下这个过程, 看下request 路由

1
2
3
4
5
6
7
8
/*
 * Symmetric-cipher request descriptor. It carries a length, an IV and the
 * source/destination scatterlists; there is no key field in the request.
 */
struct skcipher_request {
unsigned int cryptlen; /* data length; presumably the byte count passed to skcipher_request_set_crypt() */
u8 *iv; /* initialisation vector */
struct scatterlist *src; /* source scatterlist */
struct scatterlist *dst; /* destination scatterlist */
struct crypto_async_request base; /* embedded generic async request (the object placed on the engine queue) */
void *__ctx[] CRYPTO_MINALIGN_ATTR; /* trailing flexible array; presumably the per-request private context */
};

skcipher_request_set_crypt(req, sgt.sgl, sgt.sgl, crypt_len, iv);
request 封装包含 len ,src dst,iv 等. key并不在里面

1
2
3
4
5
6
7
8
9
10
--+ crypto_skcipher_encrypt(req)
\--+ crypto_skcipher_alg(tfm)->encrypt(req);
\--+ stm32_cryp_aes_ecb_encrypt(req)
\--+ stm32_cryp_crypt(req, FLG_AES | FLG_ECB | FLG_ENCRYPT);
\ -- 找到 之前注册的 stm32_cryp_ctx, 并从 cryp_list 中找到probe时添加的cryp结构体指针, 最终赋值给 stm32_cryp_ctx 的ctx->cryp
|--+ crypto_transfer_skcipher_request_to_engine(cryp->engine, req)
\--+ crypto_transfer_request_to_engine(engine, &req->base)
\--+ crypto_transfer_request(engine, req, true)
\-- crypto_enqueue_request(&engine->queue, req) #"入队" 见 --> 2.1 入队分析
|-- kthread_queue_work(engine->kworker, &engine->pump_requests); #"work 推给worker"

crypto_enqueue_request

–> 2.1 入队分析

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Append @request to the tail of @queue.
 *
 * Returns -EINPROGRESS when the request was queued normally. When the queue
 * is full the request is rejected with -ENOSPC unless it carries
 * CRYPTO_TFM_REQ_MAY_BACKLOG; in that case it is still queued and -EBUSY is
 * returned.
 */
int crypto_enqueue_request(struct crypto_queue *queue,
			   struct crypto_async_request *request)
{
	int status = -EINPROGRESS;

	if (unlikely(queue->qlen >= queue->max_qlen)) {
		if (!(request->flags & CRYPTO_TFM_REQ_MAY_BACKLOG))
			return -ENOSPC;

		/*
		 * backlog still points at the list head, i.e. no backlog
		 * entry is recorded yet: this request becomes the first one.
		 */
		if (queue->backlog == &queue->list)
			queue->backlog = &request->list;

		status = -EBUSY;
	}

	/* Enqueue at the tail and account for the new entry */
	list_add_tail(&request->list, &queue->list);
	queue->qlen++;

	return status;
}

此时因为work推给了worker, worker会运行 work的处理函数 crypto_pump_work
接着分析crypto_pump_work的后半部分

crypto_pump_work 后半部分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
	/* Get the fist request from the engine queue to handle */
// 这个地方应该只有 queue->backlog = &request->list; 时backlog才不为空, 否则应该为空
// 也就是可以理解为只有queue满时才会执行 backlog->complete, 而complete是发送request时设置的回调
backlog = crypto_get_backlog(&engine->queue);
async_req = crypto_dequeue_request(&engine->queue);
if (!async_req)
goto out;
// 队首出队, 获得request
engine->cur_req = async_req;
if (backlog)
backlog->complete(backlog, -EINPROGRESS);

if (engine->busy)
was_busy = true;
else
engine->busy = true;

spin_unlock_irqrestore(&engine->queue_lock, flags);

/* Until here we get the request need to be encrypted successfully */
// 这个地方 engine 应该是没有 prepare_crypt_hardware , 所以先省略
if (!was_busy && engine->prepare_crypt_hardware) {
// 不为busy时, 调用 prepare_crypt_hardware, 此处未注册
ret = engine->prepare_crypt_hardware(engine);
}

enginectx = crypto_tfm_ctx(async_req->tfm);
// 调用 prepare_request --> stm32_cryp_prepare_cipher_req
if (enginectx->op.prepare_request) {
ret = enginectx->op.prepare_request(engine, async_req);
...
engine->cur_req_prepared = true;
}
// 调用 op.do_one_request --> stm32_cryp_cipher_one_req
ret = enginectx->op.do_one_request(engine, async_req);
...// 省略错误处理
return;

req_err:
crypto_finalize_request(engine, async_req, ret);
return;

out:
spin_unlock_irqrestore(&engine->queue_lock, flags);

stm32_cryp_prepare_cipher_req

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
/*
 * Prepare the device state for one request: exactly one of @req (skcipher)
 * or @areq (AEAD) is expected to be set.
 *
 * Excerpt: the declarations of rctx, ctx, cryp and ret, part of the field
 * setup, and the return statement are not shown.
 */
static int stm32_cryp_prepare_req(struct skcipher_request *req,
struct aead_request *areq)
{
rctx->mode &= FLG_MODE_MASK;
ctx->cryp = cryp;
cryp->flags = (cryp->flags & ~FLG_MODE_MASK) | rctx->mode;
cryp->hw_blocksize = is_aes(cryp) ? AES_BLOCK_SIZE : DES_BLOCK_SIZE;
cryp->ctx = ctx;

if (req) {
cryp->req = req;
cryp->areq = NULL;
cryp->total_in = req->cryptlen;
cryp->total_out = cryp->total_in;
}
// ... remaining cryp fields are filled in here (elided)
cryp->in_sg = req ? req->src : areq->src;
cryp->out_sg = req ? req->dst : areq->dst;
cryp->out_sg_save = cryp->out_sg;
// Finally the hardware-specific step: program the CRYP registers (reached through cryp->regs)
ret = stm32_cryp_hw_init(cryp);
}

stm32_cryp_cipher_one_req

1
2
3
4
5
6
7
8
9
10
11
/*
 * crypto-engine do_one_request callback for skcipher requests.
 *
 * Recovers the skcipher request from the generic request pointer @areq,
 * follows it to the transform context and the device stored there, and
 * starts the hardware in CPU (interrupt-driven) mode.
 */
static int stm32_cryp_cipher_one_req(struct crypto_engine *engine, void *areq)
{
	struct skcipher_request *sk_req;
	struct crypto_skcipher *tfm;
	struct stm32_cryp_ctx *tfm_ctx;

	sk_req = container_of(areq, struct skcipher_request, base);
	tfm = crypto_skcipher_reqtfm(sk_req);
	tfm_ctx = crypto_skcipher_ctx(tfm);

	/* Start CRYP and enable its interrupts; data is then moved by the IRQ path */
	return stm32_cryp_cpu_start(tfm_ctx->cryp);
}

中断处理

接下来需要看下中断怎么处理 cryp 硬件的执行流的
在probe 时设置了中断处理函数为 stm32_cryp_irq

1
2
3
ret = devm_request_threaded_irq(dev, irq, stm32_cryp_irq,
stm32_cryp_irq_thread, IRQF_ONESHOT,
dev_name(dev), cryp);

irq_handler 为 stm32_cryp_irq, thread_fn为stm32_cryp_irq_thread

这里分中断上半部和下半部: 中断到来后先由上半部 irq_handler (stm32_cryp_irq) 处理, 它把中断状态保存下来 (cryp->irq_status = stm32_cryp_read(cryp, CRYP_MISR)), 然后返回 IRQ_WAKE_THREAD,
唤醒中断处理线程进入下半部, 由 thread_fn (stm32_cryp_irq_thread) 处理后续的工作

stm32_cryp_irq_thread

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
/*
 * Threaded half of the CRYP interrupt: moves data between memory and the
 * FIFOs according to the status saved in cryp->irq_status.
 *
 * Excerpt: the GCM/CCM branches are elided, so the braces in the MISR_IN
 * section do not balance as shown.
 */
static irqreturn_t stm32_cryp_irq_thread(int irq, void *arg)
{
struct stm32_cryp *cryp = arg;
u32 ph;
// irq_status tells which direction needs service: MISR_OUT means results must be read out,
// MISR_IN means more input must be written (author's reading: output FIFO full / input FIFO empty)
if (cryp->irq_status & MISR_OUT)
/* Output FIFO IRQ: read data */
if (unlikely(stm32_cryp_irq_read_data(cryp))) {
/* All bytes processed, finish */
stm32_cryp_write(cryp, CRYP_IMSCR, 0);
stm32_cryp_finish_req(cryp, 0);
return IRQ_HANDLED;
}

if (cryp->irq_status & MISR_IN) {
...// GCM and CCM handling omitted; the omitted code opens the if / else-if chain that the "} else {" below completes
} else {
/* Input FIFO IRQ: write data */
stm32_cryp_irq_write_data(cryp);
}
}

return IRQ_HANDLED;
}

可见这里并没有用dma的方式

使用dma的例子

ux500
这里只提一点, dma的callback 并不是用的当前硬件ip的中断,
dma每个通道都绑定了一个中断处理函数, 应该是这个处理函数 处理的callback

1
2
3
4
5
6
7
8
desc = dmaengine_prep_slave_sg(channel,
ctx->device->dma.sg_dst,
ctx->device->dma.sg_dst_len,
DMA_DEV_TO_MEM,
DMA_CTRL_ACK |
DMA_PREP_INTERRUPT);

desc->callback = cryp_dma_out_callback;

callback调用 complete 函数, 唤醒之前因调用wait_for_completion 而阻塞的线程

1
2
3
4
5
6
7
/*
 * DMA completion callback for the output channel.
 *
 * @data is the struct cryp_ctx that was registered with the descriptor; the
 * callback signals cryp_dma_complete so that a thread blocked in
 * wait_for_completion() on it can continue.
 */
static void cryp_dma_out_callback(void *data)
{
	struct cryp_ctx *cryp_context = data;

	dev_dbg(cryp_context->device->dev, "[%s]: ", __func__);
	complete(&cryp_context->device->dma.cryp_dma_complete);
}

ablk_dma_crypt 函数中, 设置完dma 传输后, 最终设置了wait_for_completion(&ctx->device->dma.cryp_dma_complete); 阻塞了当前线程, 在dma完成传输后会调用对应channel的中断处理函数, 调用callback 中的 complete 唤醒当前线程. 当然callback需要自己设置.

君正x2000的aes 驱动

使用dma 时要注意物理地址与虚拟地址之间的转换, 因为DMA只认识物理地址.

看下x2000 的aes驱动的实现全是dma方案.

ecb 加密

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
/*
 * ingenic_aes_start - enqueue @req (if any) and, when the engine is idle,
 * start processing the next queued request.
 * @aes: AES device state
 * @req: request to enqueue, or NULL to only kick the queue (the IRQ thread
 *       calls this with NULL after it has finished a request)
 *
 * Excerpt: the local declarations (flags, ret, err, async_req, ctx) are not
 * shown; ret must start at 0 because the enqueue is skipped for a NULL @req.
 *
 * Return: the value held in ret (the enqueue status when @req was given).
 */
static int ingenic_aes_start(struct ingenic_aes_dev *aes,
			     struct ablkcipher_request *req)
{
	spin_lock_irqsave(&aes->lock, flags);
	/* Only enqueue when the caller actually handed us a request */
	if (req)
		ret = ablkcipher_enqueue_request(&aes->queue, req);
	/*
	 * Engine busy: leave the request on aes->queue; it is picked up
	 * when the request currently in flight has been finished.
	 */
	if (aes->flags & FLAGS_BUSY) {
		spin_unlock_irqrestore(&aes->lock, flags);
		return ret;
	}
	/* Engine idle: take the next request and mark the engine busy */
	async_req = crypto_dequeue_request(&aes->queue);
	if (async_req)
		aes->flags |= FLAGS_BUSY;
	/* Drop the lock and restore interrupts */
	spin_unlock_irqrestore(&aes->lock, flags);

	/* Queue was empty: nothing to start */
	if (!async_req)
		return ret;

	req = ablkcipher_request_cast(async_req);
	aes->req = req;
	aes->total = req->nbytes;
	aes->in_offset = 0;
	/* in_sg follows the request's source scatterlist */
	aes->in_sg = req->src;
	aes->out_offset = 0;
	/* out_sg follows the request's destination scatterlist */
	aes->out_sg = req->dst;
	ctx = crypto_ablkcipher_ctx(crypto_ablkcipher_reqtfm(req));
	aes->ctx = ctx;
	ctx->aes = aes;
	/* Program the AES control register, much as bare-metal code would */
	err = ingenic_aes_write_ctrl(aes);
	/* Set up and start the DMA transfer; the key step, see 1.1 */
	err = ingenic_aes_crypt_dma_start(aes);
	/*
	 * NOTE(review): the excerpt runs the two calls below unconditionally;
	 * presumably the full driver only does so when err is set. Confirm
	 * against the driver source.
	 */
	ingenic_aes_stop(aes);
	ingenic_aes_finish_req(aes, err);

	return ret;
}
1.1 --> ingenic_aes_crypt_dma_start(aes);
static int ingenic_aes_crypt_dma_start(struct ingenic_aes_dev *aes)
{
// ...
struct scatterlist *in_sg, *out_sg;
int len32;
// 如果 in_sg 和 out_sg 都是最后一帧数据, 小于4k
if (sg_is_last(aes->in_sg) && sg_is_last(aes->out_sg)) {
// 判断in_sg 和 out_sg 的对齐是不是一致的
in = IS_ALIGNED((unsigned long)aes->in_sg->offset, sizeof(unsigned long));
out = IS_ALIGNED((unsigned long)aes->out_sg->offset, sizeof(unsigned long));
fast = in && out;
}
if (fast) {
count = min(aes->total, sg_dma_len(aes->in_sg));
count = min(count, sg_dma_len(aes->out_sg));
// 初始化 scatterlist
prep_sgdata(aes->in_sg);
// 参考 https://blog.csdn.net/Wang20122013/article/details/112802013
// 调用cache的flush invalidate等方法, 同时会设置scatterlist链表中每个entry的dma_address, 参考dma_direct_map_sg 函数
// DMA_TO_DEVICE: 内存到外设 CPU 会做cache的flush 动作. 将cache中的数据 刷写到内存
err = dma_map_sg(aes->dev, aes->in_sg, 1, DMA_TO_DEVICE);
// DMA_FROM_DEVICE: 外设到内存 cpu将cache置为无效, 这样dma做完后, cpu cache不命中, 会重新加载内存中的内容
err = dma_map_sg(aes->dev, aes->out_sg, 1, DMA_FROM_DEVICE);
// 获取in_sg的总线地址, dma可以直接访问
addr_in = sg_dma_address(aes->in_sg);
// 获取out_sg的总线地址, dma可以直接访问
addr_out = sg_dma_address(aes->out_sg);
in_sg = aes->in_sg;
out_sg = aes->out_sg;
aes->flags |= FLAGS_FAST;
} else {
/* use cache buffers */
// 不支持fast 时
// 将request in_sg 拷贝到 遍历scatterlist 链表拷贝内容到buf_in中, 注意这里会更新in_sg 为下一个scatterlist 数据帧
count = sg_copy(&aes->in_sg, &aes->in_offset, aes->buf_in,
aes->buflen, aes->total, 0);

len32 = DIV_ROUND_UP(count, DMA_MIN) * DMA_MIN;
// 绑定 buf_in 给 in_sgl, 这里中转了以下, 新建了buf_in buf_out的localbuffer, 并绑定到了in_sgl out_sgl上
sg_init_one(&aes->in_sgl,aes->buf_in,len32);
sg_dma_len(&aes->in_sgl) = len32;
// dma_addr_in 是buf_in的总线地址, dma 可以直接访问该地址, 将它给到 in_sgl的dma_address 成员. dma_addr_out 同理
sg_dma_address(&aes->in_sgl) = aes->dma_addr_in;
sg_init_one(&aes->out_sgl,aes->buf_out,len32);
sg_dma_len(&aes->out_sgl) = len32;
sg_dma_address(&aes->out_sgl) = aes->dma_addr_out;

in_sg = &aes->in_sgl;
out_sg = &aes->out_sgl;

addr_in = aes->dma_addr_in;
addr_out = aes->dma_addr_out;
prep_sgdata(in_sg);
aes->flags &= ~FLAGS_FAST;
// 同dma_map_sg 刷cache
dma_cache_sync(aes->dev, sg_virt(in_sg), len32,DMA_TO_DEVICE);
dma_cache_sync(aes->dev, sg_virt(out_sg),len32,DMA_FROM_DEVICE);
}

aes->total -= count;
//主要是操作aes的dma 相关寄存器, 如 engress outgress 地址, 这两个地址即前面的 dma_addr_in dma_addr_out, 如果是fast模式, 这两个地址是跟着in_sg out_sg 走的, 并没有开辟localbuffer
err = ingenic_aes_crypt_dma(tfm, in_sg, out_sg);
if (err) {
dma_unmap_sg(aes->dev, aes->in_sg, 1, DMA_TO_DEVICE);
dma_unmap_sg(aes->dev, aes->out_sg, 1, DMA_TO_DEVICE);
}
return err;
}

这个地方有疑问的一点, aes->in_sgl aes->buf_in aes->out_sgl aes->buf_out, 这几个实际上只映射了一个page, 那传的数据量很大怎么办呢?
其实这个处理是在中断中做的, 在aes 处理完毕后, 会发送中断, 在中断服务程序中, 就必须处理接下来的数据请求
这里也很好的体现了流式数据的处理策略

接着看一下中断的处理:

处理dma 中断

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
/*
 * Threaded interrupt handler of the AES block: on "DMA done" it collects the
 * output of the finished transfer, then either starts the next transfer of
 * the same request or finishes the request and starts the next queued one.
 *
 * Excerpt: the declarations of aes, val, mask and err, and the final return,
 * are not shown.
 */
static irqreturn_t ingenic_aec_irqthread(int irq, void *data)
{
val = aes_read(aes,AES_ASSR);
mask = aes_read(aes,AES_ASINTM);
val = val & mask; // pending and enabled interrupts: AES status ANDed with the interrupt mask register (see the datasheet)
// Per the datasheet (author's lookup), bit value 4 is the "DMA done" interrupt
if(val & 4){
// Hand the data the DMA wrote to buf_out over to the request's out_sg, see 2.1
err = ingenic_aes_crypt_dma_stop(aes);
// Clear the interrupt bit in the AES status register: this interrupt is handled, the next one may arrive
aes_write(aes,AES_ASSR,4);
// Request not complete yet: processing is streamed, each transfer handles a limited amount
// (the article says at most one page) and total shrinks by that amount every round
if (aes->total && !err) {
// Start the next DMA transfer
err = ingenic_aes_crypt_dma_start(aes);
if (!err)
// Return here because data is still outstanding: for a large request in_sg is a chain of entries and the following ones are processed next
return IRQ_HANDLED; /* DMA started. Not finishing. */
}
// Only reached once this request is complete (or an error occurred)
ingenic_aes_finish_req(aes, err);
// Take the next request from aes->queue and start processing it
ingenic_aes_start(aes,NULL);
}
}
// 这里主要看一下 ingenic_aes_crypt_dma_stop 函数
2.1 --> static int ingenic_aes_crypt_dma_stop(struct ingenic_aes_dev *aes)
{
/*
 * Post-process one finished DMA transfer: unmap the request's scatterlist
 * entries on the fast path, otherwise copy the result from the bounce buffer
 * into the request's output scatterlist.
 * Excerpt: the declaration of count and the return statement are not shown.
 */
// Fast path: the request consisted of a single entry, so no further transfer follows; just unmap the scatterlist entries
if (aes->flags & FLAGS_FAST)
{
dma_unmap_sg(aes->dev, aes->out_sg, 1, DMA_FROM_DEVICE);
prep_sgdata(aes->out_sg);
dma_unmap_sg(aes->dev, aes->in_sg, 1, DMA_TO_DEVICE);
} else {
// Large request, in the middle of processing: one chunk has just finished and the data the DMA wrote
// to memory must become visible to the CPU. Author's note: this call effectively does nothing here,
// because buf_out was already invalidated before the transfer, so the CPU reloads it from memory on access.
// NOTE(review): the CPU reads buf_out right after this, which would normally call for
// dma_sync_single_for_cpu() rather than ..._for_device() - confirm against the driver and the DMA API docs.
dma_sync_single_for_device(aes->dev, aes->dma_addr_out,
aes->dma_size, DMA_FROM_DEVICE);

prep_sgdata(&aes->out_sgl);
//dump_sgdata(&aes->out_sgl);

/* copy data */
// Copy the output of this round from buf_out into the request's out_sg; note that sg_copy() advances out_sg to the next entry
count = sg_copy(&aes->out_sg, &aes->out_offset, aes->buf_out,
aes->buflen, aes->dma_size, 1);
}
}

用户空间使用

AF_ALG socket

https://www.kernel.org/doc/html/v4.19/crypto/userspace-if.html

The kernel crypto API is accessible from user space. Currently, the following ciphers are accessible:

  • Message digest including keyed message digest (HMAC, CMAC)
  • Symmetric ciphers
  • AEAD ciphers
  • Random Number Generators

使用对称加密算法的例子:

1
2
3
4
5
/* AF_ALG socket address that selects the "cbc(aes)" symmetric cipher. */
struct sockaddr_alg sa = {
	.salg_name   = "cbc(aes)",	/* this is the cipher name */
	.salg_type   = "skcipher",	/* this selects the symmetric cipher */
	.salg_family = AF_ALG,
};

cryptodev-linux

cryptodev-linux 是kernel 原生CryptoApi 提供给用户空间使用的可扩展基座.
openssl 编译时通过 加入 -DHAVE_CRYPTODEV 和 -DUSE_CRYPTODEV_DIGESTS 编译选项支持通过cryptodev访问kernel cryptoapi 来进行 hash 加密等, 注意这种方式也不支持非对称加密

openssl

参考 openssl 使用afalg引擎
openssl speed -evp aes-128-cbc -engine afalg -elapsed

代码分析

afalg 作为插件使用, 其实现是在engines/e_afalg.c 中, 编译成 afalg.so 放在/usr/lib/engines-1.1/下

调用setup_engine, 对engine 进行初始化, load 该engine的库

使用时, 需要调用 EVP_CipherInit_ex 获得 engine 对应的 cipher

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
impl = ENGINE_get_cipher_engine(cipher->nid);
const EVP_CIPHER *c = ENGINE_get_cipher(impl, cipher->nid);
ctx->cipher = cipher = c;

>>> p *cipher
$7 = {
nid = 419,
block_size = 16,
key_len = 16,
iv_len = 16,
flags = 4098,
init = 0xffffb53bd804 <afalg_cipher_init>,
do_cipher = 0xffffb53bd9a4 <afalg_do_cipher>,
cleanup = 0xffffb53bdb2c <afalg_cipher_cleanup>,
ctx_size = 128,
set_asn1_parameters = 0x0,
get_asn1_parameters = 0x0,
ctrl = 0x0,
app_data = 0x0
}

/*
 * Ask engine e for its implementation of the cipher identified by nid.
 * The lookup goes through the engine's ciphers callback (for the afalg
 * engine in the AES example, fn is afalg_ciphers). Returns NULL and raises
 * ENGINE_R_UNIMPLEMENTED_CIPHER when the engine has no callback or the
 * callback reports failure.
 */
const EVP_CIPHER *ENGINE_get_cipher(ENGINE *e, int nid)
{
    const EVP_CIPHER *found;
    ENGINE_CIPHERS_PTR lookup = ENGINE_get_ciphers(e);

    if (lookup != NULL && lookup(e, &found, NULL, nid))
        return found;

    ENGINEerr(ENGINE_F_ENGINE_GET_CIPHER, ENGINE_R_UNIMPLEMENTED_CIPHER);
    return NULL;
}

afalg_ciphers 是在 setup_engine 阶段bind的

1
2
3
4
5
6
7
8
9
10
11
12
#0  bind_afalg (e=0x19209700) at engines/e_afalg.c:715
#1 0x0000ffffa5b72fd4 in bind_helper (e=0x19209700, id=0x19209860 "afalg") at engines/e_afalg.c:755
#2 0x0000ffffa5b73080 in bind_engine (e=0x19209700, id=0x19209860 "afalg", fns=0xffffe9357908) at engines/e_afalg.c:761
#3 0x0000ffffa5eef3bc in dynamic_load (e=0x19209700, ctx=0x1920c000) at crypto/engine/eng_dyn.c:480
#4 0x0000ffffa5eeee80 in dynamic_ctrl (e=0x19209700, cmd=206, i=0, p=0x0, f=0x0) at crypto/engine/eng_dyn.c:336
#5 0x0000ffffa5eee154 in ENGINE_ctrl (e=0x19209700, cmd=206, i=0, p=0x0, f=0x0) at crypto/engine/eng_ctrl.c:174
#6 0x0000ffffa5eee4ac in ENGINE_ctrl_cmd_string (e=0x19209700, cmd_name=0xffffa5ff7010 "LOAD", arg=0x0, cmd_optional=0) at crypto/engine/eng_ctrl.c:289
#7 0x0000ffffa5ef1290 in ENGINE_by_id (id=0xffffe9359f32 "afalg") at crypto/engine/eng_list.c:328
#8 0x0000000000470a54 in setup_engine (engine=0xffffe9359f32 "afalg", debug=0) at apps/apps.c:1300
#9 0x000000000045c978 in speed_main (argc=0, argv=0xffffe9359b40) at apps/speed.c:1814
#10 0x00000000004380a0 in do_cmd (prog=0x19207940, argc=6, argv=0xffffe9359b10) at apps/openssl.c:570
#11 0x0000000000437278 in main (argc=6, argv=0xffffe9359b10) at apps/openssl.c:189

后面所有的运算, 都需要传递 ctx, 从 ctx 中解出 cipher 来, cipher 封装了具体的操作, 对应了 engine 的能力.
同linux crypto api的对接, 是engine的主要工作.

其中 cipher 中的 init 指向 afalg_cipher_init, 这个函数里面的 afalg_create_sk 会建立同 kernel crypto api 通信的 socket

1
从目前实现的代码来看, 这个 afalg 只支持 aes-128-cbc aes-192-cbc aes-256-cbc
1
2
3
4
5
6
7
8
9
10
11
12
ciphertype = EVP_CIPHER_CTX_nid(ctx);
switch (ciphertype) {
case NID_aes_128_cbc:
case NID_aes_192_cbc:
case NID_aes_256_cbc:
ciphername = "cbc(aes)";
break;
default:
ALG_WARN("%s(%d): Unsupported Cipher type %d\n", __FILE__, __LINE__,
ciphertype);
return 0;
}

engine 支持的算法是在 load 阶段通过调用 ENGINE_set_default_ciphers
查询并注册的

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

/*
 * Register engine e as the default implementation for every cipher it
 * supports. Returns 1 when the engine has no ciphers callback or reports no
 * ciphers; otherwise returns the result of engine_table_register().
 */
int ENGINE_set_default_ciphers(ENGINE *e)
{
    const int *nids;
    int num_nids;

    if (e->ciphers == NULL)
        return 1;

    /*
     * Algorithms are identified by nid. Called with a NULL cipher pointer,
     * the callback (afalg_ciphers for the afalg engine) stores the list of
     * supported nids in nids and returns how many there are.
     */
    num_nids = e->ciphers(e, NULL, &nids, 0);
    if (num_nids <= 0)
        return 1;

    return engine_table_register(&cipher_table,
                                 engine_unregister_all_ciphers, e,
                                 nids, num_nids, 1);
}

#0 ENGINE_set_default_ciphers (e=0x36e4d700) at crypto/engine/tb_cipher.c:50
#1 0x0000ffffb687a4ec in ENGINE_set_default (e=0x36e4d700, flags=65535) at crypto/engine/eng_fat.c:16
#2 0x0000000000470b1c in setup_engine (engine=0xffffe3d3df32 "afalg", debug=0) at apps/apps.c:1310
#3 0x000000000045c978 in speed_main (argc=0, argv=0xffffe3d3d840) at apps/speed.c:1814
#4 0x00000000004380a0 in do_cmd (prog=0x36e4b940, argc=6, argv=0xffffe3d3d810) at apps/openssl.c:570
#5 0x0000000000437278 in main (argc=6, argv=0xffffe3d3d810) at apps/openssl.c:189

如果不支持某个算法, 则通过impl = ENGINE_get_cipher_engine(cipher->nid); 查询会返回空
所有支持的算法会注册到全局的链表中, ENGINE_get_cipher_engine 就是通过查询这个全局的链表得到的哪个engine支持这个nid标识的算法.

kernel_crypto.drawio