These are my notes on the x86 FPU code, based on kernel v5.15-rc6.

It is recommended to read Notes about XSAVE feature set first.

TIF_NEED_FPU_LOAD

  • If TIF_NEED_FPU_LOAD is cleared then the CPU’s FPU registers hold the current thread’s FPU state.

  • If TIF_NEED_FPU_LOAD is set then CPU’s FPU registers may not hold current()’s FPU registers. It is required to load the registers before returning to userland or using the content otherwise.
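A minimal sketch (my own, not a kernel excerpt) of how code honors this contract before touching current's FPU state, using the v5.15 helpers shown later in these notes:

/* My own sketch, not kernel source. In real kernel code this must
 * run with preemption disabled, e.g. under fpregs_lock(). */
if (test_thread_flag(TIF_NEED_FPU_LOAD)) {
	/* Registers are stale: current's state lives only in
	 * current->thread.fpu.state and must be loaded first. */
	fpregs_restore_userregs();	/* also clears TIF_NEED_FPU_LOAD */
}
/* From here on, the CPU's FPU registers hold current's state. */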

schedule

schedule(kernel/sched/core.c)
  __schedule(kernel/sched/core.c)
    pick_next_task(kernel/sched/core.c)
    context_switch(kernel/sched/core.c)
      switch_to(arch/x86/include/asm/switch_to.h)
        __switch_to_asm(arch/x86/entry/entry_64.S)
          __switch_to(arch/x86/kernel/process_64.c)
            switch_fpu_prepare
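Condensed from the excerpts walked through in the next section, the FPU-related work inside __switch_to() amounts to just two hooks (a sketch; everything else the function does is omitted):

/* Sketch of the FPU-relevant part of __switch_to()
 * (arch/x86/kernel/process_64.c); see the next section for details. */
if (!test_thread_flag(TIF_NEED_FPU_LOAD))
	switch_fpu_prepare(prev_fpu, cpu);	/* save prev's registers */
/* ... switch stacks, segments, TLS ... */
switch_fpu_finish(next_fpu);			/* defer next's reload */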

Defer FPU state load until return to userspace

https://lore.kernel.org/lkml/20181107194858.9380-24-bigeasy@linutronix.de/

Idea: Defer loading of the FPU state until return to userspace. This gives the kernel the potential to skip loading FPU state for tasks that stay in kernel mode. For example, a task that is context-switched several times while blocked in the kernel pays for at most one FPU restore, on its final return to userspace, and kernel threads never pay for one at all.

  • save the FPU registers into the previous task's xsave area when a task switch occurs
// https://elixir.bootlin.com/linux/v5.15-rc6/source/arch/x86/kernel/process_64.c#L568
__switch_to
  switch_fpu_prepare
    save_fpregs_to_fpstate

if (!test_thread_flag(TIF_NEED_FPU_LOAD))
	switch_fpu_prepare(prev_fpu, cpu);

Why is the !test_thread_flag(TIF_NEED_FPU_LOAD) check needed?
Because switch_fpu_prepare() may only be called while the CPU's FPU registers still belong to the previous task. If TIF_NEED_FPU_LOAD is already set (for example because kernel_fpu_begin(), shown below, clobbered the registers), the registers no longer hold that task's state, and saving them would overwrite the valid copy in its xsave area.

  • set TIF_NEED_FPU_LOAD for the next task when a task switch occurs
// https://elixir.bootlin.com/linux/v5.15-rc6/source/arch/x86/kernel/process_64.c#L623
__switch_to
  switch_fpu_finish(next_fpu);

// https://elixir.bootlin.com/linux/v5.15-rc6/source/arch/x86/include/asm/fpu/internal.h#L534
/*
 * Delay loading of the complete FPU state until the return to userland.
 * PKRU is handled separately.
 */
static inline void switch_fpu_finish(struct fpu *new_fpu)
{
	if (cpu_feature_enabled(X86_FEATURE_FPU))
		set_thread_flag(TIF_NEED_FPU_LOAD);
}
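Note that switch_fpu_finish() touches no registers at all: the cost of loading the incoming task's FPU state is paid only if and when that task actually returns to userspace, which is exactly the deferral described above (see the exit-to-user-mode path below).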

switch_fpu_prepare

// https://elixir.bootlin.com/linux/v5.15-rc6/source/arch/x86/include/asm/fpu/internal.h#L508
/*
 * FPU state switching for scheduling.
 *
 * This is a two-stage process:
 *
 *  - switch_fpu_prepare() saves the old state.
 *    This is done within the context of the old process.
 *
 *  - switch_fpu_finish() sets TIF_NEED_FPU_LOAD; the floating point state
 *    will get loaded on return to userspace, or when the kernel needs it.
 *
 * If TIF_NEED_FPU_LOAD is cleared then the CPU's FPU registers
 * are saved in the current thread's FPU register state.
 *
 * If TIF_NEED_FPU_LOAD is set then CPU's FPU registers may not
 * hold current()'s FPU registers. It is required to load the
 * registers before returning to userland or using the content
 * otherwise.
 *
 * The FPU context is only stored/restored for a user task and
 * PF_KTHREAD is used to distinguish between kernel and user threads.
 */
static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
{
	if (static_cpu_has(X86_FEATURE_FPU) && !(current->flags & PF_KTHREAD)) {
		save_fpregs_to_fpstate(old_fpu);
		/*
		 * The save operation preserved register state, so the
		 * fpu_fpregs_owner_ctx is still @old_fpu. Store the
		 * current CPU number in @old_fpu, so the next return
		 * to user space can avoid the FPU register restore
		 * when it returns on the same CPU and still owns the
		 * context.
		 */
		old_fpu->last_cpu = cpu;

		trace_x86_fpu_regs_deactivated(old_fpu);
	}
}

exit to user mode

exit_to_user_mode_prepare
  arch_exit_to_user_mode_prepare
    switch_fpu_return
      fpregs_restore_userregs
        __restore_fpregs_from_fpstate
        fpregs_activate
syscall_exit_to_user_mode
  __syscall_exit_to_user_mode_work
    exit_to_user_mode_prepare
      arch_exit_to_user_mode_prepare
  __exit_to_user_mode

irqentry_exit_to_user_mode
  exit_to_user_mode_prepare
  __exit_to_user_mode
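Where does the TIF_NEED_FPU_LOAD check actually happen? In v5.15 it sits in arch_exit_to_user_mode_prepare() (arch/x86/include/asm/entry-common.h). The following is a condensed sketch, with unrelated exit work elided:

/* Condensed sketch of arch_exit_to_user_mode_prepare(); only the
 * FPU-related lines are kept. */
static __always_inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs,
							   unsigned long ti_work)
{
	/* ... */
	fpregs_assert_state_consistent();
	if (unlikely(ti_work & _TIF_NEED_FPU_LOAD))
		switch_fpu_return();	/* -> fpregs_restore_userregs() */
	/* ... */
}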

fpregs_restore_userregs

// https://elixir.bootlin.com/linux/v5.15-rc6/source/arch/x86/include/asm/fpu/internal.h#L456
static inline void fpregs_restore_userregs(void)
{
	struct fpu *fpu = &current->thread.fpu;
	int cpu = smp_processor_id();

	if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
		return;

	if (!fpregs_state_valid(fpu, cpu)) {
		u64 mask;

		/*
		 * This restores _all_ xstate which has not been
		 * established yet.
		 *
		 * If PKRU is enabled, then the PKRU value is already
		 * correct because it was either set in switch_to() or in
		 * flush_thread(). So it is excluded because it might be
		 * not up to date in current->thread.fpu.xsave state.
		 */
		mask = xfeatures_mask_restore_user() |
		       xfeatures_mask_supervisor();
		__restore_fpregs_from_fpstate(&fpu->state, mask);

		fpregs_activate(fpu);
		fpu->last_cpu = cpu;
	}
	clear_thread_flag(TIF_NEED_FPU_LOAD);
}
static inline int fpregs_state_valid(struct fpu *fpu, unsigned int cpu)
{
	return fpu == this_cpu_read(fpu_fpregs_owner_ctx) && cpu == fpu->last_cpu;
}
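Both halves of the check matter. The summary below is my own, not a kernel excerpt:

/*
 * My own summary of the two ways the cached register contents can
 * go stale (not kernel source):
 *
 *  - fpu != fpu_fpregs_owner_ctx: another context (another task, or
 *    kernel_fpu_begin(), see below) used this CPU's FPU registers
 *    in the meantime, so they no longer hold our state.
 *
 *  - cpu != fpu->last_cpu: the task has migrated; its state may
 *    still sit in some other CPU's registers, but not this one's.
 *
 * Only when both conditions hold can fpregs_restore_userregs()
 * skip the restore and merely clear TIF_NEED_FPU_LOAD.
 */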
/*
 * Highest level per task FPU state data structure that
 * contains the FPU register state plus various FPU
 * state fields:
 */
struct fpu {
	/*
	 * @last_cpu:
	 *
	 * Records the last CPU on which this context was loaded into
	 * FPU registers. (In the lazy-restore case we might be
	 * able to reuse FPU registers across multiple context switches
	 * this way, if no intermediate task used the FPU.)
	 *
	 * A value of -1 is used to indicate that the FPU state in context
	 * memory is newer than the FPU state in registers, and that the
	 * FPU state should be reloaded next time the task is run.
	 */
	unsigned int last_cpu;

	/*
	 * @avx512_timestamp:
	 *
	 * Records the timestamp of AVX512 use during last context switch.
	 */
	unsigned long avx512_timestamp;

	/*
	 * @state:
	 *
	 * In-memory copy of all FPU registers that we save/restore
	 * over context switches. If the task is using the FPU then
	 * the registers in the FPU are more recent than this state
	 * copy. If the task context-switches away then they get
	 * saved here and represent the FPU state.
	 */
	union fpregs_state state;
	/*
	 * WARNING: 'state' is dynamically-sized. Do not put
	 * anything after it here.
	 */
};
fpregs_activate
  this_cpu_write(fpu_fpregs_owner_ctx, fpu);

fpregs_deactivate
  this_cpu_write(fpu_fpregs_owner_ctx, NULL);
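fpu_fpregs_owner_ctx is the per-CPU pointer behind all of this bookkeeping. If I read v5.15 correctly, it is declared in arch/x86/include/asm/fpu/internal.h and defined in arch/x86/kernel/fpu/core.c:

// Per-CPU pointer to the struct fpu whose contents are currently live
// in this CPU's FPU registers; NULL when nobody owns them (e.g. after
// kernel_fpu_begin() has invalidated them).
DECLARE_PER_CPU(struct fpu *, fpu_fpregs_owner_ctx);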

kernel_fpu_begin/kernel_fpu_end

kernel_fpu_begin
  kernel_fpu_begin_mask(KFPU_MXCSR)
    __cpu_invalidate_fpregs_state
      __this_cpu_write(fpu_fpregs_owner_ctx, NULL);

#define PF_KTHREAD 0x00200000	/* I am a kernel thread */

void kernel_fpu_begin_mask(unsigned int kfpu_mask)
{
	preempt_disable();

	WARN_ON_FPU(!irq_fpu_usable());
	WARN_ON_FPU(this_cpu_read(in_kernel_fpu));

	this_cpu_write(in_kernel_fpu, true);

	if (!(current->flags & PF_KTHREAD) &&
	    !test_thread_flag(TIF_NEED_FPU_LOAD)) {
		set_thread_flag(TIF_NEED_FPU_LOAD);
		save_fpregs_to_fpstate(&current->thread.fpu);
	}
	__cpu_invalidate_fpregs_state();

	/* Put sane initial values into the control registers. */
	if (likely(kfpu_mask & KFPU_MXCSR) && boot_cpu_has(X86_FEATURE_XMM))
		ldmxcsr(MXCSR_DEFAULT);

	....
}
void kernel_fpu_end(void)
{
	WARN_ON_FPU(!this_cpu_read(in_kernel_fpu));

	this_cpu_write(in_kernel_fpu, false);
	preempt_enable();
}
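The usual calling pattern: any in-kernel SSE/AVX use must be bracketed by this pair, and because preemption stays disabled in between, the region should be short. A hedged sketch; example_simd_work() is a hypothetical helper, not a kernel function:

#include <asm/fpu/api.h>	/* kernel_fpu_begin/end, irq_fpu_usable */

static void example_simd_work(void)	/* hypothetical helper */
{
	if (!irq_fpu_usable())
		return;		/* fall back to a non-SIMD path */

	kernel_fpu_begin();	/* saves user FPU state, sets TIF_NEED_FPU_LOAD */
	/* ... SSE/AVX instructions may be used here ... */
	kernel_fpu_end();	/* re-enables preemption; the user state is
				 * reloaded lazily on return to userspace */
}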