[cpufreq governor]sched governor解析

前有schedutil governor来调节cpu频率，现有sched（更准确地讲应该称为schedfreq）governor来调节cpu频率，现在来讲解它的来龙去脉。

源代码参考AOSP kernel的这个分支：remotes/origin/android-msm-wahoo-4.4-pie，kernel version：4.4.116。源代码下载地址：https://aosp.tuna.tsinghua.edu.cn/kernel/msm.git

获取我所分析的kernel code的方式如下：

git clone https://aosp.tuna.tsinghua.edu.cn/kernel/msm.git
git branch -a 查看有什么分支

[cpufreq governor]sched governor解析
git checkout -b android-msm-wahoo-4.4-pie remotes/origin/android-msm-wahoo-4.4-pie

正确切换到我所看的kernel 分支，Android 9.0

闲话少扯，进入正题。

按照老样子，先看governor相关的结构体成员变量：

/*
 * Default minimum intervals between consecutive frequency decreases and
 * between consecutive frequency increases. They seed the governor tunables
 * and can therefore be changed afterwards.
 */
#define THROTTLE_DOWN_NSEC  50000000 /* 50ms default */  
#define THROTTLE_UP_NSEC    500000 /* 500us default */  
  
/* Per-CPU flag checked before a capacity request is turned into a frequency. */
static DEFINE_PER_CPU(unsigned long, enabled);  
/* Per-CPU capacity requests (cfs, rt, dl and their total). */
DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);  
/*
 * Tunable parameters: the throttle periods applied when raising and when
 * lowering the frequency. attr_set provides the sysfs interface through
 * which userspace can adjust them.
 */
struct gov_tunables {  
    struct gov_attr_set attr_set;  
    unsigned int up_throttle_nsec;  
    unsigned int down_throttle_nsec;  
};  
  
/** 
 * gov_data - per-policy data internal to the governor 
 * @up_throttle: next throttling period expiry if increasing OPP 
 * @down_throttle: next throttling period expiry if decreasing OPP 
 * @tunables: throttle period lengths in nanoseconds (up_throttle_nsec and
 *            down_throttle_nsec); either allocated for this policy or the
 *            shared global_tunables instance
 * @tunables_hook: list head handed to gov_attr_set_init()/gov_attr_set_get()
 *                 together with the tunables' attr_set
 * @task: worker thread for dvfs transition that may block/sleep 
 * @irq_work: callback used to wake up worker thread 
 * @requested_freq: last frequency requested by the sched governor 
 * 
 * struct gov_data is the per-policy cpufreq_sched-specific data structure. A 
 * per-policy instance of it is allocated by cpufreq_sched_policy_init(), which
 * runs on the CPUFREQ_GOV_POLICY_INIT event, and a pointer to it is stored in
 * the governor_data member of struct cpufreq_policy. 
 * 
 * Readers of this data must call down_read(policy->rwsem). Writers must 
 * call down_write(policy->rwsem). 
 */  
struct gov_data {  
    ktime_t up_throttle; /* earliest time the frequency may be raised again */ 
    /* Earliest time the frequency may be lowered again: the time of the last
       frequency change plus the down-throttle period. */ 
    ktime_t down_throttle;
    /* tunable parameters */      
    struct gov_tunables *tunables;  
    struct list_head tunables_hook; 
    /* thread that performs the frequency change */ 
    struct task_struct *task;  
    /* irq_work used with slow cpufreq drivers to kick the worker thread */
    struct irq_work irq_work; 
    /* Frequency requested on the basis of the capacity value: the capacity is
       converted to a frequency, matched against the frequency table, and the
       resulting table frequency is what is handed to the driver. */ 
    unsigned int requested_freq;  
};

接着看cpufreq_governor结构体的填充。

/*
 * Governor callback: route a cpufreq governor event to the matching
 * cpufreq_sched_* handler and return that handler's result.
 * CPUFREQ_GOV_LIMITS and any unrecognised event report 0.
 */
static int cpufreq_sched_setup(struct cpufreq_policy *policy,
                   unsigned int event)
{
    int ret = 0;

    switch (event) {
    case CPUFREQ_GOV_POLICY_INIT:
        ret = cpufreq_sched_policy_init(policy);
        break;
    case CPUFREQ_GOV_POLICY_EXIT:
        ret = cpufreq_sched_policy_exit(policy);
        break;
    case CPUFREQ_GOV_START:
        ret = cpufreq_sched_start(policy);
        break;
    case CPUFREQ_GOV_STOP:
        ret = cpufreq_sched_stop(policy);
        break;
    case CPUFREQ_GOV_LIMITS:
        /* The limits handler has no result to propagate. */
        cpufreq_sched_limits(policy);
        break;
    default:
        break;
    }

    return ret;
}
  
  
/*
 * Governor descriptor registered under the name "sched"; all governor events
 * are delivered to cpufreq_sched_setup(). The symbol has internal linkage
 * unless CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED is defined.
 */
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED  
static  
#endif  
struct cpufreq_governor cpufreq_gov_sched = {  
    .name           = "sched",  
    .governor       = cpufreq_sched_setup,  
    .owner          = THIS_MODULE,  
};  
  
/*
 * Init-call entry point: clear the per-CPU "enabled" flag on every possible
 * CPU, then register the governor. Returns the result of
 * cpufreq_register_governor().
 */
static int __init cpufreq_sched_init(void)
{
    int i;

    for_each_cpu(i, cpu_possible_mask) {
        per_cpu(enabled, i) = 0;
    }

    return cpufreq_register_governor(&cpufreq_gov_sched);
}

/* Try to make this the default governor */
fs_initcall(cpufreq_sched_init);

可以看到governor名字为“sched”，顾名思义就是根据调度器的某些变量来调节cpu频率的。我们看其governor callback函数的init：

/*
 * Set up the governor's per-policy state: zero the per-CPU capacity requests
 * of the policy's CPUs, allocate struct gov_data, create or share the
 * tunables, and, when the cpufreq driver is slow, start the kschedfreq
 * worker thread and initialise its irq_work.
 *
 * Returns 0 on success and -ENOMEM on any failure.
 */
static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)  
{  
    struct gov_data *gd;  
    int cpu;  
    int rc;  
    /* Zero the sched_capacity_reqs of every CPU in this policy. */
    for_each_cpu(cpu, policy->cpus)  
        memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0,  
               sizeof(struct sched_capacity_reqs));  
    /* Allocate the governor's per-policy data. */
    gd = kzalloc(sizeof(*gd), GFP_KERNEL);  
    if (!gd)  
        return -ENOMEM;  
    /* Attach the governor data to the policy. */
    policy->governor_data = gd;  
    
    if (!global_tunables) { 
        /* Allocate the tunables structure. */ 
        gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);  
        if (!gd->tunables)  
            goto free_gd;  
        /* Minimum interval between frequency increases: the driver's
           transition latency when it is non-zero, otherwise the default. */
        gd->tunables->up_throttle_nsec =  
            policy->cpuinfo.transition_latency ?  
            policy->cpuinfo.transition_latency :  
            THROTTLE_UP_NSEC;
       /* Minimum interval between frequency decreases. */  
        gd->tunables->down_throttle_nsec =  
            THROTTLE_DOWN_NSEC;  
       /* Initialise the tunables' kobject and add it under the governor's
          parent kobject, named after the governor (sysfs entry). */
        rc = kobject_init_and_add(&gd->tunables->attr_set.kobj,  
                      &tunables_ktype,  
                      get_governor_parent_kobj(policy),  
                      "%s", cpufreq_gov_sched.name);  
        if (rc)  
            goto free_tunables;  
       /* Initialise the attribute set with this policy's hook. */
        gov_attr_set_init(&gd->tunables->attr_set,  
                  &gd->tunables_hook);  
  
        pr_debug("%s: throttle_threshold = %u [ns]\n",  
             __func__, gd->tunables->up_throttle_nsec);  
         
        /* Without per-policy governors these tunables become the shared
           instance used by later policies. */
        if (!have_governor_per_policy())  
            global_tunables = gd->tunables;  
    } else {  
        /* Reuse the shared tunables and take a reference on their attr_set. */
        gd->tunables = global_tunables;  
        gov_attr_set_get(&global_tunables->attr_set,  
                 &gd->tunables_hook);  
    }  
    /* Assigned a second time; one of the two assignments looks redundant. */
    policy->governor_data = gd;  
    if (cpufreq_driver_is_slow()) {  
        cpufreq_driver_slow = true;
        /* cpufreq_driver_slow is the interesting flag here. Create the worker
           thread, whose thread function is cpufreq_sched_thread and whose name
           carries the first related CPU; the author observed kschedfreq:0 and
           kschedfreq:4 on a two-cluster system. */  
        gd->task = kthread_create(cpufreq_sched_thread, policy,  
                      "kschedfreq:%d",  
                      cpumask_first(policy->related_cpus));  
        if (IS_ERR_OR_NULL(gd->task)) {  
            pr_err("%s: failed to create kschedfreq thread\n",  
                   __func__);  
            goto free_tunables;  
        }  
        get_task_struct(gd->task); 
        /* Bind the thread to the policy's related CPUs. */ 
        kthread_bind_mask(gd->task, policy->related_cpus);  
        wake_up_process(gd->task);
        /* Initialise the irq_work with its callback. */  
        init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);  
    }  
  
    set_sched_freq();  
  
    return 0;  
  
    /*
     * NOTE(review): every failure path returns -ENOMEM, so rc from
     * kobject_init_and_add() and the kthread_create() error are not
     * propagated. When the kthread failure is reached, gd->tunables may be
     * the same object global_tunables points to (set or reused above); it is
     * kfree()d here while global_tunables is left unchanged - confirm.
     */
free_tunables:  
    kfree(gd->tunables);  
free_gd:  
    policy->governor_data = NULL;  
    kfree(gd);  
    return -ENOMEM;  
}

我们能够看到，上面最重要的信息如下：

频率升高的时间限制
频率下降的时间限制
创建的thread，cpufreq_sched_thread，频率调节的进程
初始化一个irq_work，callback函数为cpufreq_sched_irq_work，它最后还是wake up gd->task，即唤醒运行cpufreq_sched_thread函数的这个线程。

接下来看一下，cpufreq_sched_thread这个函数的实现过程：

/* 
 * Worker thread that applies the governor's frequency requests.
 *
 * we pass in struct cpufreq_policy. This is safe because changing out the 
 * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP), 
 * which tears down all of the data structures and __cpufreq_governor(policy, 
 * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the 
 * new policy pointer 
 *
 * The thread sets gd->task to SCHED_FIFO priority 50 and then loops: while
 * gd->requested_freq equals the last request it acted on it sleeps;
 * otherwise it waits out the throttle period and hands the request to the
 * driver. Returns 0 once kthread_should_stop() reports true; exits with
 * -EINVAL if the scheduling class cannot be set.
 */  
static int cpufreq_sched_thread(void *data)  
{  
    struct sched_param param;  
    struct cpufreq_policy *policy;  
    struct gov_data *gd;  
    unsigned int new_request = 0;  
    unsigned int last_request = 0;  
    int ret;  
    /* The policy this thread serves. */
    policy = (struct cpufreq_policy *) data;  
    /* The sched governor's per-policy data. */
    gd = policy->governor_data;  
  
    param.sched_priority = 50;  
    ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);  
    if (ret) {  
        pr_warn("%s: failed to set SCHED_FIFO\n", __func__);  
        do_exit(-EINVAL);  
    } else {  
        pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",  
                __func__, gd->task->pid);  
    }  
  
    do {  
        /* Frequency currently requested by the governor. */
        new_request = gd->requested_freq; 
        /* Nothing new to apply: mark the task TASK_INTERRUPTIBLE and sleep. */ 
        if (new_request == last_request) {  
            set_current_state(TASK_INTERRUPTIBLE);  
            if (kthread_should_stop())  
                break;  
            schedule();  /* give up the CPU and sleep */
        } else {  
            /* 
             * if the frequency thread sleeps while waiting to be 
             * unthrottled, start over to check for a newer request 
             */  
            if (finish_last_request(gd, policy->cur))  
                continue;  
            last_request = new_request;
            /* Hand the request to the driver; this also re-arms the up and
               down throttle deadlines (current time plus throttle period). */  
            cpufreq_sched_try_driver_target(policy, new_request);  
        }  
    } while (!kthread_should_stop());  
  
    return 0;  
}

我们来看一下finish_last_request函数的实现：

/*
 * Wait until the throttle period that applies to the pending request has
 * expired. The down-throttle deadline is used when the requested frequency
 * is below @cur_freq, the up-throttle deadline otherwise.
 *
 * Returns false when the deadline had already passed and no waiting was
 * needed; returns true after sleeping until the deadline passed.
 */
static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq)  
{  
    ktime_t now = ktime_get();  
  
    ktime_t throttle = gd->requested_freq < cur_freq ?  
        gd->down_throttle : gd->up_throttle;  
    /* The deadline for the next frequency change has already passed, so the
       change can go ahead without waiting. */
    if (ktime_after(now, throttle))  
        return false;  
  
    while (1) {  
        /* now is not after throttle here, so the difference is not negative.
           NOTE(review): the nanosecond difference is narrowed to int before
           the division; a throttle period above INT_MAX ns (about 2.1 s)
           would presumably overflow - confirm. */
        int usec_left = ktime_to_ns(ktime_sub(throttle, now));  
         /* Convert the difference from nanoseconds to microseconds. */
        usec_left /= NSEC_PER_USEC;  
        trace_cpufreq_sched_throttled(usec_left);  
        /* Sleep for between usec_left and usec_left + 100 microseconds. */
        usleep_range(usec_left, usec_left + 100);  
        now = ktime_get();   /* current time after the sleep */
        /* Re-check against the deadline: once it has passed, return true so
           that the calling thread re-evaluates whether a frequency change is
           still needed; otherwise sleep again. */
        if (ktime_after(now, throttle))  
            return true;  
    }  
}

如果没有外部触发这个thread，last_request与new_request会一直相等，频率就不会update，所以肯定还有调度器侧的代码来trigger governor调节频率：

/*
 * Recompute the total capacity request of @cpu from its cfs, rt and dl
 * components. When the total changed it is stored and, if @request is true,
 * the frequency domain is asked to re-evaluate its frequency.
 */
void update_cpu_capacity_request(int cpu, bool request)
{
    struct sched_capacity_reqs *reqs;
    unsigned long total;

    /* The rq lock serializes access to the CPU's sched_capacity_reqs. */
    lockdep_assert_held(&cpu_rq(cpu)->lock);

    reqs = &per_cpu(cpu_sched_capacity_reqs, cpu);

    /* cfs + rt is scaled by capacity_margin; dl is added unscaled. */
    total = reqs->cfs + reqs->rt;
    total = total * capacity_margin
        / SCHED_CAPACITY_SCALE;
    total += reqs->dl;

    /* An unchanged total needs no trace, no store and no frequency update. */
    if (total != reqs->total) {
        trace_cpufreq_sched_update_capacity(cpu, request, reqs, total);
        reqs->total = total;

        if (request)
            update_fdomain_capacity_request(cpu);
    }
}

看看update_fdomain_capacity_request函数的实现原理（比较简单）：

/*
 * Turn the largest capacity request among the CPUs of @cpu's policy into a
 * frequency-table frequency and, if it differs from the last requested
 * frequency, record it and trigger the frequency change: through the
 * irq_work when cpufreq_driver_slow is set, directly otherwise.
 */
static void update_fdomain_capacity_request(int cpu)  
{  
    unsigned int freq_new, index_new, cpu_tmp;  
    struct cpufreq_policy *policy;  
    struct gov_data *gd;  
    unsigned long capacity = 0;  
  
    /* 
     * Avoid grabbing the policy if possible. A test is still 
     * required after locking the CPU's policy to avoid racing 
     * with the governor changing. 
     */
    /* Nothing to do when the sched governor is not enabled on this CPU. */
    if (!per_cpu(enabled, cpu))  
        return;  
    /* Get the cpufreq policy of this CPU (released at the out label). */
    policy = cpufreq_cpu_get(cpu);  
    if (IS_ERR_OR_NULL(policy))  
        return;  
  
    if (policy->governor != &cpufreq_gov_sched ||  
        !policy->governor_data)  
        goto out;  
    /* The sched governor's per-policy data. */
    gd = policy->governor_data;  
    /* find max capacity requested by cpus in this policy */  
    for_each_cpu(cpu_tmp, policy->cpus) {  
        struct sched_capacity_reqs *scr;  
  
        scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);  
        capacity = max(capacity, scr->total);  
    }  
    /* Scale the policy's maximum frequency by the requested capacity. */
    /* Convert the new maximum capacity request into a cpu frequency */  
    freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; 
    /* Look up the frequency-table index for freq_new. */ 
    if (cpufreq_frequency_table_target(policy, policy->freq_table,  
                       freq_new, CPUFREQ_RELATION_L,  
                       &index_new))  
        goto out;  
    /* Frequency stored in the table at that index. */
    freq_new = policy->freq_table[index_new].frequency;  
    /* Clamp to the policy's [min, max] range. */
    if (freq_new > policy->max)  
        freq_new = policy->max;  
  
    if (freq_new < policy->min)  
        freq_new = policy->min;  
  
    trace_cpufreq_sched_request_opp(cpu, capacity, freq_new,  
                    gd->requested_freq);  
    if (freq_new == gd->requested_freq)  
        goto out;  
    /* Record the new requested frequency in the governor data. */
    gd->requested_freq = freq_new;  
  
    /* 
     * Throttling is not yet supported on platforms with fast cpufreq 
     * drivers. 
     */  
    if (cpufreq_driver_slow)
        /* Queue the irq_work on this CPU to trigger the frequency change. */  
        irq_work_queue_on(&gd->irq_work, cpu);  
    else  
        cpufreq_sched_try_driver_target(policy, freq_new);  
  
out:  
    cpufreq_cpu_put(policy);  
}

最后都是执行到如下函数中：

/*
 * Ask the cpufreq driver to switch @policy to @freq and re-arm both throttle
 * deadlines. Does nothing when policy->rwsem cannot be taken for writing
 * without blocking.
 */
static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
                        unsigned int freq)
{
    struct gov_data *gov = policy->governor_data;

    /* avoid race with cpufreq_sched_stop */
    if (down_write_trylock(&policy->rwsem)) {
        /* Perform the frequency change. */
        __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);

        /* Next allowed increase / decrease: now plus the throttle period. */
        gov->up_throttle = ktime_add_ns(ktime_get(),
                        gov->tunables->up_throttle_nsec);
        gov->down_throttle = ktime_add_ns(ktime_get(),
                          gov->tunables->down_throttle_nsec);

        up_write(&policy->rwsem);
    }
}

至此完毕，比较简单。关键还是这个函数update_cpu_capacity_request在哪里调用的。调用关系方框图如下：

[cpufreq governor]sched governor解析

对于sched governor和schedutil governor哪个更好，还没有详细测试过，目前不好下定论，以后再来对比下。

最后，对于现存的cpufreq governor全部分析完毕：

powersave
performance
userspace
ondemand
conservative
interactive
schedutil (https://blog.csdn.net/wukongmingjing/article/details/81784727)
sched

跨度四年了，呵呵，人生能够有几个这样的年月呢，加油吧，中年老男人！！！

[cpufreq governor]sched governor解析

继续阅读

使用Scheduling Framework扩展kube-scheduler

[cpufreq governor] schedutil governor解析

dhtmlxscheduler使用

[scheduler]三. CFS调度算法基本思想

Linux Kernel调度器学习路线图

k8s基础(13)之scheduler调度器k8s基础(13)之scheduler调度器

调度作业(Oracle Scheduler)

Quartz源码解析 ---- 触发器按时启动原理

Oracle 10g Scheduler 特性 Oracle 10g Scheduler 特性

linux CFS Scheduler

初识Quartz之Scheduler组件

K8S单master部署三：APIserver+Controller-Manager+Scheduler未完待续~~~

Java实现日程图（Samples）功能

openstack动态调度现状

Rxjava学习(三线程调度器Scheduler)

RxJava学习篇之一：基础Observable,Observer,Single,Subject,Scheduler