天天看点

[cpufreq governor]sched governor解析

前有schedutil governor来调节cpu频率,现有sched(精确的讲应该称为schedfreq。。。。) governor来调节cpu频率,现在来讲解它的来龙去脉。

源代码参考AOSP kernel的这个分支:remotes/origin/android-msm-wahoo-4.4-pie,kernel version:4.4.116。源代码下载地址:https://aosp.tuna.tsinghua.edu.cn/kernel/msm.git

获取我所分析的kernel code的方式如下:

  • git clone https://aosp.tuna.tsinghua.edu.cn/kernel/msm.git
  • git branch -a 查看有什么分支
    [cpufreq governor]sched governor解析
  • git checkout -b android-msm-wahoo-4.4-pie remotes/origin/android-msm-wahoo-4.4-pie

正确切换到我所看的kernel 分支,Android 9.0

闲话少扯,进入正题。

按照老样子,先看governor相关的结构体成员变量:

/*
 * Default minimum intervals between frequency changes when lowering and
 * when raising the frequency. They are only the initial values of the
 * governor tunables, which can be modified afterwards.
 */
#define THROTTLE_DOWN_NSEC  50000000 /* 50ms default */
#define THROTTLE_UP_NSEC    500000 /* 500us default */

/* Per-CPU flag tested before a capacity request is turned into a frequency request. */
static DEFINE_PER_CPU(unsigned long, enabled);
/* Per-CPU capacity requests: the cfs, rt and dl parts plus their combined total. */
DEFINE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs);
/*
 * Tunable parameters of the governor: the throttle periods applied to
 * frequency increases and decreases. attr_set carries the kobject through
 * which the values are exposed in sysfs, so userspace can adjust them.
 */
struct gov_tunables {
    struct gov_attr_set attr_set;
    /* throttle period, in nanoseconds, before the next frequency increase */
    unsigned int up_throttle_nsec;
    /* throttle period, in nanoseconds, before the next frequency decrease */
    unsigned int down_throttle_nsec;
};
  
/**
 * gov_data - per-policy data internal to the governor
 * @up_throttle: next throttling period expiry if increasing OPP
 * @down_throttle: next throttling period expiry if decreasing OPP
 * @tunables: throttle period lengths in nanoseconds (up_throttle_nsec and
 *            down_throttle_nsec); may be shared between policies
 * @tunables_hook: list node handed to the tunables' attr_set for this policy
 * @task: worker thread for dvfs transition that may block/sleep
 * @irq_work: callback used to wake up worker thread
 * @requested_freq: last frequency requested by the sched governor
 *
 * struct gov_data is the per-policy cpufreq_sched-specific data structure. A
 * per-policy instance of it is created when the cpufreq_sched governor receives
 * the CPUFREQ_GOV_POLICY_INIT event and a pointer to it exists in the
 * governor_data member of struct cpufreq_policy.
 *
 * Readers of this data must call down_read(policy->rwsem). Writers must
 * call down_write(policy->rwsem).
 */
struct gov_data {
    /* earliest point in time at which the frequency may be raised again */
    ktime_t up_throttle;
    /*
     * earliest point in time at which the frequency may be lowered again:
     * the time of the last change plus the down throttle period
     */
    ktime_t down_throttle;
    /* tunable parameters */
    struct gov_tunables *tunables;
    struct list_head tunables_hook;
    /* kernel thread that carries out the frequency changes */
    struct task_struct *task;
    /* irq_work used with slow cpufreq drivers to kick the worker thread */
    struct irq_work irq_work;
    /*
     * frequency derived from the requested capacity; it is matched against
     * the frequency table and the resulting table frequency is what gets
     * handed to the cpufreq driver
     */
    unsigned int requested_freq;
};

           

接着看cpufreq_governor结构体的填充。

/*
 * cpufreq_sched_setup - governor callback stored in cpufreq_gov_sched.
 * @policy: policy the event applies to
 * @event: one of the CPUFREQ_GOV_* events
 *
 * Forwards each event to its handler. Returns the handler's result for
 * POLICY_INIT, POLICY_EXIT, START and STOP; returns 0 for LIMITS and for
 * any event that is not handled here.
 */
static int cpufreq_sched_setup(struct cpufreq_policy *policy,
                   unsigned int event)
{
    int ret = 0;

    if (event == CPUFREQ_GOV_POLICY_INIT)
        ret = cpufreq_sched_policy_init(policy);
    else if (event == CPUFREQ_GOV_POLICY_EXIT)
        ret = cpufreq_sched_policy_exit(policy);
    else if (event == CPUFREQ_GOV_START)
        ret = cpufreq_sched_start(policy);
    else if (event == CPUFREQ_GOV_STOP)
        ret = cpufreq_sched_stop(policy);
    else if (event == CPUFREQ_GOV_LIMITS)
        cpufreq_sched_limits(policy);   /* result is not reported */

    return ret;
}
  
  
/*
 * Governor descriptor registered under the name "sched". It is file-local
 * unless CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED is defined, in which case the
 * symbol gets external linkage.
 */
#ifndef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHED
static
#endif
struct cpufreq_governor cpufreq_gov_sched = {
    .owner          = THIS_MODULE,
    .name           = "sched",
    .governor       = cpufreq_sched_setup,
};
  
/*
 * cpufreq_sched_init - clear the per-CPU "enabled" flag on every possible
 * CPU, then register the sched governor.
 *
 * Returns the result of cpufreq_register_governor().
 */
static int __init cpufreq_sched_init(void)
{
    int i;

    for_each_cpu(i, cpu_possible_mask) {
        per_cpu(enabled, i) = 0;
    }

    return cpufreq_register_governor(&cpufreq_gov_sched);
}

/* Try to make this the default governor */
fs_initcall(cpufreq_sched_init);

           

可以看到governor名字为“sched”,顾名思义,就是根据调度器的某些变量来调节cpu频率的。我们先来看governor callback函数中的init:

/*
 * cpufreq_sched_policy_init - handler for CPUFREQ_GOV_POLICY_INIT.
 * @policy: policy the governor is being attached to
 *
 * Zeroes the per-CPU capacity requests of the policy's CPUs, allocates the
 * per-policy gov_data, creates the tunables (or takes a reference on the
 * shared global ones) and, when the cpufreq driver is slow, starts the
 * kschedfreq worker thread and initialises its irq_work.
 *
 * Returns 0 on success or a negative errno. On failure everything acquired
 * here is released again and policy->governor_data is reset to NULL.
 */
static int cpufreq_sched_policy_init(struct cpufreq_policy *policy)
{
    struct gov_data *gd;
    int cpu;
    int rc = -ENOMEM;

    /* start every CPU of this policy with an all-zero capacity request */
    for_each_cpu(cpu, policy->cpus)
        memset(&per_cpu(cpu_sched_capacity_reqs, cpu), 0,
               sizeof(struct sched_capacity_reqs));

    gd = kzalloc(sizeof(*gd), GFP_KERNEL);
    if (!gd)
        return -ENOMEM;

    /*
     * Attach the governor data to the policy. This is done once, before the
     * worker thread is created, because the thread reads it back from
     * policy->governor_data.
     */
    policy->governor_data = gd;

    if (!global_tunables) {
        gd->tunables = kzalloc(sizeof(*gd->tunables), GFP_KERNEL);
        if (!gd->tunables)
            goto free_gd;

        /*
         * Throttle period for raising the frequency: the driver's
         * transition latency when it reports one, the default otherwise.
         */
        gd->tunables->up_throttle_nsec =
            policy->cpuinfo.transition_latency ?
            policy->cpuinfo.transition_latency :
            THROTTLE_UP_NSEC;
        /* throttle period for lowering the frequency */
        gd->tunables->down_throttle_nsec =
            THROTTLE_DOWN_NSEC;

        /* create the sysfs directory that exposes the tunables */
        rc = kobject_init_and_add(&gd->tunables->attr_set.kobj,
                      &tunables_ktype,
                      get_governor_parent_kobj(policy),
                      "%s", cpufreq_gov_sched.name);
        /*
         * NOTE(review): after a failed kobject_init_and_add() the kobject
         * is normally released with kobject_put(); the original kfree() is
         * kept because tunables_ktype's release hook is not visible here.
         */
        if (rc)
            goto free_tunables;

        /* take the first reference on the attribute set for this policy */
        gov_attr_set_init(&gd->tunables->attr_set,
                  &gd->tunables_hook);

        pr_debug("%s: throttle_threshold = %u [ns]\n",
             __func__, gd->tunables->up_throttle_nsec);

        /* one set of tunables is shared unless they are per policy */
        if (!have_governor_per_policy())
            global_tunables = gd->tunables;
    } else {
        /* reuse the shared tunables and take a reference on them */
        gd->tunables = global_tunables;
        gov_attr_set_get(&global_tunables->attr_set,
                 &gd->tunables_hook);
    }

    if (cpufreq_driver_is_slow()) {
        cpufreq_driver_slow = true;
        /*
         * Slow drivers change frequency from a dedicated thread. It is
         * named after the first CPU of the policy (for example
         * kschedfreq:0 and kschedfreq:4 on a two-cluster 4+4 system).
         */
        gd->task = kthread_create(cpufreq_sched_thread, policy,
                      "kschedfreq:%d",
                      cpumask_first(policy->related_cpus));
        if (IS_ERR_OR_NULL(gd->task)) {
            pr_err("%s: failed to create kschedfreq thread\n",
                   __func__);
            rc = gd->task ? PTR_ERR(gd->task) : -ENOMEM;
            goto put_tunables;
        }
        get_task_struct(gd->task);
        /* keep the worker on the CPUs related to this policy */
        kthread_bind_mask(gd->task, policy->related_cpus);
        wake_up_process(gd->task);
        init_irq_work(&gd->irq_work, cpufreq_sched_irq_work);
    }

    set_sched_freq();

    return 0;

put_tunables:
    /*
     * The tunables are fully set up at this point and may be shared with
     * other policies. Drop only our reference; free them (and forget the
     * global pointer) only if we were the last user.
     */
    if (gov_attr_set_put(&gd->tunables->attr_set, &gd->tunables_hook))
        goto free_gd;
    if (!have_governor_per_policy())
        global_tunables = NULL;
free_tunables:
    kfree(gd->tunables);
free_gd:
    policy->governor_data = NULL;
    kfree(gd);
    return rc;
}

           

我们能够看到,上面最重要的信息如下:

  • 频率升高的时间限制
  • 频率下降的时间限制
  • 创建的thread,cpufreq_sched_thread,频率调节的进程
  • 初始化一个irq_work,callback函数为cpufreq_sched_irq_work,它最终还是wake up gd->task,也就是让cpufreq_sched_thread这个线程函数继续运行。

接下来看一下,cpufreq_sched_thread这个函数的实现过程:

/*
 * cpufreq_sched_thread - worker that applies the requested frequency.
 * @data: the struct cpufreq_policy this worker serves
 *
 * Switches itself to SCHED_FIFO priority 50, then loops: while
 * gd->requested_freq equals the last request it handled it sleeps;
 * otherwise it waits out the throttle period and hands the request to
 * cpufreq_sched_try_driver_target(). Returns 0 once asked to stop.
 *
 * we pass in struct cpufreq_policy. This is safe because changing out the
 * policy requires a call to __cpufreq_governor(policy, CPUFREQ_GOV_STOP),
 * which tears down all of the data structures and __cpufreq_governor(policy,
 * CPUFREQ_GOV_START) will do a full rebuild, including this kthread with the
 * new policy pointer
 */
static int cpufreq_sched_thread(void *data)
{
    struct sched_param param;
    struct cpufreq_policy *policy;
    struct gov_data *gd;
    unsigned int new_request = 0;
    unsigned int last_request = 0;
    int ret;

    policy = (struct cpufreq_policy *) data;
    gd = policy->governor_data;

    param.sched_priority = 50;
    ret = sched_setscheduler_nocheck(gd->task, SCHED_FIFO, &param);
    if (ret) {
        pr_warn("%s: failed to set SCHED_FIFO\n", __func__);
        do_exit(-EINVAL);
    } else {
        pr_debug("%s: kthread (%d) set to SCHED_FIFO\n",
                __func__, gd->task->pid);
    }

    do {
        /* frequency currently requested by the governor */
        new_request = gd->requested_freq;
        /*
         * Nothing new to do: sleep until woken up.
         *
         * NOTE(review): requested_freq is sampled before
         * set_current_state(); a wake-up arriving between the two
         * looks like it would be missed until the next request.
         * Confirm against the irq_work handler.
         */
        if (new_request == last_request) {
            set_current_state(TASK_INTERRUPTIBLE);
            if (kthread_should_stop())
                break;
            schedule();
        } else {
            /*
             * if the frequency thread sleeps while waiting to be
             * unthrottled, start over to check for a newer request
             */
            if (finish_last_request(gd, policy->cur))
                continue;
            last_request = new_request;
            /*
             * Ask the driver for the new frequency; this also
             * re-arms the up and down throttle expiry times.
             */
            cpufreq_sched_try_driver_target(policy, new_request);
        }
    } while (!kthread_should_stop());

    return 0;
}

           

我们来看一下finish_last_request函数的实现:

/*
 * finish_last_request - wait out the throttle period of the last change.
 * @gd: governor data of the policy
 * @cur_freq: current frequency of the policy
 *
 * Uses the down throttle expiry when the requested frequency is below
 * @cur_freq and the up throttle expiry otherwise.
 *
 * Returns false straight away when that expiry has already passed, so the
 * caller may change the frequency now. Otherwise sleeps until it has
 * passed and returns true, telling the caller that time has gone by and
 * the request should be read again.
 */
static bool finish_last_request(struct gov_data *gd, unsigned int cur_freq)
{
    ktime_t now = ktime_get();

    ktime_t throttle = gd->requested_freq < cur_freq ?
        gd->down_throttle : gd->up_throttle;

    if (ktime_after(now, throttle))
        return false;

    while (1) {
        /*
         * now has not passed throttle here, so the difference is not
         * negative. Convert it to microseconds in 64 bits: storing the
         * nanosecond count in an int first would overflow for throttle
         * periods above roughly 2.1 seconds.
         */
        unsigned long usec_left = ktime_to_us(ktime_sub(throttle, now));

        trace_cpufreq_sched_throttled(usec_left);
        /* sleep for the remaining time, with up to 100us of slack */
        usleep_range(usec_left, usec_left + 100);
        now = ktime_get();
        /* woken early: go round again with the new remaining time */
        if (ktime_after(now, throttle))
            return true;
    }
}

           

如果没有外部触发这个thread,new_request会一直等于last_request,线程就一直休眠,频率也就不会更新。因此必然是由调度器一侧来trigger governor调节频率,入口就是下面这个函数:

/*
 * update_cpu_capacity_request - refresh the total capacity request of a CPU.
 * @cpu: CPU whose request is recomputed
 * @request: when true, pass a changed total on to the frequency domain
 *
 * The total is the sum of the cfs and rt requests scaled by
 * capacity_margin / SCHED_CAPACITY_SCALE, plus the dl request, which is
 * added without margin. Nothing happens when the total did not change.
 */
void update_cpu_capacity_request(int cpu, bool request)
{
    struct sched_capacity_reqs *reqs;
    unsigned long total;

    /* The rq lock serializes access to the CPU's sched_capacity_reqs. */
    lockdep_assert_held(&cpu_rq(cpu)->lock);

    reqs = &per_cpu(cpu_sched_capacity_reqs, cpu);

    /* cfs and rt get the margin applied, dl does not */
    total = reqs->cfs + reqs->rt;
    total = total * capacity_margin
        / SCHED_CAPACITY_SCALE;
    total += reqs->dl;

    if (total != reqs->total) {
        trace_cpufreq_sched_update_capacity(cpu, request, reqs, total);

        /* remember the new total before acting on it */
        reqs->total = total;

        if (request)
            update_fdomain_capacity_request(cpu);
    }
}

           

看看update_fdomain_capacity_request函数的实现原理(比较简单):

/*
 * update_fdomain_capacity_request - turn the capacity requests of a
 * frequency domain into a frequency request.
 * @cpu: CPU whose capacity request changed
 *
 * Takes the largest total capacity request among the CPUs of @cpu's policy,
 * scales policy->max by it, maps the result to a frequency table entry and
 * clamps it to the policy limits. When that differs from the last requested
 * frequency it is recorded in gd->requested_freq and either the irq_work is
 * queued (slow drivers) or the driver is targeted directly.
 */
static void update_fdomain_capacity_request(int cpu)
{
    unsigned int freq_new, index_new, cpu_tmp;
    struct cpufreq_policy *policy;
    struct gov_data *gd;
    unsigned long capacity = 0;

    /*
     * Avoid grabbing the policy if possible. A test is still
     * required after locking the CPU's policy to avoid racing
     * with the governor changing.
     *
     * The per-CPU flag says whether the sched governor is enabled here.
     */
    if (!per_cpu(enabled, cpu))
        return;
    /* get the cpufreq policy of this CPU; released again at "out" */
    policy = cpufreq_cpu_get(cpu);
    if (IS_ERR_OR_NULL(policy))
        return;

    if (policy->governor != &cpufreq_gov_sched ||
        !policy->governor_data)
        goto out;
    /* governor data of this policy */
    gd = policy->governor_data;

    /* find max capacity requested by cpus in this policy */
    for_each_cpu(cpu_tmp, policy->cpus) {
        struct sched_capacity_reqs *scr;

        scr = &per_cpu(cpu_sched_capacity_reqs, cpu_tmp);
        capacity = max(capacity, scr->total);
    }

    /*
     * Convert the new maximum capacity request into a cpu frequency:
     * policy->max scaled by capacity / 2^SCHED_CAPACITY_SHIFT.
     */
    freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT;
    /* look up the frequency table index for freq_new (CPUFREQ_RELATION_L) */
    if (cpufreq_frequency_table_target(policy, policy->freq_table,
                       freq_new, CPUFREQ_RELATION_L,
                       &index_new))
        goto out;
    /* frequency stored in the table at that index */
    freq_new = policy->freq_table[index_new].frequency;
    /* keep the result within the policy limits */
    if (freq_new > policy->max)
        freq_new = policy->max;

    if (freq_new < policy->min)
        freq_new = policy->min;

    trace_cpufreq_sched_request_opp(cpu, capacity, freq_new,
                    gd->requested_freq);
    if (freq_new == gd->requested_freq)
        goto out;
    /* record the new request in the governor data */
    gd->requested_freq = freq_new;

    /*
     * Throttling is not yet supported on platforms with fast cpufreq
     * drivers.
     */
    if (cpufreq_driver_slow)
        /* slow driver: kick the frequency change through the irq_work */
        irq_work_queue_on(&gd->irq_work, cpu);
    else
        cpufreq_sched_try_driver_target(policy, freq_new);

out:
    cpufreq_cpu_put(policy);
}

           

最后都是执行到如下函数中:

/*
 * cpufreq_sched_try_driver_target - ask the cpufreq driver for @freq.
 * @policy: policy to retarget
 * @freq: frequency to request
 *
 * Does nothing when policy->rwsem cannot be taken for writing without
 * blocking. Otherwise requests @freq with CPUFREQ_RELATION_L and re-arms
 * both throttle expiry times from the current time.
 */
static void cpufreq_sched_try_driver_target(struct cpufreq_policy *policy,
                        unsigned int freq)
{
    struct gov_data *gov = policy->governor_data;

    /* avoid race with cpufreq_sched_stop */
    if (down_write_trylock(&policy->rwsem)) {
        __cpufreq_driver_target(policy, freq, CPUFREQ_RELATION_L);

        /* next points in time at which raising / lowering is allowed */
        gov->up_throttle = ktime_add_ns(ktime_get(),
                        gov->tunables->up_throttle_nsec);
        gov->down_throttle = ktime_add_ns(ktime_get(),
                          gov->tunables->down_throttle_nsec);

        up_write(&policy->rwsem);
    }
}

           

至此完毕,比较简单。关键还是这个函数update_cpu_capacity_request在哪里调用的。调用关系方框图如下:

[cpufreq governor]sched governor解析

对于sched governor和schedutil governor哪个更好,还没有详细测试过,目前不好下定论,以后再来对比下。

最后,对于现存的cpufreq governor全部分析完毕:

  1. powersave
  2. performance
  3. userspace
  4. ondemand
  5. conservative
  6. interactive
  7. schedutil (https://blog.csdn.net/wukongmingjing/article/details/81784727)
  8. sched

跨度四年了,呵呵,人生能够有几个这样的年月呢,加油吧,中年老男人!!!

继续阅读