Bugzilla – Attachment 12973 Details for Bug 69330: Outline keeps getting stuck in a seemingly infinite loop
Description: This is a file for which the reported behaviour was observed
Filename:    sched.c
MIME Type:   text/plain
Creator:     Peter Williams
Created:     2004-07-05 20:18:26 EDT
Size:        119.49 KB
/*
 * kernel/sched.c
 *
 * Kernel scheduler and related syscalls
 *
 * Copyright (C) 1991-2002 Linus Torvalds
 *
 * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and
 *            make semaphores SMP safe
 * 1998-11-19 Implemented schedule_timeout() and related stuff
 *            by Andrea Arcangeli
 * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar:
 *            hybrid priority-list and round-robin design with
 *            an array-switch method of distributing timeslices
 *            and per-CPU runqueues.  Cleanups and useful suggestions
 *            by Davide Libenzi, preemptible kernel bits by Robert Love.
 * 2003-09-03 Interactivity tuning by Con Kolivas.
 * 2004-04-02 Scheduler domains code by Nick Piggin
 * 2004-06-11 New staircase scheduling policy by Con Kolivas with help
 *            from William Lee Irwin III, Zwane Mwaikambo & Peter Williams.
 * 2004-06-03 Single priority array, simplified interactive bonus
 *            mechanism and throughput bonus mechanism by Peter Williams
 *            (Courtesy of Aurema Pty Ltd, www.aurema.com)
 */

#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <asm/uaccess.h>
#include <linux/highmem.h>
#include <linux/smp_lock.h>
#include <asm/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/suspend.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/smp.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/percpu.h>
#include <linux/kthread.h>

#include <asm/unistd.h>

enum sched_mode_enum {
	SCHED_MODE_STAIRCASE,
	SCHED_MODE_PRIORITY_BASED,
	SCHED_MODE_ENTITLEMENT_BASED
};

static enum sched_mode_enum sched_mode = SCHED_MODE_STAIRCASE;

#ifdef CONFIG_SYSCTL
static const char *sched_mode_names[] = {
	"sc",	/* SCHED_MODE_STAIRCASE */
	"pb",	/* SCHED_MODE_PRIORITY_BASED */
	"eb",	/* SCHED_MODE_ENTITLEMENT_BASED */
	NULL	/* end of list marker */
};
#endif

/*
 * Convert user-nice values [ -20 ... 0 ... 19 ]
 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
 * and back.
 */
#define NICE_TO_PRIO(nice)	(MAX_RT_PRIO + (nice) + 20)
#define PRIO_TO_NICE(prio)	((prio) - MAX_RT_PRIO - 20)
#define TASK_NICE(p)		PRIO_TO_NICE((p)->static_prio)

/*
 * 'User priority' is the nice value converted to something we
 * can work with better when scaling various scheduler parameters,
 * it's a [ 0 ... 39 ] range.
 */
#define USER_PRIO(p)		((p)-MAX_RT_PRIO)
#define TASK_USER_PRIO(p)	USER_PRIO((p)->static_prio)
#define MAX_USER_PRIO		(USER_PRIO(MAX_PRIO))

/*
 * Some helpers for converting nanosecond timing to jiffy resolution
 */
#define NS_TO_JIFFIES(TIME)	((TIME) / (1000000000 / HZ))

static int sched_compute = 0;
/*
 * This is the time all tasks within the same priority round robin.
 * compute setting is reserved for dedicated computational scheduling
 * and has ten times larger intervals.
 */
#define _RR_INTERVAL		((10 * HZ / 1000) ? : 1)
#define RR_INTERVAL()		(_RR_INTERVAL * (1 + 9 * sched_compute))
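/*
 * Worked example (assuming a kernel built with HZ=1000): _RR_INTERVAL
 * evaluates to (10 * 1000 / 1000) = 10 jiffies, i.e. a 10 ms round-robin
 * interval; at HZ=100 it is 1 jiffy.  The gcc "?:" extension substitutes
 * 1 if the expression rounds to 0.  With sched_compute set, RR_INTERVAL()
 * is ten times larger: 100 jiffies at HZ=1000.
 */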
/*
 * These are the 'tuning knobs' of the scheduler:
 * Making MAX_TOTAL_BONUS bigger than 19 causes mysterious crashes during boot
 * this causes the number of longs in the bitmap to increase from 5 to 6
 * and that's a limit on bit map size P.W.
 */
#define MAX_TOTAL_BONUS 19
#define MAX_MAX_IA_BONUS 10
#define MAX_MAX_TPT_BONUS (MAX_TOTAL_BONUS - MAX_MAX_IA_BONUS)
#define DEFAULT_MAX_IA_BONUS MAX_MAX_IA_BONUS
#define DEFAULT_MAX_TPT_BONUS ((DEFAULT_MAX_IA_BONUS) / 2)
static unsigned int max_ia_bonus = DEFAULT_MAX_IA_BONUS;
static unsigned int initial_ia_bonus = 1;
static unsigned int max_tpt_bonus = DEFAULT_MAX_TPT_BONUS;

/*
 * Define some mini Kalman filter for estimating various averages, etc.
 * To make it more efficient the denominator of the fixed point rational
 * numbers used to store the averages and the response half life will
 * be chosen so that the fixed point rational number representation
 * of (1 - alpha) * i (where i is an integer) will be i.
 * Some of this is defined in linux/sched.h
 */

/*
 * Fixed denominator rational numbers for use by the CPU scheduler
 */
#define SCHED_AVG_OFFSET 4
/*
 * Get the rounded integer value of a scheduling statistic average field
 * i.e. those fields whose names begin with avg_
 */
#define SCHED_AVG_RND(x) \
	(((x) + (1 << (SCHED_AVG_OFFSET - 1))) >> (SCHED_AVG_OFFSET))
#define SCHED_AVG_ALPHA ((1 << SCHED_AVG_OFFSET) - 1)
#define SCHED_AVG_MUL(a, b) (((a) * (b)) >> SCHED_AVG_OFFSET)
#define SCHED_AVG_REAL(a) ((a) << SCHED_AVG_OFFSET)
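/*
 * Worked example: with SCHED_AVG_OFFSET == 4 the averages are stored as
 * fixed point numbers with denominator 16, and SCHED_AVG_ALPHA == 15.
 * Each decay step therefore computes avg = avg * 15 / 16 (roughly a
 * 0.94 smoothing factor), SCHED_AVG_REAL(1) == 16 is the fixed point
 * form of 1, and SCHED_AVG_RND(24) == (24 + 8) >> 4 == 2 rounds back
 * to the nearest integer.
 */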
/*
 * Convert nice to shares
 * Proportional symmetry is aimed for: i.e.
 * (nice_to_shares(0) / nice_to_shares(19)) == (nice_to_shares(-20) / nice_to_shares(0))
 * Make sure that this function is robust for variations of EB_SHARES_PER_NICE
 */
static inline unsigned int nice_to_shares(int nice)
{
	unsigned int result = DEFAULT_EB_SHARES;

	if (nice > 0)
		result -= (nice * (20 * EB_SHARES_PER_NICE - 1)) / 19;
	else if (nice < 0)
		result += (nice * nice * ((20 * EB_SHARES_PER_NICE - 1) * EB_SHARES_PER_NICE)) / 20;

	return result;
}

#define SCHED_IA_BONUS_OFFSET 8
#define SCHED_IA_BONUS_ALPHA ((1 << SCHED_IA_BONUS_OFFSET) - 1)
#define SCHED_IA_BONUS_MUL(a, b) (((a) * (b)) >> SCHED_IA_BONUS_OFFSET)
/*
 * Get the rounded integer value of the interactive bonus
 */
#define SCHED_IA_BONUS_RND(x) \
	(((x) + (1 << (SCHED_IA_BONUS_OFFSET - 1))) >> (SCHED_IA_BONUS_OFFSET))

static inline void apply_sched_avg_decay(unsigned long long *valp)
{
	*valp = SCHED_AVG_MUL(*valp, SCHED_AVG_ALPHA);
}

static inline void update_sched_ia_bonus(struct task_struct *p, unsigned long long incr)
{
	p->interactive_bonus = SCHED_AVG_MUL(p->interactive_bonus, SCHED_AVG_ALPHA);
	p->interactive_bonus += incr;
}

static inline unsigned long long sched_div_64(unsigned long long a, unsigned long long b)
{
#if BITS_PER_LONG < 64
	/*
	 * Assume that there's no 64 bit divide available
	 */
	if (a < b)
		return 0;
	/*
	 * Scale down until b less than 32 bits so that we can do
	 * a divide using do_div()
	 */
	while (b > ULONG_MAX) { a >>= 1; b >>= 1; }

	(void)do_div(a, (unsigned long)b);

	return a;
#else
	return a / b;
#endif
}

#define PROPORTION_OFFSET 32
#define PROPORTION_ONE ((unsigned long long)1 << PROPORTION_OFFSET)
#define PROPORTION_OVERFLOW (((unsigned long long)1 << (64 - PROPORTION_OFFSET)) - 1)
#define PROP_FM_PPT(a) (((unsigned long long)(a) * PROPORTION_ONE) / 1000)
/*
 * Convert a / b to a proportion in the range 0 to PROPORTION_ONE
 * Requires a <= b or may get a divide by zero exception
 */
static inline unsigned long long calc_proportion(unsigned long long a, unsigned long long b)
{
	if (unlikely(a == b))
		return PROPORTION_ONE;

	while (a > PROPORTION_OVERFLOW) { a >>= 1; b >>= 1; }

	return sched_div_64(a << PROPORTION_OFFSET, b);
}

/*
 * Map the given proportion to an unsigned long long in the specified range
 * Requires range < PROPORTION_ONE to avoid overflow
 */
static inline unsigned long long map_proportion(unsigned long long prop, unsigned long long range)
{
	return (prop * range) >> PROPORTION_OFFSET;
}

static inline unsigned long long map_proportion_rnd(unsigned long long prop, unsigned long long range)
{
	return map_proportion((prop >> 1), (range * 2 + 1));
}
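/*
 * Worked example: proportions are 32.32 fixed point, so PROPORTION_ONE
 * is 2^32.  calc_proportion(1, 4) yields 2^32 / 4 == 2^30, i.e. 0.25,
 * and map_proportion(2^30, 100) == (2^30 * 100) >> 32 == 25 maps that
 * proportion onto the range 0..100.  map_proportion_rnd() computes
 * (prop / 2) * (2 * range + 1) >> 32, which rounds to nearest instead
 * of truncating.
 */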
/*
 * Tasks that have a CPU usage rate greater than this threshold (in parts per
 * thousand) are considered to be CPU bound and start to lose interactive bonus
 * points
 */
#define DEFAULT_CPU_HOG_THRESHOLD 900
static unsigned int cpu_hog_threshold_ppt = DEFAULT_CPU_HOG_THRESHOLD;
static unsigned long long cpu_hog_threshold = PROP_FM_PPT(DEFAULT_CPU_HOG_THRESHOLD);

/*
 * Tasks that would sleep for more than 900 parts per thousand of the time if
 * they had the CPU to themselves are considered to be interactive provided
 * that their average sleep duration per scheduling cycle isn't too long
 */
#define DEFAULT_IA_THRESHOLD 900
static unsigned int ia_threshold_ppt = DEFAULT_IA_THRESHOLD;
static unsigned long long ia_threshold = PROP_FM_PPT(DEFAULT_IA_THRESHOLD);
#define LOWER_MAX_IA_SLEEP SCHED_AVG_REAL(15 * 60LL * NSEC_PER_SEC)
#define UPPER_MAX_IA_SLEEP SCHED_AVG_REAL(2 * 60 * 60LL * NSEC_PER_SEC)

/*
 * What "base time slice" for nice 0 and "average time slice" evaluated to
 */
#define MSECS_TO_JIFFIES(x) (((x) * (HZ * 2 + 1)) / 2000)
#define MSECS_TO_JIFFIES_MIN_1(x) (MSECS_TO_JIFFIES(x) ? MSECS_TO_JIFFIES(x) : 1)
#define DEFAULT_TIME_SLICE_MSECS 100
#define MAX_TIME_SLICE_MSECS 1000

static unsigned int time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(DEFAULT_TIME_SLICE_MSECS);

static unsigned int slice(const task_t *p);
static inline unsigned int task_timeslice(const task_t *p)
{
	if (sched_mode == SCHED_MODE_STAIRCASE)
		return slice(p);

	return time_slice_ticks;
}

#define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time)

/*
 * These are the runqueue data structures:
 */
#define IDLE_PRIO (MAX_PRIO + MAX_TOTAL_BONUS)
#define NUM_PRIO_SLOTS (IDLE_PRIO + 1)

/*
 * Is the run queue idle?
 */
#define RUNQUEUE_IDLE(rq) ((rq)->curr == (rq)->idle)

/*
 * Control values for niceness
 */
#define PROSPECTIVE_BASE_PROM_INTERVAL_MSECS ((DEFAULT_TIME_SLICE_MSECS * 110) / 100)
#if (PROSPECTIVE_BASE_PROM_INTERVAL_MSECS > 0)
#define BASE_PROM_INTERVAL_MSECS PROSPECTIVE_BASE_PROM_INTERVAL_MSECS
#else
#define BASE_PROM_INTERVAL_MSECS DEFAULT_TIME_SLICE_MSECS
#endif
static unsigned int base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(BASE_PROM_INTERVAL_MSECS);

typedef struct runqueue runqueue_t;

struct prio_slot {
	unsigned int prio;
	struct list_head queue;
};

/*
 * This is the main, per-CPU runqueue data structure.
 *
 * Locking rule: those places that want to lock multiple runqueues
 * (such as the load balancing or the thread migration code), lock
 * acquire operations must be ordered by ascending &runqueue.
 */
struct runqueue {
	spinlock_t lock;

	/*
	 * nr_running and cpu_load should be in the same cacheline because
	 * remote CPUs use both these fields when doing load calculation.
	 */
	unsigned long nr_running;
#ifdef CONFIG_SMP
	unsigned long cpu_load;
#endif
	unsigned long long nr_switches;
	unsigned long nr_uninterruptible;
	unsigned long long timestamp_last_tick;
	unsigned long long total_delay;
	unsigned int cache_ticks, preempted;
	task_t *curr, *idle;
	struct mm_struct *prev_mm;
	DECLARE_BITMAP(bitmap, NUM_PRIO_SLOTS);
	struct prio_slot queues[NUM_PRIO_SLOTS];
	struct prio_slot *current_prio_slot;
	unsigned long next_prom_due;
	atomic_t nr_iowait;

	unsigned long long eb_yardstick;
	unsigned long long eb_ticks_to_decay;

#ifdef CONFIG_SMP
	struct sched_domain *sd;

	/* For active balancing */
	int active_balance;
	int push_cpu;

	task_t *migration_thread;
	struct list_head migration_queue;
#endif
};

static DEFINE_PER_CPU(struct runqueue, runqueues);

#define for_each_domain(cpu, domain) \
	for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent)

#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
#define task_rq(p) cpu_rq(task_cpu(p))
#define cpu_curr(cpu) (cpu_rq(cpu)->curr)

/*
 * Default context-switch locking:
 */
#ifndef prepare_arch_switch
# define prepare_arch_switch(rq, next) do { } while (0)
# define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock)
# define task_running(rq, p) ((rq)->curr == (p))
#endif

static inline unsigned long get_prom_interval(const struct runqueue *rq)
{
	if (rq->nr_running < 2)
		return base_prom_interval_ticks;
	return (rq->nr_running - 1) * base_prom_interval_ticks;
}

static inline void decay_eb_yardstick(runqueue_t *rq)
{
	static const unsigned long long decay_per_interval = PROP_FM_PPT(990);

	rq->eb_yardstick = map_proportion(decay_per_interval, rq->eb_yardstick);
	rq->eb_ticks_to_decay = time_slice_ticks;
}

#define EB_PAR 19
static inline void set_eb_yardstick(runqueue_t *rq, task_t *p)
{
	rq->eb_yardstick = p->cpu_usage_rate_per_share;
	p->eb_priority = MAX_RT_PRIO + EB_PAR;
	rq->eb_ticks_to_decay = time_slice_ticks;
}

static inline int task_should_be_yardstick(const task_t *p, const runqueue_t *rq)
{
	return (p->cpu_usage_rate_per_share > rq->eb_yardstick);
}

static inline void update_eb_yardstick(task_t *p, runqueue_t *rq)
{
	if (unlikely(rt_task(p)))
		return;
	if (task_should_be_yardstick(p, rq))
		set_eb_yardstick(rq, p);
}
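/*
 * Worked example: decay_per_interval is PROP_FM_PPT(990), the fixed
 * point form of 0.990, so each expiry of eb_ticks_to_decay shrinks the
 * yardstick by 1%: a yardstick of 1000 becomes 990, then approximately
 * 980 a time slice later.  Any runnable task whose usage rate per
 * share exceeds the decayed yardstick becomes the new yardstick via
 * set_eb_yardstick().
 */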
/*
 * task_rq_lock - lock the runqueue a given task resides on and disable
 * interrupts.  Note the ordering: we can safely lookup the task_rq without
 * explicitly disabling preemption.
 */
static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags)
{
	struct runqueue *rq;

repeat_lock_task:
	local_irq_save(*flags);
	rq = task_rq(p);
	spin_lock(&rq->lock);
	if (unlikely(rq != task_rq(p))) {
		spin_unlock_irqrestore(&rq->lock, *flags);
		goto repeat_lock_task;
	}
	return rq;
}

static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags)
{
	spin_unlock_irqrestore(&rq->lock, *flags);
}

/*
 * rq_lock - lock a given runqueue and disable interrupts.
 */
static runqueue_t *this_rq_lock(void)
{
	runqueue_t *rq;

	local_irq_disable();
	rq = this_rq();
	spin_lock(&rq->lock);

	return rq;
}

static inline void rq_unlock(runqueue_t *rq)
{
	spin_unlock_irq(&rq->lock);
}

static inline int preemption_warranted(unsigned int prio,
		const struct task_struct *p, runqueue_t *rq)
{
	if (prio < rq->current_prio_slot->prio) {
		if (rt_task(p) || !sched_compute ||
		    rq->cache_ticks >= cache_decay_ticks ||
		    !p->mm || rq->curr == rq->idle)
			return 1;
		rq->preempted = 1;
	}

	return 0;
}

static inline int task_queued(const task_t *task)
{
	return !list_empty(&task->run_list);
}

/*
 * Adding/removing a task to/from a runqueue:
 */
static void dequeue_task(struct task_struct *p)
{
	/*
	 * If p is the last task in this priority slot then slotp will be
	 * a pointer to the head of the list in the runqueue structure
	 */
	struct list_head *slotp = p->run_list.next;

	/*
	 * Initialize after removal from the list so that list_empty() works
	 * as a means for testing whether the task is runnable
	 */
	list_del_init(&p->run_list);
	if (list_empty(slotp))
		__clear_bit(list_entry(slotp, struct prio_slot, queue)->prio, task_rq(p)->bitmap);
}

static void enqueue_task(struct task_struct *p, runqueue_t *rq, int prio)
{
	list_add_tail(&p->run_list, &rq->queues[prio].queue);
	__set_bit(prio, rq->bitmap);
}

/*
 * Used by the migration code - we pull tasks from the head of the
 * remote queue so we want these tasks to show up at the head of the
 * local queue:
 */
static inline void enqueue_task_head(struct task_struct *p, runqueue_t *rq, int prio)
{
	list_add(&p->run_list, &rq->queues[prio].queue);
	__set_bit(prio, rq->bitmap);
}

/*
 * __activate_task - move a task to the runqueue.
 */
static inline void __activate_task(task_t *p, runqueue_t *rq, int prio)
{
	if (sched_mode != SCHED_MODE_STAIRCASE)
		p->time_slice = task_timeslice(p);
	enqueue_task(p, rq, prio);
	rq->nr_running++;
}

/*
 * burst - extra intervals an interactive task can run for at best priority
 * instead of descending priorities.
 */
static unsigned int burst(const task_t *p)
{
	unsigned int task_user_prio;
	if (rt_task(p))
		return p->burst;
	task_user_prio = TASK_USER_PRIO(p);
	if (likely(task_user_prio < 40))
		return 39 - task_user_prio;
	else
		return 0;
}

static void inc_burst(task_t *p)
{
	unsigned int best_burst;
	best_burst = burst(p);
	if (p->burst < best_burst)
		p->burst++;
}

static void dec_burst(task_t *p)
{
	if (p->burst)
		p->burst--;
}

/*
 * slice - the duration a task runs before getting requeued at its best
 * priority and has its burst decremented.
 */
static unsigned int slice(const task_t *p)
{
	unsigned int slice = RR_INTERVAL();
	if (!rt_task(p))
		slice += burst(p) * RR_INTERVAL();
	return slice;
}

/*
 * sched_interactive - sysctl which allows interactive tasks to have bursts
 */
static int sched_interactive = 1;

static int hog_sub_cycle_threshold = 10;
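/*
 * Worked example: for a nice 0 task TASK_USER_PRIO() is 20, so burst()
 * caps p->burst at 39 - 20 = 19 and slice() spans RR_INTERVAL() +
 * 19 * RR_INTERVAL() == 20 round-robin intervals; a nice 19 task
 * (TASK_USER_PRIO() == 39) gets a single interval.  p->burst itself
 * climbs via inc_burst() after qualifying sleeps and drops via
 * dec_burst() when a slice is used up (see recalc_task_prio() below).
 */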
/*
 * Calculate CPU usage rate and sleepiness.
 * This never gets called on real time tasks
 */
static void calculate_rates(task_t *p)
{
	unsigned long long bl = p->avg_sleep_per_cycle + p->avg_cpu_per_cycle;

	if (unlikely(bl == 0)) {
		p->sleepiness = PROPORTION_ONE;
		p->cpu_usage_rate = 0;
	} else {
		unsigned long long edpss = p->avg_delay_per_cycle;

		p->sleepiness = calc_proportion(p->avg_sleep_per_cycle, bl);
		p->cpu_usage_rate = calc_proportion(p->avg_cpu_per_cycle, edpss + bl);
		if (unlikely(p->sub_cycle_count > hog_sub_cycle_threshold)) {
			unsigned long long scu;
			unsigned long long sbl;

			sbl = p->avg_delay_per_sub_cycle + p->avg_cpu_per_sub_cycle;
			scu = calc_proportion(p->avg_cpu_per_sub_cycle, sbl);
			/* if (scu > p->cpu_usage_rate) */
				p->cpu_usage_rate = scu;
		}
	}
	p->cpu_usage_rate_per_share = sched_div_64(p->cpu_usage_rate, p->eb_shares);
}

/*
 * Calculate entitlement based priority.
 * This never gets called on real time tasks
 */
static void calculate_eb_priority(task_t *p, const runqueue_t *rq)
{
	/*
	 * Prevent possible divide by zero and take shortcut
	 */
	if (unlikely(p->cpu_usage_rate_per_share == 0)) {
		p->eb_priority = MAX_RT_PRIO;
	} else if (unlikely(p->cpu_usage_rate_per_share > rq->eb_yardstick)) {
		unsigned long long prop = calc_proportion(rq->eb_yardstick, p->cpu_usage_rate_per_share);

		p->eb_priority = MAX_PRIO - map_proportion_rnd(prop, EB_PAR + 1);
	} else {
		unsigned long long prop = calc_proportion(p->cpu_usage_rate_per_share, rq->eb_yardstick);

		p->eb_priority = MAX_RT_PRIO + map_proportion_rnd(prop, EB_PAR);
	}
}
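/*
 * Worked example: with EB_PAR == 19, a task running exactly at the
 * yardstick rate lands at eb_priority MAX_RT_PRIO + 19.  A task using
 * half the yardstick rate has calc_proportion() == 2^31, and
 * map_proportion_rnd(2^31, 19) == (2^30 * 39) >> 32 == 9, giving
 * eb_priority MAX_RT_PRIO + 9; tasks hungrier than the yardstick map
 * into the range just below MAX_PRIO instead.
 */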
/*
 * Update various statistics for the end of a
 * ((on_run_queue :-> on_cpu)* :-> sleep) cycle.
 * We can't just do this in activate_task() as every invocation of that
 * function is not the genuine end of a cycle.
 */
static void update_stats_for_cycle(task_t *p, const runqueue_t *rq)
{
	unsigned long long delta;

	apply_sched_avg_decay(&p->avg_delay_per_cycle);
	apply_sched_avg_decay(&p->avg_cpu_per_cycle);
	delta = (rq->timestamp_last_tick - p->sched_timestamp);
	p->avg_sleep_per_cycle += delta;
	p->total_sleep += delta;
	/*
	 * Do this second so that averages for all measures are for
	 * the current cycle
	 */
	apply_sched_avg_decay(&p->avg_sleep_per_cycle);
	p->sched_timestamp = rq->timestamp_last_tick;
	p->sub_cycle_count = 0;
	p->cycle_count++;
	if (!rt_task(p)) {
		/* we don't care about these for real time tasks */
		apply_sched_avg_decay(&p->avg_delay_per_sub_cycle);
		apply_sched_avg_decay(&p->avg_cpu_per_sub_cycle);
		if (sched_mode != SCHED_MODE_STAIRCASE) {
			calculate_rates(p);
			if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED)
				calculate_eb_priority(p, rq);
		}
	}
}

/*
 * Check whether a task with an interactive bonus still qualifies and if not
 * decrease its bonus
 * This never gets called on real time tasks
 */
static void reassess_cpu_boundness(task_t *p)
{
	/*
	 * No point going any further if there's no bonus to lose
	 */
	if (p->interactive_bonus == 0)
		return;

	if (p->cpu_usage_rate > cpu_hog_threshold)
		update_sched_ia_bonus(p, 0);
}

/*
 * Check whether a task qualifies for an interactive bonus and if it does
 * increase its bonus
 * This never gets called on real time tasks
 */
static void reassess_interactiveness(task_t *p)
{
	/*
	 * No sleep means not interactive (in most cases), but
	 */
	if (p->avg_sleep_per_cycle > LOWER_MAX_IA_SLEEP) {
		/*
		 * Really long sleeps mean it's probably not interactive
		 */
		if (p->avg_sleep_per_cycle > UPPER_MAX_IA_SLEEP)
			update_sched_ia_bonus(p, 0);
		return;
	}
	if (p->sleepiness > ia_threshold)
		update_sched_ia_bonus(p, p->sleepiness);
	else if (p->sub_cycle_count == 0)
		reassess_cpu_boundness(p);
}

/*
 * Check whether a task qualifies for a throughput bonus and if it does
 * give it one
 * This never gets called on real time tasks
 */
static void recalc_throughput_bonus(task_t *p, unsigned long long load)
{
	if (unlikely(p->sub_cycle_count > hog_sub_cycle_threshold)) {
		/*
		 * No delay means no bonus, but
		 * NB this test also avoids a possible divide by zero error if
		 * cpu is also zero
		 */
		if (p->avg_delay_per_sub_cycle == 0) {
			p->throughput_bonus = 0;
			return;
		}
		p->throughput_bonus = calc_proportion(p->avg_delay_per_sub_cycle,
			p->avg_delay_per_sub_cycle + load * p->avg_cpu_per_sub_cycle);
		return;
	}
	/*
	 * No delay means no bonus, but
	 * NB this test also avoids a possible divide by zero error if
	 * cpu is also zero
	 */
	if (p->avg_delay_per_cycle == 0) {
		p->throughput_bonus = 0;
		return;
	}
	p->throughput_bonus = calc_proportion(p->avg_delay_per_cycle,
		p->avg_delay_per_cycle + load * p->avg_cpu_per_cycle);
}
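/*
 * Worked example: update_sched_ia_bonus() keeps interactive_bonus as a
 * decaying average, bonus = bonus * 15/16 + incr.  A CPU hog above
 * cpu_hog_threshold is fed incr == 0, so its bonus halves roughly every
 * 11 cycles (0.9375^11 is about 0.49), while a task sleepier than
 * ia_threshold is fed its sleepiness each cycle and climbs towards the
 * full bonus.
 */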
/*
 * effective_prio - dynamic priority dependent on burst.
 * The priority normally decreases by one each RR_INTERVAL.
 * As the burst increases the priority stays at the top "stair" or
 * priority for longer.
 */
static int effective_prio(task_t *p)
{
	int prio;
	unsigned int full_slice, used_slice, first_slice;
	unsigned int best_burst;
	unsigned int miabl, mtpbl, bonus_factor;

	if (rt_task(p))
		return (MAX_USER_RT_PRIO - 1) - p->rt_priority;

	switch (sched_mode) {
	case SCHED_MODE_STAIRCASE:
		goto staircase_prio;
	case SCHED_MODE_ENTITLEMENT_BASED:
		prio = p->eb_priority;
		break;
	default:
		prio = p->static_prio;
	}

	/*
	 * kernel threads get maximum bonuses
	 */
	if (p->mm == NULL)
		return prio;

	miabl = max_ia_bonus;
	mtpbl = max_tpt_bonus;
	bonus_factor = (miabl + mtpbl);
	bonus_factor -= map_proportion_rnd(SCHED_IA_BONUS_RND(p->interactive_bonus), miabl);
	bonus_factor -= map_proportion_rnd(p->throughput_bonus, mtpbl);

	return prio + bonus_factor;

staircase_prio:
	best_burst = burst(p);
	full_slice = slice(p);
	used_slice = full_slice - p->slice;
	if (p->burst > best_burst)
		p->burst = best_burst;
	first_slice = RR_INTERVAL();
	if (sched_interactive && !sched_compute)
		first_slice *= (p->burst + 1);
	prio = MAX_PRIO - 1 - best_burst;

	if (used_slice < first_slice)
		return prio;
	prio += 1 + (used_slice - first_slice) / RR_INTERVAL();
	if (prio > MAX_PRIO - 1)
		prio = MAX_PRIO - 1;
	return prio;
}

/*
 * recalc_task_prio - this checks for tasks that run ultra short timeslices
 * or have just forked a thread/process and make them continue their old
 * slice instead of starting a new one at high priority.
 * This is not called on real time tasks
 */
static void recalc_task_prio(task_t *p, unsigned long long now)
{
	unsigned long sleep_time = now - p->timestamp;
	unsigned long ns_totalrun = p->totalrun + p->runtime;
	unsigned long total_run = NS_TO_JIFFIES(ns_totalrun);
	if (p->flags & PF_FORKED || ((!(NS_TO_JIFFIES(p->runtime)) ||
			!sched_interactive || sched_compute) &&
			NS_TO_JIFFIES(p->runtime + sleep_time) < RR_INTERVAL())) {
		p->flags &= ~PF_FORKED;
		if (p->slice - total_run < 1) {
			p->totalrun = 0;
			dec_burst(p);
		} else {
			p->totalrun = ns_totalrun;
			p->slice -= total_run;
		}
	} else {
		if (!(p->flags & PF_UISLEEP))
			inc_burst(p);
		p->runtime = 0;
		p->totalrun = 0;
	}
}

/*
 * activate_task - move a task to the runqueue and do priority recalculation
 * return prio to allow preemption testing
 */
static int activate_task(task_t *p, runqueue_t *rq, int local)
{
	int prio;
	unsigned long long now = sched_clock();

#ifdef CONFIG_SMP
	if (!local) {
		/* Compensate for drifting sched_clock */
		runqueue_t *this_rq = this_rq();
		now = (now - this_rq->timestamp_last_tick)
			+ rq->timestamp_last_tick;
	}
#endif
	if (sched_mode == SCHED_MODE_STAIRCASE) {
		if (!rt_task(p)) {
			p->slice = slice(p);
			recalc_task_prio(p, now);
		}
		p->time_slice = RR_INTERVAL();
	}
	p->flags &= ~PF_UISLEEP;
	prio = effective_prio(p);
	p->timestamp = now;
	__activate_task(p, rq, prio);

	return prio;
}

/*
 * deactivate_task - remove a task from the runqueue.
 */
static void deactivate_task(struct task_struct *p, runqueue_t *rq)
{
	rq->nr_running--;
	if (p->state == TASK_UNINTERRUPTIBLE) {
		p->flags |= PF_UISLEEP;
		rq->nr_uninterruptible++;
	}
	dequeue_task(p);
}
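/*
 * Worked example: with the default limits (max_ia_bonus == 10,
 * max_tpt_bonus == 5) bonus_factor starts at 15 and is reduced by the
 * mapped interactive and throughput bonuses, so a fully bonused task
 * keeps its base prio while a task with no bonuses runs at prio + 15,
 * a numerically larger and therefore weaker priority slot.
 */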
/*
 * resched_task - mark a task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
#ifdef CONFIG_SMP
static void resched_task(task_t *p)
{
	int need_resched, nrpolling;

	preempt_disable();
	/* minimise the chance of sending an interrupt to poll_idle() */
	nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);
	need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED);
	nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG);

	if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id()))
		smp_send_reschedule(task_cpu(p));
	preempt_enable();
}
#else
static inline void resched_task(task_t *p)
{
	set_tsk_need_resched(p);
}
#endif

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 */
inline int task_curr(task_t *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

#ifdef CONFIG_SMP
enum request_type {
	REQ_MOVE_TASK,
	REQ_SET_DOMAIN,
};

typedef struct {
	struct list_head list;
	enum request_type type;

	/* For REQ_MOVE_TASK */
	task_t *task;
	int dest_cpu;

	/* For REQ_SET_DOMAIN */
	struct sched_domain *sd;

	struct completion done;
} migration_req_t;

/*
 * The task's runqueue lock must be held.
 * Returns true if you have to wait for migration thread.
 */
static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req)
{
	runqueue_t *rq = task_rq(p);

	/*
	 * If the task is not on a runqueue (and not running), then
	 * it is sufficient to simply update the task's cpu field.
	 */
	if (!task_queued(p) && !task_running(rq, p)) {
		set_task_cpu(p, dest_cpu);
		return 0;
	}

	init_completion(&req->done);
	req->type = REQ_MOVE_TASK;
	req->task = p;
	req->dest_cpu = dest_cpu;
	list_add(&req->list, &rq->migration_queue);
	return 1;
}

/*
 * wait_task_inactive - wait for a thread to unschedule.
 *
 * The caller must ensure that the task *will* unschedule sometime soon,
 * else this function might spin for a *long* time. This function can't
 * be called with interrupts off, or it may introduce deadlock with
 * smp_call_function() if an IPI is sent by the same process we are
 * waiting to become inactive.
 */
void wait_task_inactive(task_t * p)
{
	unsigned long flags;
	runqueue_t *rq;
	int preempted;

repeat:
	rq = task_rq_lock(p, &flags);
	/* Must be off runqueue entirely, not preempted. */
	if (unlikely(task_queued(p))) {
		/* If it's preempted, we yield.  It could be a while. */
		preempted = !task_running(rq, p);
		task_rq_unlock(rq, &flags);
		cpu_relax();
		if (preempted)
			yield();
		goto repeat;
	}
	task_rq_unlock(rq, &flags);
}

/***
 * kick_process - kick a running thread to enter/exit the kernel
 * @p: the to-be-kicked thread
 *
 * Cause a process which is running on another CPU to enter
 * kernel-mode, without any delay. (to get signals handled.)
 */
void kick_process(task_t *p)
{
	int cpu;

	preempt_disable();
	cpu = task_cpu(p);
	if ((cpu != smp_processor_id()) && task_curr(p))
		smp_send_reschedule(cpu);
	preempt_enable();
}

EXPORT_SYMBOL_GPL(kick_process);
/*
 * Return a low guess at the load of a migration-source cpu.
 *
 * We want to under-estimate the load of migration sources, to
 * balance conservatively.
 */
static inline unsigned long source_load(int cpu)
{
	runqueue_t *rq = cpu_rq(cpu);
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;

	return min(rq->cpu_load, load_now);
}

/*
 * Return a high guess at the load of a migration-target cpu
 */
static inline unsigned long target_load(int cpu)
{
	runqueue_t *rq = cpu_rq(cpu);
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;

	return max(rq->cpu_load, load_now);
}

#endif

/*
 * wake_idle() is useful especially on SMT architectures to wake a
 * task onto an idle sibling if we would otherwise wake it onto a
 * busy sibling.
 *
 * Returns the CPU we should wake onto.
 */
#if defined(ARCH_HAS_SCHED_WAKE_IDLE)
static int wake_idle(int cpu, task_t *p)
{
	cpumask_t tmp;
	runqueue_t *rq = cpu_rq(cpu);
	struct sched_domain *sd;
	int i;

	if (idle_cpu(cpu))
		return cpu;

	sd = rq->sd;
	if (!(sd->flags & SD_WAKE_IDLE))
		return cpu;

	cpus_and(tmp, sd->span, cpu_online_map);
	for_each_cpu_mask(i, tmp) {
		if (!cpu_isset(i, p->cpus_allowed))
			continue;

		if (idle_cpu(i))
			return i;
	}

	return cpu;
}
#else
static inline int wake_idle(int cpu, task_t *p)
{
	return cpu;
}
#endif
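/*
 * Worked example: cpu_load is a decaying average of nr_running scaled
 * by SCHED_LOAD_SCALE (see rebalance_tick()).  If a runqueue averages
 * 2.5 tasks of load but currently runs only 1 task, source_load()
 * reports the smaller instantaneous value and target_load() the larger
 * average, so migration sources are under-estimated and targets
 * over-estimated, biasing the balancer against needless moves.
 */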
/***
 * try_to_wake_up - wake up a thread
 * @p: the to-be-woken-up thread
 * @state: the mask of task states that can be woken
 * @sync: do a synchronous wakeup?
 *
 * Put it on the run-queue if it's not already there. The "current"
 * thread is always on the run-queue (except when the actual
 * re-schedule is in progress), and as such you're allowed to do
 * the simpler "current->state = TASK_RUNNING" to mark yourself
 * runnable without the overhead of this.
 *
 * returns failure only if the task is already active.
 */
static int try_to_wake_up(task_t * p, unsigned int state, int sync)
{
	int cpu, this_cpu, success = 0;
	unsigned long flags;
	long old_state;
	runqueue_t *rq;
	int prio;
#ifdef CONFIG_SMP
	unsigned long load, this_load;
	struct sched_domain *sd;
	int new_cpu;
#endif

	rq = task_rq_lock(p, &flags);
	old_state = p->state;
	if (!(old_state & state))
		goto out;

	if (task_queued(p))
		goto out_running;

	cpu = task_cpu(p);
	this_cpu = smp_processor_id();

#ifdef CONFIG_SMP
	if (unlikely(task_running(rq, p)))
		goto out_activate;

	new_cpu = cpu;

	if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
		goto out_set_cpu;

	load = source_load(cpu);
	this_load = target_load(this_cpu);

	/*
	 * If sync wakeup then subtract the (maximum possible) effect of
	 * the currently running task from the load of the current CPU:
	 */
	if (sync)
		this_load -= SCHED_LOAD_SCALE;

	/* Don't pull the task off an idle CPU to a busy one */
	if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
		goto out_set_cpu;

	new_cpu = this_cpu; /* Wake to this CPU if we can */

	/*
	 * Scan domains for affine wakeup and passive balancing
	 * possibilities.
	 */
	for_each_domain(this_cpu, sd) {
		unsigned int imbalance;
		/*
		 * Start passive balancing when half the imbalance_pct
		 * limit is reached.
		 */
		imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;

		if ( ((sd->flags & SD_WAKE_AFFINE) &&
				!task_hot(p, rq->timestamp_last_tick, sd))
			|| ((sd->flags & SD_WAKE_BALANCE) &&
				imbalance*this_load <= 100*load) ) {
			/*
			 * Now sd has SD_WAKE_AFFINE and p is cache cold in sd
			 * or sd has SD_WAKE_BALANCE and there is an imbalance
			 */
			if (cpu_isset(cpu, sd->span))
				goto out_set_cpu;
		}
	}

	new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */
out_set_cpu:
	new_cpu = wake_idle(new_cpu, p);
	if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) {
		set_task_cpu(p, new_cpu);
		task_rq_unlock(rq, &flags);
		/* might preempt at this point */
		rq = task_rq_lock(p, &flags);
		old_state = p->state;
		if (!(old_state & state))
			goto out;
		if (task_queued(p))
			goto out_running;

		this_cpu = smp_processor_id();
		cpu = task_cpu(p);
	}

out_activate:
#endif /* CONFIG_SMP */
	if (old_state == TASK_UNINTERRUPTIBLE)
		rq->nr_uninterruptible--;

	/*
	 * This is the end of one scheduling cycle and the start
	 * of the next
	 */
	update_stats_for_cycle(p, rq);
	if (!rt_task(p) && (sched_mode != SCHED_MODE_STAIRCASE)) {
		recalc_throughput_bonus(p, rq->nr_running + 1);
		reassess_interactiveness(p);
	}
	/*
	 * Sync wakeups (i.e. those types of wakeups where the waker
	 * has indicated that it will leave the CPU in short order)
	 * don't trigger a preemption, if the woken up task will run on
	 * this cpu. (in this case the 'I will reschedule' promise of
	 * the waker guarantees that the freshly woken up task is going
	 * to be considered on this CPU.)
	 */
	prio = activate_task(p, rq, cpu == this_cpu);
	if (!sync || cpu != this_cpu) {
		if (preemption_warranted(prio, p, rq))
			resched_task(rq->curr);
	}
	success = 1;

out_running:
	p->state = TASK_RUNNING;
out:
	task_rq_unlock(rq, &flags);

	return success;
}

int fastcall wake_up_process(task_t * p)
{
	return try_to_wake_up(p, TASK_STOPPED |
		TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
}

EXPORT_SYMBOL(wake_up_process);

int fastcall wake_up_state(task_t *p, unsigned int state)
{
	return try_to_wake_up(p, state, 0);
}

/*
 * Initialize the scheduling statistics counters
 */
static inline void initialize_stats(task_t *p)
{
	p->avg_sleep_per_cycle = 0;
	p->avg_delay_per_cycle = 0;
	p->avg_delay_per_sub_cycle = 0;
	p->avg_cpu_per_cycle = 0;
	p->avg_cpu_per_sub_cycle = 0;
	p->total_sleep = 0;
	p->total_delay = 0;
	p->total_cpu = 0;
	p->cycle_count = 0;
	p->sched_timestamp = 0 /* set this to current time later */;
}

/*
 * Initialize the scheduling bonuses
 */
static inline void initialize_bonuses(task_t *p)
{
	p->interactive_bonus = (max_ia_bonus >= initial_ia_bonus) ?
		initial_ia_bonus : max_ia_bonus;
	p->throughput_bonus = 0;
	p->sub_cycle_count = 0;
}
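/*
 * Worked example: with the defaults (initial_ia_bonus == 1,
 * max_ia_bonus == 10) every new task starts with an interactive bonus
 * of 1 rather than 0, presumably so that it is not treated as an
 * outright CPU hog before any sleep/run statistics exist.
 */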
/*
 * Perform scheduler related setup for a newly forked process p.
 * p is forked by current.
 */
void fastcall sched_fork(task_t *p)
{
	/*
	 * We mark the process as running here, but have not actually
	 * inserted it onto the runqueue yet. This guarantees that
	 * nobody will actually run it, and a signal or other external
	 * event cannot wake it up and insert it on the runqueue either.
	 */
	p->state = TASK_RUNNING;
	INIT_LIST_HEAD(&p->run_list);
	spin_lock_init(&p->switch_lock);
#ifdef CONFIG_PREEMPT
	/*
	 * During context-switch we hold precisely one spinlock, which
	 * schedule_tail drops. (in the common case it's this_rq()->lock,
	 * but it also can be p->switch_lock.) So we compensate with a count
	 * of 1. Also, we want to start with kernel preemption disabled.
	 */
	p->thread_info->preempt_count = 1;
#endif
	/*
	 * Give the child a new timeslice
	 */
	if (sched_mode != SCHED_MODE_STAIRCASE)
		p->time_slice = task_timeslice(p);
	/*
	 * Initialize the scheduling statistics and bonus counters
	 */
	initialize_stats(p);
	initialize_bonuses(p);
}

/*
 * wake_up_forked_process - wake up a freshly forked process.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created process.
 */
void fastcall wake_up_forked_process(task_t * p)
{
	unsigned long flags;
	runqueue_t *rq = task_rq_lock(current, &flags);

	/*
	 * Forked process gets no burst to prevent fork bombs.
	 */
	p->burst = 0;
	BUG_ON(p->state != TASK_RUNNING);

	set_task_cpu(p, smp_processor_id());

	/*
	 * Scheduling statistics compilation starts now
	 */
	p->sched_timestamp = rq->timestamp_last_tick;

	/*
	 * Now that the idle task is back on the run queue we need extra care
	 * to make sure that its one and only fork() doesn't end up in the idle
	 * priority slot.  Just testing for empty run list is no longer adequate.
	 */
	if (unlikely(!task_queued(current) || RUNQUEUE_IDLE(rq)))
		__activate_task(p, rq, effective_prio(p));
	else {
		/*
		 * Put the child on the same list(s) as (but ahead of) the parent
		 */
		list_add_tail(&p->run_list, &current->run_list);
		rq->nr_running++;
	}
	current->flags |= PF_FORKED;
	task_rq_unlock(rq, &flags);
}

/**
 * (Optionally) log scheduler statistics at exit.
 */
static int log_at_exit = 0;
void fastcall sched_exit(task_t * p)
{
	struct task_sched_stats stats;

	if (!log_at_exit)
		return;

	get_task_sched_stats(p, &stats);
	printk("SCHED_EXIT[%d] (%s) %llu %llu %llu %llu %lu %lu %lu %lu\n",
		p->pid, p->comm,
		stats.total_sleep, stats.total_cpu, stats.total_delay,
		stats.cycle_count,
		p->nvcsw, p->nivcsw, p->cnvcsw, p->cnivcsw);
}

/**
 * finish_task_switch - clean up after a task-switch
 * @prev: the thread we just switched away from.
 *
 * We enter this with the runqueue still locked, and finish_arch_switch()
 * will unlock it along with doing any other architecture-specific cleanup
 * actions.
 *
 * Note that we may have delayed dropping an mm in context_switch(). If
 * so, we finish that here outside of the runqueue lock.  (Doing it
 * with the lock held can cause deadlocks; see schedule() for
 * details.)
 */
static void finish_task_switch(task_t *prev)
{
	runqueue_t *rq = this_rq();
	struct mm_struct *mm = rq->prev_mm;
	unsigned long prev_task_flags;

	rq->prev_mm = NULL;

	/*
	 * A task struct has one reference for the use as "current".
	 * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls
	 * schedule one last time. The schedule call will never return,
	 * and the scheduled task must drop that reference.
	 * The test for TASK_ZOMBIE must occur while the runqueue locks are
	 * still held, otherwise prev could be scheduled on another cpu, die
	 * there before we look at prev->state, and then the reference would
	 * be dropped twice.
	 * Manfred Spraul <manfred@colorfullife.com>
	 */
	prev_task_flags = prev->flags;
	finish_arch_switch(rq, prev);
	if (mm)
		mmdrop(mm);
	if (unlikely(prev_task_flags & PF_DEAD))
		put_task_struct(prev);
}
/**
 * schedule_tail - first thing a freshly forked thread must call.
 * @prev: the thread we just switched away from.
 */
asmlinkage void schedule_tail(task_t *prev)
{
	finish_task_switch(prev);

	if (current->set_child_tid)
		put_user(current->pid, current->set_child_tid);
}

/*
 * context_switch - switch to the new MM and the new
 * thread's register state.
 */
static inline
task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next)
{
	struct mm_struct *mm = next->mm;
	struct mm_struct *oldmm = prev->active_mm;

	if (unlikely(!mm)) {
		next->active_mm = oldmm;
		atomic_inc(&oldmm->mm_count);
		enter_lazy_tlb(oldmm, next);
	} else
		switch_mm(oldmm, mm, next);

	if (unlikely(!prev->mm)) {
		prev->active_mm = NULL;
		WARN_ON(rq->prev_mm);
		rq->prev_mm = oldmm;
	}

	/* Here we just switch the register state and the stack. */
	switch_to(prev, next, prev);

	return prev;
}

/*
 * nr_running, nr_uninterruptible and nr_context_switches:
 *
 * externally visible scheduler statistics: current number of runnable
 * threads, current number of uninterruptible-sleeping threads, total
 * number of context switches performed since bootup.
 */
unsigned long nr_running(void)
{
	unsigned long i, sum = 0;

	for_each_cpu(i)
		sum += cpu_rq(i)->nr_running;

	return sum;
}

unsigned long nr_uninterruptible(void)
{
	unsigned long i, sum = 0;

	for_each_online_cpu(i)
		sum += cpu_rq(i)->nr_uninterruptible;

	return sum;
}

unsigned long long nr_context_switches(void)
{
	unsigned long long i, sum = 0;

	for_each_online_cpu(i)
		sum += cpu_rq(i)->nr_switches;

	return sum;
}

unsigned long nr_iowait(void)
{
	unsigned long i, sum = 0;

	for_each_online_cpu(i)
		sum += atomic_read(&cpu_rq(i)->nr_iowait);

	return sum;
}

/*
 * double_rq_lock - safely lock two runqueues
 *
 * Note this does not disable interrupts like task_rq_lock,
 * you need to do so manually before calling.
 */
static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2)
{
	if (rq1 == rq2)
		spin_lock(&rq1->lock);
	else {
		if (rq1 < rq2) {
			spin_lock(&rq1->lock);
			spin_lock(&rq2->lock);
		} else {
			spin_lock(&rq2->lock);
			spin_lock(&rq1->lock);
		}
	}
}

/*
 * double_rq_unlock - safely unlock two runqueues
 *
 * Note this does not restore interrupts like task_rq_unlock,
 * you need to do so manually after calling.
 */
static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2)
{
	spin_unlock(&rq1->lock);
	if (rq1 != rq2)
		spin_unlock(&rq2->lock);
}

enum idle_type
{
	IDLE,
	NOT_IDLE,
	NEWLY_IDLE,
};

#ifdef CONFIG_SMP
/*
 * find_idlest_cpu - find the least busy runqueue.
 */
static int find_idlest_cpu(const struct task_struct *p, int this_cpu,
			   struct sched_domain *sd)
{
	unsigned long load, min_load, this_load;
	int i, min_cpu;
	cpumask_t mask;

	min_cpu = UINT_MAX;
	min_load = ULONG_MAX;

	cpus_and(mask, sd->span, cpu_online_map);
	cpus_and(mask, mask, p->cpus_allowed);

	for_each_cpu_mask(i, mask) {
		load = target_load(i);

		if (load < min_load) {
			min_cpu = i;
			min_load = load;

			/* break out early on an idle CPU: */
			if (!min_load)
				break;
		}
	}

	/* add +1 to account for the new task */
	this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;

	/*
	 * Would with the addition of the new task to the
	 * current CPU there be an imbalance between this
	 * CPU and the idlest CPU?
	 *
	 * Use half of the balancing threshold - new-context is
	 * a good opportunity to balance.
	 */
	if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100)
		return min_cpu;

	return this_cpu;
}
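/*
 * Worked example: with an imbalance_pct of 125 (a typical sched-domain
 * setting) the test above reads min_load * 112 < this_load * 100,
 * since 100 + (125 - 100) / 2 == 112; the task is placed on the idlest
 * CPU only when that CPU wins by a margin of roughly 12% after the new
 * task's own SCHED_LOAD_SCALE of load has been added locally.
 */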
/*
 * wake_up_forked_thread - wake up a freshly forked thread.
 *
 * This function will do some initial scheduler statistics housekeeping
 * that must be done for every newly created context, and it also does
 * runqueue balancing.
 */
void fastcall wake_up_forked_thread(task_t * p)
{
	unsigned long flags;
	int this_cpu = get_cpu(), cpu;
	struct sched_domain *tmp, *sd = NULL;
	runqueue_t *this_rq = cpu_rq(this_cpu), *rq;

	/*
	 * Find the largest domain that this CPU is part of that
	 * is willing to balance on clone:
	 */
	for_each_domain(this_cpu, tmp)
		if (tmp->flags & SD_BALANCE_CLONE)
			sd = tmp;
	if (sd)
		cpu = find_idlest_cpu(p, this_cpu, sd);
	else
		cpu = this_cpu;

	local_irq_save(flags);
lock_again:
	rq = cpu_rq(cpu);
	double_rq_lock(this_rq, rq);

	BUG_ON(p->state != TASK_RUNNING);

	/*
	 * We did find_idlest_cpu() unlocked, so in theory
	 * the mask could have changed - just dont migrate
	 * in this case:
	 */
	if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) {
		cpu = this_cpu;
		double_rq_unlock(this_rq, rq);
		goto lock_again;
	}

	set_task_cpu(p, cpu);

	/*
	 * Scheduling statistics compilation starts now
	 */
	p->sched_timestamp = rq->timestamp_last_tick;

	if (cpu == this_cpu) {
		/*
		 * Now that the idle task is back on the run queue we need
		 * extra care to make sure that its one and only fork() doesn't
		 * end up in the idle priority slot.  Just testing for empty
		 * run list is no longer adequate.
		 */
		if (unlikely(!task_queued(current) || RUNQUEUE_IDLE(rq)))
			__activate_task(p, rq, effective_prio(p));
		else {
			list_add_tail(&p->run_list, &current->run_list);
			rq->nr_running++;
		}
	} else {
		int prio = effective_prio(p);
		/* Not the local CPU - must adjust timestamp */
		p->timestamp = (p->timestamp - this_rq->timestamp_last_tick)
			+ rq->timestamp_last_tick;
		__activate_task(p, rq, prio);
		if (preemption_warranted(prio, p, rq))
			resched_task(rq->curr);
	}

	double_rq_unlock(this_rq, rq);
	local_irq_restore(flags);
	put_cpu();
}

/*
 * If dest_cpu is allowed for this process, migrate the task to it.
 * This is accomplished by forcing the cpu_allowed mask to only
 * allow dest_cpu, which will force the cpu onto dest_cpu.  Then
 * the cpu_allowed mask is restored.
 */
static void sched_migrate_task(task_t *p, int dest_cpu)
{
	migration_req_t req;
	runqueue_t *rq;
	unsigned long flags;

	rq = task_rq_lock(p, &flags);
	if (!cpu_isset(dest_cpu, p->cpus_allowed)
	    || unlikely(cpu_is_offline(dest_cpu)))
		goto out;

	/* force the process onto the specified CPU */
	if (migrate_task(p, dest_cpu, &req)) {
		/* Need to wait for migration thread (might exit: take ref). */
		struct task_struct *mt = rq->migration_thread;
		get_task_struct(mt);
		task_rq_unlock(rq, &flags);
		wake_up_process(mt);
		put_task_struct(mt);
		wait_for_completion(&req.done);
		return;
	}
out:
	task_rq_unlock(rq, &flags);
}

/*
 * sched_balance_exec(): find the highest-level, exec-balance-capable
 * domain and try to migrate the task to the least loaded CPU.
 *
 * execve() is a valuable balancing opportunity, because at this point
 * the task has the smallest effective memory and cache footprint.
 */
void sched_balance_exec(void)
{
	struct sched_domain *tmp, *sd = NULL;
	int new_cpu, this_cpu = get_cpu();

	/* Prefer the current CPU if there's only this task running */
	if (this_rq()->nr_running <= 1)
		goto out;

	for_each_domain(this_cpu, tmp)
		if (tmp->flags & SD_BALANCE_EXEC)
			sd = tmp;

	if (sd) {
		new_cpu = find_idlest_cpu(current, this_cpu, sd);
		if (new_cpu != this_cpu) {
			put_cpu();
			sched_migrate_task(current, new_cpu);
			return;
		}
	}
out:
	put_cpu();
}

/*
 * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
 */
static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest)
{
	if (unlikely(!spin_trylock(&busiest->lock))) {
		if (busiest < this_rq) {
			spin_unlock(&this_rq->lock);
			spin_lock(&busiest->lock);
			spin_lock(&this_rq->lock);
		} else
			spin_lock(&busiest->lock);
	}
}

/*
 * pull_task - move a task from a remote runqueue to the local runqueue.
 * Both runqueues must be locked.
 */
static inline
void pull_task(runqueue_t *src_rq, task_t *p,
	       runqueue_t *this_rq, int this_cpu, int prio)
{
	unsigned long long delta;

	dequeue_task(p);
	src_rq->nr_running--;
	delta = (src_rq->timestamp_last_tick - p->sched_timestamp);
	p->avg_delay_per_cycle += delta;
	p->avg_delay_per_sub_cycle += delta;
	p->total_delay += delta;
	set_task_cpu(p, this_cpu);
	this_rq->nr_running++;
	p->sched_timestamp = this_rq->timestamp_last_tick;
	enqueue_task(p, this_rq, prio);
	p->timestamp = (p->timestamp - src_rq->timestamp_last_tick)
		+ this_rq->timestamp_last_tick;
	/*
	 * Note that idle threads have a prio of IDLE_PRIO, for this test
	 * to be always true for them.
	 */
	if (preemption_warranted(prio, p, this_rq))
		resched_task(this_rq->curr);
}

/*
 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
 */
static inline
int can_migrate_task(const task_t *p, runqueue_t *rq, int this_cpu,
		     struct sched_domain *sd, enum idle_type idle)
{
	/*
	 * We do not migrate tasks that are:
	 * 1) running (obviously), or
	 * 2) cannot be migrated to this CPU due to cpus_allowed, or
	 * 3) are cache-hot on their current CPU.
	 */
	if (task_running(rq, p))
		return 0;
	if (!cpu_isset(this_cpu, p->cpus_allowed))
		return 0;

	/* Aggressive migration if we've failed balancing */
	if (idle == NEWLY_IDLE ||
	    sd->nr_balance_failed < sd->cache_nice_tries) {
		if (task_hot(p, rq->timestamp_last_tick, sd))
			return 0;
	}

	return 1;
}

/*
 * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq,
 * as part of a balancing operation within "domain". Returns the number of
 * tasks moved.
 *
 * Called with both runqueues locked.
 */
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
		      unsigned long max_nr_move, struct sched_domain *sd,
		      enum idle_type idle)
{
	struct list_head *head, *curr;
	int idx, pulled = 0;
	task_t *tmp;

	if (max_nr_move <= 0 || busiest->nr_running <= 1)
		goto out;

	/* Start searching at priority 0: */
	idx = 0;
skip_bitmap:
	if (!idx)
		idx = sched_find_first_bit(busiest->bitmap);
	else
		idx = find_next_bit(busiest->bitmap, IDLE_PRIO, idx);
	if (idx >= IDLE_PRIO)
		goto out;

	head = &busiest->queues[idx].queue;
	curr = head->prev;
skip_queue:
	tmp = list_entry(curr, task_t, run_list);

	curr = curr->prev;

	if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
		if (curr != head)
			goto skip_queue;
		idx++;
		goto skip_bitmap;
	}
	pull_task(busiest, tmp, this_rq, this_cpu, idx);
	pulled++;

	/* We only want to steal up to the prescribed number of tasks. */
	if (pulled < max_nr_move) {
		if (curr != head)
			goto skip_queue;
		idx++;
		goto skip_bitmap;
	}
out:
	return pulled;
}

/*
 * find_busiest_group finds and returns the busiest CPU group within the
 * domain. It calculates and returns the number of tasks which should be
 * moved to restore balance via the imbalance parameter.
 */
static struct sched_group *
find_busiest_group(struct sched_domain *sd, int this_cpu,
		   unsigned long *imbalance, enum idle_type idle)
{
	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
	unsigned long max_load, avg_load, total_load, this_load, total_pwr;

	max_load = this_load = total_load = total_pwr = 0;

	do {
		cpumask_t tmp;
		unsigned long load;
		int local_group;
		int i, nr_cpus = 0;

		local_group = cpu_isset(this_cpu, group->cpumask);

		/* Tally up the load of all CPUs in the group */
		avg_load = 0;
		cpus_and(tmp, group->cpumask, cpu_online_map);
		if (unlikely(cpus_empty(tmp)))
			goto nextgroup;

		for_each_cpu_mask(i, tmp) {
			/* Bias balancing toward cpus of our domain */
			if (local_group)
				load = target_load(i);
			else
				load = source_load(i);

			nr_cpus++;
			avg_load += load;
		}

		if (!nr_cpus)
			goto nextgroup;

		total_load += avg_load;
		total_pwr += group->cpu_power;

		/* Adjust by relative CPU power of the group */
		avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power;

		if (local_group) {
			this_load = avg_load;
			this = group;
			goto nextgroup;
		} else if (avg_load > max_load) {
			max_load = avg_load;
			busiest = group;
		}
nextgroup:
		group = group->next;
	} while (group != sd->groups);

	if (!busiest || this_load >= max_load)
		goto out_balanced;

	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;

	if (this_load >= avg_load ||
	    100*max_load <= sd->imbalance_pct*this_load)
		goto out_balanced;

	/*
	 * We're trying to get all the cpus to the average_load, so we don't
	 * want to push ourselves above the average load, nor do we wish to
	 * reduce the max loaded cpu below the average load, as either of these
	 * actions would just result in more rebalancing later, and ping-pong
	 * tasks around. Thus we look for the minimum possible imbalance.
	 * Negative imbalances (*we* are more loaded than anyone else) will
	 * be counted as no imbalance for these purposes -- we can't fix that
	 * by pulling tasks to us. Be careful of negative numbers as they'll
	 * appear as very large values with unsigned longs.
	 */
	*imbalance = min(max_load - avg_load, avg_load - this_load);

	/* How much load to actually move to equalise the imbalance */
	*imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power))
		/ SCHED_LOAD_SCALE;

	if (*imbalance < SCHED_LOAD_SCALE - 1) {
		unsigned long pwr_now = 0, pwr_move = 0;
		unsigned long tmp;

		if (max_load - this_load >= SCHED_LOAD_SCALE*2) {
			*imbalance = 1;
			return busiest;
		}

		/*
		 * OK, we don't have enough imbalance to justify moving tasks,
		 * however we may be able to increase total CPU power used by
		 * moving them.
		 */

		pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load);
		pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load);
		pwr_now /= SCHED_LOAD_SCALE;

		/* Amount of load we'd subtract */
		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power;
		if (max_load > tmp)
			pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE,
							max_load - tmp);

		/* Amount of load we'd add */
		tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power;
		if (max_load < tmp)
			tmp = max_load;
		pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp);
		pwr_move /= SCHED_LOAD_SCALE;

		/* Move if we gain another 8th of a CPU worth of throughput */
		if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8)
			goto out_balanced;

		*imbalance = 1;
		return busiest;
	}

	/* Get rid of the scaling factor, rounding down as we divide */
	*imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE;

	return busiest;

out_balanced:
	if (busiest && (idle == NEWLY_IDLE ||
			(idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) {
		*imbalance = 1;
		return busiest;
	}

	*imbalance = 0;
	return NULL;
}
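/*
 * Worked example, assuming SCHED_LOAD_SCALE == 128 and groups of equal
 * cpu_power: with this_load == 128 (one task), max_load == 384 (three
 * tasks) and avg_load == 256, the raw imbalance is
 * min(384 - 256, 256 - 128) == 128, which survives the small-imbalance
 * checks and scales back down to exactly one task to move, leaving
 * both queues at the average.
 */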
/*
 * find_busiest_queue - find the busiest runqueue among the cpus in group.
 */
static runqueue_t *find_busiest_queue(const struct sched_group *group)
{
	cpumask_t tmp;
	unsigned long load, max_load = 0;
	runqueue_t *busiest = NULL;
	int i;

	cpus_and(tmp, group->cpumask, cpu_online_map);
	for_each_cpu_mask(i, tmp) {
		load = source_load(i);

		if (load > max_load) {
			max_load = load;
			busiest = cpu_rq(i);
		}
	}

	return busiest;
}

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * Called with this_rq unlocked.
 */
static int load_balance(int this_cpu, runqueue_t *this_rq,
			struct sched_domain *sd, enum idle_type idle)
{
	struct sched_group *group;
	runqueue_t *busiest;
	unsigned long imbalance;
	int nr_moved;

	spin_lock(&this_rq->lock);

	group = find_busiest_group(sd, this_cpu, &imbalance, idle);
	if (!group)
		goto out_balanced;

	busiest = find_busiest_queue(group);
	if (!busiest)
		goto out_balanced;
	/*
	 * This should be "impossible", but since load
	 * balancing is inherently racy and statistical,
	 * it could happen in theory.
	 */
	if (unlikely(busiest == this_rq)) {
		WARN_ON(1);
		goto out_balanced;
	}

	nr_moved = 0;
	if (busiest->nr_running > 1) {
		/*
		 * Attempt to move tasks. If find_busiest_group has found
		 * an imbalance but busiest->nr_running <= 1, the group is
		 * still unbalanced. nr_moved simply stays zero, so it is
		 * correctly treated as an imbalance.
		 */
		double_lock_balance(this_rq, busiest);
		nr_moved = move_tasks(this_rq, this_cpu, busiest,
				      imbalance, sd, idle);
		spin_unlock(&busiest->lock);
	}
	spin_unlock(&this_rq->lock);

	if (!nr_moved) {
		sd->nr_balance_failed++;

		if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
			int wake = 0;

			spin_lock(&busiest->lock);
			if (!busiest->active_balance) {
				busiest->active_balance = 1;
				busiest->push_cpu = this_cpu;
				wake = 1;
			}
			spin_unlock(&busiest->lock);
			if (wake)
				wake_up_process(busiest->migration_thread);

			/*
			 * We've kicked active balancing, reset the failure
			 * counter.
			 */
			sd->nr_balance_failed = sd->cache_nice_tries;
		}
	} else
		sd->nr_balance_failed = 0;

	/* We were unbalanced, so reset the balancing interval */
	sd->balance_interval = sd->min_interval;

	return nr_moved;

out_balanced:
	spin_unlock(&this_rq->lock);

	/* tune up the balancing interval */
	if (sd->balance_interval < sd->max_interval)
		sd->balance_interval *= 2;

	return 0;
}

/*
 * Check this_cpu to ensure it is balanced within domain. Attempt to move
 * tasks if there is an imbalance.
 *
 * Called from schedule when this_rq is about to become idle (NEWLY_IDLE).
 * this_rq is locked.
 */
static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
				struct sched_domain *sd)
{
	struct sched_group *group;
	runqueue_t *busiest = NULL;
	unsigned long imbalance;
	int nr_moved = 0;

	group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
	if (!group)
		goto out;

	busiest = find_busiest_queue(group);
	if (!busiest || busiest == this_rq)
		goto out;

	/* Attempt to move tasks */
	double_lock_balance(this_rq, busiest);

	nr_moved = move_tasks(this_rq, this_cpu, busiest,
			      imbalance, sd, NEWLY_IDLE);

	spin_unlock(&busiest->lock);

out:
	return nr_moved;
}

/*
 * idle_balance is called by schedule() if this_cpu is about to become
 * idle. Attempts to pull tasks from other CPUs.
 */
static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
{
	struct sched_domain *sd;

	for_each_domain(this_cpu, sd) {
		if (sd->flags & SD_BALANCE_NEWIDLE) {
			if (load_balance_newidle(this_cpu, this_rq, sd)) {
				/* We've pulled tasks over so stop searching */
				break;
			}
		}
	}
}

/*
 * active_load_balance is run by migration threads. It pushes a running
 * task off the cpu. It can be required to correctly have at least 1 task
 * running on each physical CPU where possible, and not have a physical /
 * logical imbalance.
 *
 * Called with busiest locked.
 */
static void active_load_balance(runqueue_t *busiest, int busiest_cpu)
{
	struct sched_domain *sd;
	struct sched_group *group, *busy_group;
	int i;

	if (busiest->nr_running <= 1)
		return;

	for_each_domain(busiest_cpu, sd)
		if (cpu_isset(busiest->push_cpu, sd->span))
			break;
	if (!sd) {
		WARN_ON(1);
		return;
	}

	group = sd->groups;
	while (!cpu_isset(busiest_cpu, group->cpumask))
		group = group->next;
	busy_group = group;

	group = sd->groups;
	do {
		cpumask_t tmp;
		runqueue_t *rq;
		int push_cpu = 0;

		if (group == busy_group)
			goto next_group;

		cpus_and(tmp, group->cpumask, cpu_online_map);
		if (!cpus_weight(tmp))
			goto next_group;

		for_each_cpu_mask(i, tmp) {
			if (!idle_cpu(i))
				goto next_group;
			push_cpu = i;
		}

		rq = cpu_rq(push_cpu);

		/*
		 * This condition is "impossible", but since load
		 * balancing is inherently a bit racy and statistical,
		 * it can trigger.. Reported by Bjorn Helgaas on a
		 * 128-cpu setup.
		 */
		if (unlikely(busiest == rq))
			goto next_group;
		double_lock_balance(busiest, rq);
		move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
		spin_unlock(&rq->lock);
next_group:
		group = group->next;
	} while (group != sd->groups);
}
Reported by Bjorn Helgaas on a
> * 128-cpu setup.
> */
> if (unlikely(busiest == rq))
> goto next_group;
> double_lock_balance(busiest, rq);
> move_tasks(rq, push_cpu, busiest, 1, sd, IDLE);
> spin_unlock(&rq->lock);
>next_group:
> group = group->next;
> } while (group != sd->groups);
>}
>
>/*
> * rebalance_tick will get called every timer tick, on every CPU,
> * unless the current task is SCHED_FIFO.
> *
> * It checks each scheduling domain to see if it is due to be balanced,
> * and initiates a balancing operation if so.
> *
> * Balancing parameters are set up in arch_init_sched_domains.
> */
>
>/* Don't have all balancing operations going off at once */
>#define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS)
>
>static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
> enum idle_type idle)
>{
> unsigned long old_load, this_load;
> unsigned long j = jiffies + CPU_OFFSET(this_cpu);
> struct sched_domain *sd;
>
> /* Update our load */
> old_load = this_rq->cpu_load;
> this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
> /*
> * Round up the averaging division if load is increasing. This
> * prevents us from getting stuck on 9 if the load is 10, for
> * example.
> */
> if (this_load > old_load)
> old_load++;
> this_rq->cpu_load = (old_load + this_load) / 2;
>
> for_each_domain(this_cpu, sd) {
> unsigned long interval = sd->balance_interval;
>
> if (idle != IDLE)
> interval *= sd->busy_factor;
>
> /* scale ms to jiffies */
> interval = msecs_to_jiffies(interval);
> if (unlikely(!interval))
> interval = 1;
>
> if (j - sd->last_balance >= interval) {
> if (load_balance(this_cpu, this_rq, sd, idle)) {
> /* We've pulled tasks over so no longer idle */
> idle = NOT_IDLE;
> }
> sd->last_balance += interval;
> }
> }
>}
>
>static inline int needs_idle_balance(const runqueue_t *rq)
>{
> return rq->nr_running == 0;
>}
>#else
>/*
> * on UP we do not need to balance between CPUs:
> */
>static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle)
>{
>}
>static inline void idle_balance(int cpu, runqueue_t *rq)
>{
>}
>static inline int needs_idle_balance(const runqueue_t *rq)
>{
> return 0;
>}
>#endif
>
>static inline int wake_priority_sleeper(runqueue_t *rq)
>{
>#ifdef CONFIG_SCHED_SMT
> /*
> * If an SMT sibling task has been put to sleep for priority
> * reasons reschedule the idle task to see if it can now run.
> */
> if (rq->nr_running) {
> resched_task(rq->idle);
> return 1;
> }
>#endif
> return 0;
>}
>
>/*
> * Are promotions due?
> */
>static inline int promotions_due(const runqueue_t *rq)
>{
> return time_after_eq(jiffies, rq->next_prom_due);
>}
>
>/*
> * Assume runqueue lock is NOT already held.
> * This is not executed when the current task is SCHED_FIFO.
> */
>static void do_promotions(runqueue_t *rq)
>{
> int idx = MAX_RT_PRIO;
>
> spin_lock(&rq->lock);
> for (;;) {
> int new_prio;
> idx = find_next_bit(rq->bitmap, IDLE_PRIO, idx + 1);
> if (idx > (IDLE_PRIO - 1))
> break;
>
> new_prio = idx - 1;
> __list_splice(&rq->queues[idx].queue, rq->queues[new_prio].queue.prev);
> INIT_LIST_HEAD(&rq->queues[idx].queue);
> __clear_bit(idx, rq->bitmap);
> __set_bit(new_prio, rq->bitmap);
> /*
> * If promotion occurs from the slot
> * associated with rq->current_prio_slot then the
> * current task will be one of those promoted,
> * so we should update rq->current_prio_slot.
> * This will only be true for at most one slot.
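> * (Example: if the running task sits in the slot for priority 30
> * and bit 30 is promoted, its whole queue is spliced onto the
> * priority 29 queue and rq->current_prio_slot is moved with it.)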
> */
> if (unlikely(idx == rq->current_prio_slot->prio))
> rq->current_prio_slot = rq->queues + new_prio;
> }
> rq->next_prom_due = (jiffies + get_prom_interval(rq));
> spin_unlock(&rq->lock);
>}
>
>DEFINE_PER_CPU(struct kernel_stat, kstat);
>
>EXPORT_PER_CPU_SYMBOL(kstat);
>
>/*
> * This function gets called by the timer code, with HZ frequency.
> * We call it with interrupts disabled.
> *
> * It also gets called by the fork code, when changing the parent's
> * timeslices.
> */
>void scheduler_tick(int user_ticks, int sys_ticks)
>{
> int cpu = smp_processor_id();
> struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat;
> runqueue_t *rq = this_rq();
> task_t *p = current;
>
> rq->timestamp_last_tick = sched_clock();
>
> if (rcu_pending(cpu))
> rcu_check_callbacks(cpu, user_ticks);
>
> /* note: this timer irq context must be accounted for as well */
> if (hardirq_count() - HARDIRQ_OFFSET) {
> cpustat->irq += sys_ticks;
> sys_ticks = 0;
> } else if (softirq_count()) {
> cpustat->softirq += sys_ticks;
> sys_ticks = 0;
> }
>
> if (p == rq->idle) {
> if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED) {
> spin_lock(&rq->lock);
> if (!--rq->eb_ticks_to_decay)
> decay_eb_yardstick(rq);
> spin_unlock(&rq->lock);
> }
> if (atomic_read(&rq->nr_iowait) > 0)
> cpustat->iowait += sys_ticks;
> else
> cpustat->idle += sys_ticks;
> if (wake_priority_sleeper(rq))
> goto out;
> rebalance_tick(cpu, rq, IDLE);
> return;
> }
> if (TASK_NICE(p) > 0)
> cpustat->nice += user_ticks;
> else
> cpustat->user += user_ticks;
> cpustat->system += sys_ticks;
>
> /*
> * SCHED_FIFO tasks never run out of timeslice, and should not be
> * burdened with the overhead of promotion or a tick rebalance.
> */
> if (unlikely(p->policy == SCHED_FIFO))
> return;
> spin_lock(&rq->lock);
> rq->cache_ticks++;
> if (sched_mode == SCHED_MODE_STAIRCASE)
> goto sched_staircase;
> if ((sched_mode == SCHED_MODE_ENTITLEMENT_BASED) && (!--rq->eb_ticks_to_decay))
> decay_eb_yardstick(rq);
> /*
> * The task was running during this tick - update the
> * time slice counter. Note: we do not update a thread's
> * priority until it either goes to sleep or uses up its
> * timeslice.
> */
> if (unlikely(p->policy == SCHED_RR)) {
> /*
> * RR tasks need a special form of timeslice management.
> */
> if (!--p->time_slice) {
> p->time_slice = task_timeslice(p);
> set_tsk_need_resched(p);
>
> /* put it at the end of the queue with a minimum of fuss
> */
> list_del_init(&p->run_list);
> list_add_tail(&p->run_list, &rq->current_prio_slot->queue);
> }
> goto out_unlock;
> }
> if (!--p->time_slice) {
> unsigned long long delta;
>
> dequeue_task(p);
> set_tsk_need_resched(p);
> p->time_slice = task_timeslice(p);
> apply_sched_avg_decay(&p->avg_delay_per_sub_cycle);
> apply_sched_avg_decay(&p->avg_cpu_per_sub_cycle);
> delta = (rq->timestamp_last_tick - p->sched_timestamp);
> p->sub_cycle_count++;
> p->avg_cpu_per_cycle += delta;
> p->avg_cpu_per_sub_cycle += delta;
> p->total_cpu += delta;
> p->sched_timestamp = rq->timestamp_last_tick;
> calculate_rates(p);
> recalc_throughput_bonus(p, rq->nr_running);
> reassess_cpu_boundness(p);
> /*
> * Arguably the interactive bonus should be updated here
> * as well. But it depends on whether we wish to encourage
> * interactive tasks to maintain a high bonus or CPU bound
> * tasks to lose some of their bonus?
> */
> if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED)
> calculate_eb_priority(p, rq);
> rq->current_prio_slot = rq->queues + effective_prio(p);
> enqueue_task(p, rq, rq->current_prio_slot->prio);
> goto out_unlock;
> }
> if (task_should_be_yardstick(p, rq))
> set_eb_yardstick(rq, p);
> goto check_preempt;
>sched_staircase:
> if (unlikely(p->policy == SCHED_RR)) {
> /*
> * RR tasks need a special form of timeslice management.
> */
> if (!--p->time_slice) {
> p->time_slice = RR_INTERVAL();
> set_tsk_need_resched(p);
>
> /* put it at the end of the queue with a minimum of fuss
> */
> list_del_init(&p->run_list);
> list_add_tail(&p->run_list, &rq->current_prio_slot->queue);
> }
> goto out_unlock;
> }
> /*
> * Tasks lose burst each time they use up a full slice().
> */
> if (!--p->slice) {
> set_tsk_need_resched(p);
> dequeue_task(p);
> dec_burst(p);
> p->slice = slice(p);
> rq->current_prio_slot = rq->queues + effective_prio(p);
> p->time_slice = RR_INTERVAL();
> enqueue_task(p, rq, rq->current_prio_slot->prio);
> goto out_unlock;
> }
> /*
> * Tasks that run out of time_slice but still have slice left get
> * requeued with a lower priority && RR_INTERVAL time_slice.
> */
> if (!--p->time_slice) {
> dequeue_task(p);
> set_tsk_need_resched(p);
> p->time_slice = RR_INTERVAL();
> rq->current_prio_slot = rq->queues + effective_prio(p);
> enqueue_task(p, rq, rq->current_prio_slot->prio);
> goto out_unlock;
> }
>check_preempt:
> if (rq->preempted && rq->cache_ticks >= cache_decay_ticks)
> set_tsk_need_resched(p);
>out_unlock:
> spin_unlock(&rq->lock);
>out:
> rebalance_tick(cpu, rq, NOT_IDLE);
> if (unlikely(promotions_due(rq))) {
> /*
> * If there are fewer than 2 SCHED_OTHER tasks, defer the
> * next promotion.
> */
> if ((rt_task(p) ? rq->nr_running - 1 : rq->nr_running) < 2)
> rq->next_prom_due = (jiffies + get_prom_interval(rq));
> else
> do_promotions(rq);
> }
>}
>
>#ifdef CONFIG_SCHED_SMT
>static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq)
>{
> int i;
> struct sched_domain *sd = rq->sd;
> cpumask_t sibling_map;
>
> if (!(sd->flags & SD_SHARE_CPUPOWER))
> return;
>
> cpus_and(sibling_map, sd->span, cpu_online_map);
> for_each_cpu_mask(i, sibling_map) {
> runqueue_t *smt_rq;
>
> if (i == cpu)
> continue;
>
> smt_rq = cpu_rq(i);
>
> /*
> * If an SMT sibling task is sleeping due to priority
> * reasons wake it up now.
> */
> if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running)
> resched_task(smt_rq->idle);
> }
>}
>
>static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p)
>{
> struct sched_domain *sd = rq->sd;
> cpumask_t sibling_map;
> int ret = 0, i;
>
> if (!(sd->flags & SD_SHARE_CPUPOWER))
> return 0;
>
> cpus_and(sibling_map, sd->span, cpu_online_map);
> for_each_cpu_mask(i, sibling_map) {
> runqueue_t *smt_rq;
> task_t *smt_curr;
>
> if (i == cpu)
> continue;
>
> smt_rq = cpu_rq(i);
> smt_curr = smt_rq->curr;
>
> /*
> * If a user task with lower static priority than the
> * running task on the SMT sibling is trying to schedule,
> * delay it till there is proportionately less timeslice
> * left of the sibling task to prevent a lower priority
> * task from using an unfair proportion of the
> * physical cpu's resources. -ck
> */
> if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) >
> task_timeslice(p) || rt_task(smt_curr)) &&
> p->mm && smt_curr->mm && !rt_task(p))
> ret = 1;
>
> /*
> * Reschedule a lower priority task on the SMT sibling,
> * or wake it up if it has been put to sleep for priority
> * reasons.
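> * (Worked example, assuming the common SMT default of
> * per_cpu_gain == 15: the sibling is rescheduled when
> * p->time_slice * 85 / 100 still exceeds task_timeslice(smt_curr),
> * i.e. p has proportionately more of its slice left than the
> * sibling's full quota.)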
> */ > if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > > task_timeslice(smt_curr) || rt_task(p)) && > smt_curr->mm && p->mm && !rt_task(smt_curr)) || > (smt_curr == smt_rq->idle && smt_rq->nr_running)) > resched_task(smt_curr); > } > return ret; >} > >static inline int dependent_idle(const runqueue_t *rq, const task_t *p) >{ > return p == rq->idle; >} >#else >static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) >{ >} > >static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) >{ > return 0; >} > >static inline int dependent_idle(const runqueue_t *rq, const task_t *p) >{ > return 0; >} >#endif > >/* > * schedule() is the main scheduler function. > */ >asmlinkage void __sched schedule(void) >{ > long *switch_count; > task_t *prev, *next; > runqueue_t *rq; > int cpu; > unsigned long long delta; > > /* > * Test if we are atomic. Since do_exit() needs to call into > * schedule() atomically, we ignore that path for now. > * Otherwise, whine if we are scheduling when we should not be. > */ > if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { > if (unlikely(in_atomic())) { > printk(KERN_ERR "bad: scheduling while atomic!\n"); > dump_stack(); > } > } > >need_resched: > preempt_disable(); > prev = current; > rq = this_rq(); > > release_kernel_lock(prev); > > spin_lock_irq(&rq->lock); > > rq->timestamp_last_tick = sched_clock(); > prev->runtime = rq->timestamp_last_tick - prev->timestamp; > > /* > * if entering off of a kernel preemption go straight > * to picking the next task. > */ > switch_count = &prev->nivcsw; > if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { > switch_count = &prev->nvcsw; > if (unlikely((prev->state & TASK_INTERRUPTIBLE) && > unlikely(signal_pending(prev)))) > prev->state = TASK_RUNNING; > else > deactivate_task(prev, rq); > } > > cpu = smp_processor_id(); > if (unlikely(needs_idle_balance(rq))) > idle_balance(cpu, rq); > > rq->current_prio_slot = rq->queues + sched_find_first_bit(rq->bitmap); > next = list_entry(rq->current_prio_slot->queue.next, task_t, run_list); > if (dependent_idle(rq, next)) { > wake_sleeping_dependent(cpu, rq); > goto switch_tasks; > } > > if (dependent_sleeper(cpu, rq, next)) { > rq->current_prio_slot = rq->queues + IDLE_PRIO; > next = rq->idle; > } >switch_tasks: > prefetch(next); > clear_tsk_need_resched(prev); > RCU_qsctr(task_cpu(prev))++; > > /* > * Update estimate of average CPU time used per cycle > */ > delta = (rq->timestamp_last_tick - prev->sched_timestamp); > prev->avg_cpu_per_cycle += delta; > prev->avg_cpu_per_sub_cycle += delta; > prev->total_cpu += delta; > prev->timestamp = prev->sched_timestamp = rq->timestamp_last_tick; > if (unlikely(next->flags & PF_YIELDED)) { > next->flags &= ~PF_YIELDED; > dequeue_task(next); > rq->current_prio_slot = rq->queues + effective_prio(next); > enqueue_task_head(next, rq, rq->current_prio_slot->prio); > } > > if (likely(prev != next)) { > rq->preempted = 0; > rq->cache_ticks = 0; > /* > * Update estimate of average delay on run queue per cycle > */ > delta = (rq->timestamp_last_tick - next->sched_timestamp); > next->avg_delay_per_cycle += delta; > next->avg_delay_per_sub_cycle += delta; > next->total_delay += delta; > next->timestamp = next->sched_timestamp = rq->timestamp_last_tick; > rq->total_delay += delta; > rq->nr_switches++; > rq->curr = next; > ++*switch_count; > > prepare_arch_switch(rq, next); > prev = context_switch(rq, prev, next); > barrier(); > > finish_task_switch(prev); > } else > spin_unlock_irq(&rq->lock); > > 
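> /*
> * Editorial note: a reschedule request may have been raised while we
> * were switching; TIF_NEED_RESCHED is re-tested below after preemption
> * is re-enabled, and we loop back rather than return with one pending.
> */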
reacquire_kernel_lock(current);
> preempt_enable_no_resched();
> if (test_thread_flag(TIF_NEED_RESCHED))
> goto need_resched;
>}
>
>EXPORT_SYMBOL(schedule);
>
>#ifdef CONFIG_PREEMPT
>/*
> * this is the entry point to schedule() from in-kernel preemption
> * off of preempt_enable. Kernel preemptions off return from interrupt
> * occur there and call schedule directly.
> */
>asmlinkage void __sched preempt_schedule(void)
>{
> struct thread_info *ti = current_thread_info();
>
> /*
> * If there is a non-zero preempt_count or interrupts are disabled,
> * we do not want to preempt the current task. Just return..
> */
> if (unlikely(ti->preempt_count || irqs_disabled()))
> return;
>
>need_resched:
> ti->preempt_count = PREEMPT_ACTIVE;
> schedule();
> ti->preempt_count = 0;
>
> /* we could miss a preemption opportunity between schedule and now */
> barrier();
> if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
> goto need_resched;
>}
>
>EXPORT_SYMBOL(preempt_schedule);
>#endif /* CONFIG_PREEMPT */
>
>int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
>{
> task_t *p = curr->task;
> return try_to_wake_up(p, mode, sync);
>}
>
>EXPORT_SYMBOL(default_wake_function);
>
>/*
> * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just
> * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve
> * number) then we wake all the non-exclusive tasks and one exclusive task.
> *
> * There are circumstances in which we can try to wake a task which has already
> * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
> * zero in this (rare) case, and we handle it by continuing to scan the queue.
> */
>static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
> int nr_exclusive, int sync, void *key)
>{
> struct list_head *tmp, *next;
>
> list_for_each_safe(tmp, next, &q->task_list) {
> wait_queue_t *curr;
> unsigned flags;
> curr = list_entry(tmp, wait_queue_t, task_list);
> flags = curr->flags;
> if (curr->func(curr, mode, sync, key) &&
> (flags & WQ_FLAG_EXCLUSIVE) &&
> !--nr_exclusive)
> break;
> }
>}
>
>/**
> * __wake_up - wake up threads blocked on a waitqueue.
> * @q: the waitqueue
> * @mode: which threads
> * @nr_exclusive: how many wake-one or wake-many threads to wake up
> */
>void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
> int nr_exclusive, void *key)
>{
> unsigned long flags;
>
> spin_lock_irqsave(&q->lock, flags);
> __wake_up_common(q, mode, nr_exclusive, 0, key);
> spin_unlock_irqrestore(&q->lock, flags);
>}
>
>EXPORT_SYMBOL(__wake_up);
>
>/*
> * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
> */
>void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
>{
> __wake_up_common(q, mode, 1, 0, NULL);
>}
>
>/**
> * __wake_up_sync - wake up threads blocked on a waitqueue.
> * @q: the waitqueue
> * @mode: which threads
> * @nr_exclusive: how many wake-one or wake-many threads to wake up
> *
> * The sync wakeup differs in that the waker knows that it will schedule
> * away soon, so while the target thread will be woken up, it will not
> * be migrated to another CPU - ie. the two threads are 'synchronized'
> * with each other. This can prevent needless bouncing between CPUs.
> *
> * On UP it can prevent extra preemption.
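> * (Usage sketch: a producer that is itself about to block can call
> * __wake_up_sync(&q, TASK_INTERRUPTIBLE, 1) so that the single
> * exclusive waiter it wakes is left on this CPU.)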
> */ >void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) >{ > unsigned long flags; > int sync = 1; > > if (unlikely(!q)) > return; > > if (unlikely(!nr_exclusive)) > sync = 0; > > spin_lock_irqsave(&q->lock, flags); > __wake_up_common(q, mode, nr_exclusive, sync, NULL); > spin_unlock_irqrestore(&q->lock, flags); >} >EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ > >void fastcall complete(struct completion *x) >{ > unsigned long flags; > > spin_lock_irqsave(&x->wait.lock, flags); > x->done++; > __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, > 1, 0, NULL); > spin_unlock_irqrestore(&x->wait.lock, flags); >} >EXPORT_SYMBOL(complete); > >void fastcall complete_all(struct completion *x) >{ > unsigned long flags; > > spin_lock_irqsave(&x->wait.lock, flags); > x->done += UINT_MAX/2; > __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, > 0, 0, NULL); > spin_unlock_irqrestore(&x->wait.lock, flags); >} >EXPORT_SYMBOL(complete_all); > >void fastcall __sched wait_for_completion(struct completion *x) >{ > might_sleep(); > spin_lock_irq(&x->wait.lock); > if (!x->done) { > DECLARE_WAITQUEUE(wait, current); > > wait.flags |= WQ_FLAG_EXCLUSIVE; > __add_wait_queue_tail(&x->wait, &wait); > do { > __set_current_state(TASK_UNINTERRUPTIBLE); > spin_unlock_irq(&x->wait.lock); > schedule(); > spin_lock_irq(&x->wait.lock); > } while (!x->done); > __remove_wait_queue(&x->wait, &wait); > } > x->done--; > spin_unlock_irq(&x->wait.lock); >} >EXPORT_SYMBOL(wait_for_completion); > >#define SLEEP_ON_VAR \ > unsigned long flags; \ > wait_queue_t wait; \ > init_waitqueue_entry(&wait, current); > >#define SLEEP_ON_HEAD \ > spin_lock_irqsave(&q->lock,flags); \ > __add_wait_queue(q, &wait); \ > spin_unlock(&q->lock); > >#define SLEEP_ON_TAIL \ > spin_lock_irq(&q->lock); \ > __remove_wait_queue(q, &wait); \ > spin_unlock_irqrestore(&q->lock, flags); > >void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) >{ > SLEEP_ON_VAR > > current->state = TASK_INTERRUPTIBLE; > > SLEEP_ON_HEAD > schedule(); > SLEEP_ON_TAIL >} > >EXPORT_SYMBOL(interruptible_sleep_on); > >long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) >{ > SLEEP_ON_VAR > > current->state = TASK_INTERRUPTIBLE; > > SLEEP_ON_HEAD > timeout = schedule_timeout(timeout); > SLEEP_ON_TAIL > > return timeout; >} > >EXPORT_SYMBOL(interruptible_sleep_on_timeout); > >void fastcall __sched sleep_on(wait_queue_head_t *q) >{ > SLEEP_ON_VAR > > current->state = TASK_UNINTERRUPTIBLE; > > SLEEP_ON_HEAD > schedule(); > SLEEP_ON_TAIL >} > >EXPORT_SYMBOL(sleep_on); > >long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) >{ > SLEEP_ON_VAR > > current->state = TASK_UNINTERRUPTIBLE; > > SLEEP_ON_HEAD > timeout = schedule_timeout(timeout); > SLEEP_ON_TAIL > > return timeout; >} > >EXPORT_SYMBOL(sleep_on_timeout); > >void set_user_nice(task_t *p, long nice) >{ > unsigned long flags; > runqueue_t *rq; > int queued, delta; > > if (TASK_NICE(p) == nice || nice < -20 || nice > 19) > return; > /* > * We have to be careful, if called from sys_setpriority(), > * the task might be in the middle of scheduling on another CPU. 
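> * (Hence the task_rq_lock() below: it pins p to its runqueue and
> * takes that queue's lock before we touch its priority fields.)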
> */
> rq = task_rq_lock(p, &flags);
> /*
> * The RT priorities are set via setscheduler(), but we still
> * allow the 'normal' nice value to be set - but as expected
> * it won't have any effect on scheduling while the task's
> * policy is not SCHED_NORMAL:
> */
> if ((queued = (!rt_task(p) && task_queued(p))))
> dequeue_task(p);
>
> delta = PRIO_TO_NICE(p->static_prio) - nice;
> p->static_prio = NICE_TO_PRIO(nice);
> p->eb_shares = nice_to_shares(nice);
>
> if (queued) {
> int new_prio = effective_prio(p);
>
> enqueue_task(p, rq, new_prio);
> if (task_running(rq, p))
> rq->current_prio_slot = rq->queues + new_prio;
>
> /*
> * If the task increased its setting or is running and lowered
> * its setting, then reschedule its CPU:
> */
> if ((delta > 0) || ((delta < 0) && task_running(rq, p)))
> resched_task(rq->curr);
> }
>
> task_rq_unlock(rq, &flags);
>}
>
>EXPORT_SYMBOL(set_user_nice);
>
>#ifdef __ARCH_WANT_SYS_NICE
>
>/*
> * sys_nice - change the priority of the current process.
> * @increment: priority increment
> *
> * sys_setpriority is a more generic, but much slower function that
> * does similar things.
> */
>asmlinkage long sys_nice(int increment)
>{
> int retval;
> long nice;
>
> /*
> * Setpriority might change our priority at the same moment.
> * We don't have to worry. Conceptually one call occurs first
> * and we have a single winner.
> */
> if (increment < 0) {
> if (!capable(CAP_SYS_NICE))
> return -EPERM;
> if (increment < -40)
> increment = -40;
> }
> if (increment > 40)
> increment = 40;
>
> nice = PRIO_TO_NICE(current->static_prio) + increment;
> if (nice < -20)
> nice = -20;
> if (nice > 19)
> nice = 19;
>
> retval = security_task_setnice(current, nice);
> if (retval)
> return retval;
>
> set_user_nice(current, nice);
> return 0;
>}
>
>#endif
>
>/**
> * task_prio - return the priority value of a given task.
> * @p: the task in question.
> *
> * This is the priority value as seen by users in /proc.
> * RT tasks are offset by -200. Normal tasks are centered
> * around 0, value goes from -16 to +15.
> */
>int task_prio(task_t *p)
>{
> return effective_prio(p) - MAX_RT_PRIO;
>}
>
>/**
> * task_nice - return the nice value of a given task.
> * @p: the task in question.
> */
>int task_nice(task_t *p)
>{
> return TASK_NICE(p);
>}
>
>EXPORT_SYMBOL(task_nice);
>
>/**
> * idle_cpu - is a given cpu idle currently?
> * @cpu: the processor in question.
> */
>int idle_cpu(int cpu)
>{
> return cpu_curr(cpu) == cpu_rq(cpu)->idle;
>}
>
>EXPORT_SYMBOL_GPL(idle_cpu);
>
>/**
> * find_process_by_pid - find a process with a matching PID value.
> * @pid: the pid in question.
> */
>static inline task_t *find_process_by_pid(pid_t pid)
>{
> return pid ? find_task_by_pid(pid) : current;
>}
>
>/* Actually do priority change: must hold rq lock. */
>static void __setscheduler(struct task_struct *p, int policy, int prio)
>{
> BUG_ON(task_queued(p));
> p->policy = policy;
> p->rt_priority = prio;
>}
>
>/*
> * setscheduler - change the scheduling policy and/or RT priority of a thread.
> */
>static int setscheduler(pid_t pid, int policy, struct sched_param __user *param)
>{
> struct sched_param lp;
> int retval = -EINVAL;
> int queued;
> unsigned long flags;
> runqueue_t *rq;
> task_t *p;
>
> if (!param || pid < 0)
> goto out_nounlock;
>
> retval = -EFAULT;
> if (copy_from_user(&lp, param, sizeof(struct sched_param)))
> goto out_nounlock;
>
> /*
> * We play safe to avoid deadlocks.
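> * (Lock order used below: read_lock_irq(&tasklist_lock), then
> * task_rq_lock(); the runqueue lock is dropped again before
> * tasklist_lock is released.)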
> */
> read_lock_irq(&tasklist_lock);
>
> p = find_process_by_pid(pid);
>
> retval = -ESRCH;
> if (!p)
> goto out_unlock_tasklist;
>
> /*
> * To be able to change p->policy safely, the appropriate
> * runqueue lock must be held.
> */
> rq = task_rq_lock(p, &flags);
>
> if (policy < 0)
> policy = p->policy;
> else {
> retval = -EINVAL;
> if (policy != SCHED_FIFO && policy != SCHED_RR &&
> policy != SCHED_NORMAL)
> goto out_unlock;
> }
>
> /*
> * Valid priorities for SCHED_FIFO and SCHED_RR are
> * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0.
> */
> retval = -EINVAL;
> if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1)
> goto out_unlock;
> if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0))
> goto out_unlock;
>
> retval = -EPERM;
> if ((policy == SCHED_FIFO || policy == SCHED_RR) &&
> !capable(CAP_SYS_NICE))
> goto out_unlock;
> if ((current->euid != p->euid) && (current->euid != p->uid) &&
> !capable(CAP_SYS_NICE))
> goto out_unlock;
>
> retval = security_task_setscheduler(p, policy, &lp);
> if (retval)
> goto out_unlock;
>
> if ((queued = task_queued(p)))
> deactivate_task(p, task_rq(p));
> retval = 0;
> __setscheduler(p, policy, lp.sched_priority);
> if (queued) {
> int prio = effective_prio(p);
>
> __activate_task(p, task_rq(p), prio);
> /*
> * Reschedule if we are currently running on this runqueue and
> * our priority decreased, or if we are not currently running on
> * this runqueue and our priority is higher than the current's
> */
> if (preemption_warranted(prio, p, rq))
> resched_task(rq->curr);
> if (task_running(rq, p))
> rq->current_prio_slot = rq->queues + prio;
> }
>
>out_unlock:
> task_rq_unlock(rq, &flags);
>out_unlock_tasklist:
> read_unlock_irq(&tasklist_lock);
>
>out_nounlock:
> return retval;
>}
>
>/**
> * sys_sched_setscheduler - set/change the scheduler policy and RT priority
> * @pid: the pid in question.
> * @policy: new policy
> * @param: structure containing the new RT priority.
> */
>asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
> struct sched_param __user *param)
>{
> return setscheduler(pid, policy, param);
>}
>
>/**
> * sys_sched_setparam - set/change the RT priority of a thread
> * @pid: the pid in question.
> * @param: structure containing the new RT priority.
> */
>asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param)
>{
> return setscheduler(pid, -1, param);
>}
>
>/**
> * sys_sched_getscheduler - get the policy (scheduling class) of a thread
> * @pid: the pid in question.
> */
>asmlinkage long sys_sched_getscheduler(pid_t pid)
>{
> int retval = -EINVAL;
> task_t *p;
>
> if (pid < 0)
> goto out_nounlock;
>
> retval = -ESRCH;
> read_lock(&tasklist_lock);
> p = find_process_by_pid(pid);
> if (p) {
> retval = security_task_getscheduler(p);
> if (!retval)
> retval = p->policy;
> }
> read_unlock(&tasklist_lock);
>
>out_nounlock:
> return retval;
>}
>
>/**
> * sys_sched_getparam - get the RT priority of a thread
> * @pid: the pid in question.
> * @param: structure containing the RT priority.
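> * (Userspace sketch: struct sched_param sp; sched_getparam(pid, &sp)
> * leaves the task's rt_priority in sp.sched_priority, which is 0
> * for SCHED_NORMAL tasks.)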
> */ >asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) >{ > struct sched_param lp; > int retval = -EINVAL; > task_t *p; > > if (!param || pid < 0) > goto out_nounlock; > > read_lock(&tasklist_lock); > p = find_process_by_pid(pid); > retval = -ESRCH; > if (!p) > goto out_unlock; > > retval = security_task_getscheduler(p); > if (retval) > goto out_unlock; > > lp.sched_priority = p->rt_priority; > read_unlock(&tasklist_lock); > > /* > * This one might sleep, we cannot do it with a spinlock held ... > */ > retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; > >out_nounlock: > return retval; > >out_unlock: > read_unlock(&tasklist_lock); > return retval; >} > >/** > * sys_sched_setaffinity - set the cpu affinity of a process > * @pid: pid of the process > * @len: length in bytes of the bitmask pointed to by user_mask_ptr > * @user_mask_ptr: user-space pointer to the new cpu mask > */ >asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, > unsigned long __user *user_mask_ptr) >{ > cpumask_t new_mask; > int retval; > task_t *p; > > if (len < sizeof(new_mask)) > return -EINVAL; > > if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) > return -EFAULT; > > lock_cpu_hotplug(); > read_lock(&tasklist_lock); > > p = find_process_by_pid(pid); > if (!p) { > read_unlock(&tasklist_lock); > unlock_cpu_hotplug(); > return -ESRCH; > } > > /* > * It is not safe to call set_cpus_allowed with the > * tasklist_lock held. We will bump the task_struct's > * usage count and then drop tasklist_lock. > */ > get_task_struct(p); > read_unlock(&tasklist_lock); > > retval = -EPERM; > if ((current->euid != p->euid) && (current->euid != p->uid) && > !capable(CAP_SYS_NICE)) > goto out_unlock; > > retval = set_cpus_allowed(p, new_mask); > >out_unlock: > put_task_struct(p); > unlock_cpu_hotplug(); > return retval; >} > >/** > * sys_sched_getaffinity - get the cpu affinity of a process > * @pid: pid of the process > * @len: length in bytes of the bitmask pointed to by user_mask_ptr > * @user_mask_ptr: user-space pointer to hold the current cpu mask > */ >asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, > unsigned long __user *user_mask_ptr) >{ > unsigned int real_len; > cpumask_t mask; > int retval; > task_t *p; > > real_len = sizeof(mask); > if (len < real_len) > return -EINVAL; > > lock_cpu_hotplug(); > read_lock(&tasklist_lock); > > retval = -ESRCH; > p = find_process_by_pid(pid); > if (!p) > goto out_unlock; > > retval = 0; > cpus_and(mask, p->cpus_allowed, cpu_possible_map); > >out_unlock: > read_unlock(&tasklist_lock); > unlock_cpu_hotplug(); > if (retval) > return retval; > if (copy_to_user(user_mask_ptr, &mask, real_len)) > return -EFAULT; > return real_len; >} > >void get_task_sched_stats(const struct task_struct *tsk, struct task_sched_stats *stats) >{ > int on_runq = 0; > int on_cpu = 0; > unsigned long long timestamp; > runqueue_t *rq = this_rq_lock(); > > stats->timestamp = rq->timestamp_last_tick; > stats->avg_sleep_per_cycle = tsk->avg_sleep_per_cycle; > stats->avg_delay_per_cycle = tsk->avg_delay_per_cycle; > stats->avg_cpu_per_cycle = tsk->avg_cpu_per_cycle; > stats->cycle_count = tsk->cycle_count; > stats->total_sleep = tsk->total_sleep; > stats->total_cpu = tsk->total_cpu; > stats->total_delay = tsk->total_delay; > timestamp = tsk->sched_timestamp; > if ((on_runq = task_queued(tsk))) > on_cpu = rq->idle == tsk; > > rq_unlock(rq); > > /* > * Update values to the previous tick (only) > */ > if (stats->timestamp > timestamp) { 
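> /*
> * Charge the interval since the task's last scheduling event to
> * cpu, run-queue delay or sleep time, according to where the
> * task was when it was sampled above.
> */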
> unsigned long long delta = stats->timestamp - timestamp;
>
> if (on_cpu) {
> stats->avg_cpu_per_cycle += delta;
> stats->total_cpu += delta;
> } else if (on_runq) {
> stats->avg_delay_per_cycle += delta;
> stats->total_delay += delta;
> } else {
> stats->avg_sleep_per_cycle += delta;
> stats->total_sleep += delta;
> }
> }
> /*
> * Convert internal "real number" representation of average times
> * to integer values in nanoseconds
> */
> stats->avg_sleep_per_cycle = SCHED_AVG_RND(stats->avg_sleep_per_cycle);
> stats->avg_cpu_per_cycle = SCHED_AVG_RND(stats->avg_cpu_per_cycle);
> stats->avg_delay_per_cycle = SCHED_AVG_RND(stats->avg_delay_per_cycle);
>}
>
>EXPORT_SYMBOL(get_task_sched_stats);
>
>/*
> * Get scheduling statistics for the nominated CPU
> */
>void get_cpu_sched_stats(unsigned int cpu, struct cpu_sched_stats *stats)
>{
> int idle;
> unsigned long long idle_timestamp;
> runqueue_t *rq = cpu_rq(cpu);
>
> /*
> * No need to crash the whole machine if they've asked for stats for
> * a non-existent CPU, just send back zero.
> */
> if (rq == NULL) {
> stats->timestamp = 0;
> stats->total_idle = 0;
> stats->total_busy = 0;
> stats->total_delay = 0;
> stats->nr_switches = 0;
>
> return;
> }
> local_irq_disable();
> spin_lock(&rq->lock);
> idle = rq->curr == rq->idle;
> stats->timestamp = rq->timestamp_last_tick;
> idle_timestamp = rq->idle->sched_timestamp;
> stats->total_idle = rq->idle->total_cpu;
> stats->total_busy = rq->idle->total_delay;
> stats->total_delay = rq->total_delay;
> stats->nr_switches = rq->nr_switches;
> rq_unlock(rq);
>
> /*
> * Update idle/busy time to the current tick
> */
> if (idle)
> stats->total_idle += (stats->timestamp - idle_timestamp);
> else
> stats->total_busy += (stats->timestamp - idle_timestamp);
>}
>
>EXPORT_SYMBOL(get_cpu_sched_stats);
>
>/**
> * sys_sched_yield - yield the current processor to other threads.
> *
> * If there are no other threads running on this
> * CPU then this function will return.
> */
>asmlinkage long sys_sched_yield(void)
>{
> runqueue_t *rq = this_rq_lock();
>
> if (sched_mode == SCHED_MODE_STAIRCASE)
> goto yield_staircase;
> /*
> * (special rule: RT tasks will just roundrobin in the active
> * array.)
> */
> if (likely(!rt_task(current))) {
> /* If there are other tasks on this CPU make sure that as many
> * of them as possible/judicious get some CPU before this task
> */
> dequeue_task(current);
> current->flags |= PF_YIELDED;
> rq->current_prio_slot = rq->queues + (IDLE_PRIO - 1);
> enqueue_task(current, rq, rq->current_prio_slot->prio);
> } else {
> list_del_init(&current->run_list);
> list_add_tail(&current->run_list, &rq->current_prio_slot->queue);
> }
> goto out;
>yield_staircase:
> dequeue_task(current);
> current->slice = slice(current);
> current->time_slice = RR_INTERVAL();
> if (!rt_task(current)) {
> current->flags |= PF_YIELDED;
> rq->current_prio_slot = rq->queues + MAX_PRIO - 1;
> }
> current->burst = 0;
> enqueue_task(current, rq, rq->current_prio_slot->prio);
>out:
> /*
> * Since we are going to call schedule() anyway, there's
> * no need to preempt or enable interrupts:
> */
> _raw_spin_unlock(&rq->lock);
> preempt_enable_no_resched();
>
> schedule();
>
> return 0;
>}
>
>void __sched __cond_resched(void)
>{
> set_current_state(TASK_RUNNING);
> schedule();
>}
>
>EXPORT_SYMBOL(__cond_resched);
>
>/**
> * yield - yield the current processor to other threads.
> *
> * this is a shortcut for kernel-space yielding - it marks the
> * thread runnable and calls sys_sched_yield().
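> * (Kernel-space usage sketch, with a hypothetical predicate:
> *     while (!resource_ready())   /* hypothetical */
> *             yield();
> * such busy-yield loops work but waste CPU if the wait is long.)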
> */ >void __sched yield(void) >{ > set_current_state(TASK_RUNNING); > sys_sched_yield(); >} > >EXPORT_SYMBOL(yield); > >/* > * This task is about to go to sleep on IO. Increment rq->nr_iowait so > * that process accounting knows that this is a task in IO wait state. > * > * But don't do that if it is a deliberate, throttling IO wait (this task > * has set its backing_dev_info: the queue against which it should throttle) > */ >void __sched io_schedule(void) >{ > struct runqueue *rq = this_rq(); > > atomic_inc(&rq->nr_iowait); > schedule(); > atomic_dec(&rq->nr_iowait); >} > >EXPORT_SYMBOL(io_schedule); > >long __sched io_schedule_timeout(long timeout) >{ > struct runqueue *rq = this_rq(); > long ret; > > atomic_inc(&rq->nr_iowait); > ret = schedule_timeout(timeout); > atomic_dec(&rq->nr_iowait); > return ret; >} > >/** > * sys_sched_get_priority_max - return maximum RT priority. > * @policy: scheduling class. > * > * this syscall returns the maximum rt_priority that can be used > * by a given scheduling class. > */ >asmlinkage long sys_sched_get_priority_max(int policy) >{ > int ret = -EINVAL; > > switch (policy) { > case SCHED_FIFO: > case SCHED_RR: > ret = MAX_USER_RT_PRIO-1; > break; > case SCHED_NORMAL: > ret = 0; > break; > } > return ret; >} > >/** > * sys_sched_get_priority_min - return minimum RT priority. > * @policy: scheduling class. > * > * this syscall returns the minimum rt_priority that can be used > * by a given scheduling class. > */ >asmlinkage long sys_sched_get_priority_min(int policy) >{ > int ret = -EINVAL; > > switch (policy) { > case SCHED_FIFO: > case SCHED_RR: > ret = 1; > break; > case SCHED_NORMAL: > ret = 0; > } > return ret; >} > >/** > * sys_sched_rr_get_interval - return the default timeslice of a process. > * @pid: pid of the process. > * @interval: userspace pointer to the timeslice value. > * > * this syscall writes the default timeslice value of a given process > * into the user-space timespec buffer. A value of '0' means infinity. > */ >asmlinkage >long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) >{ > int retval = -EINVAL; > struct timespec t; > task_t *p; > > if (pid < 0) > goto out_nounlock; > > retval = -ESRCH; > read_lock(&tasklist_lock); > p = find_process_by_pid(pid); > if (!p) > goto out_unlock; > > retval = security_task_getscheduler(p); > if (retval) > goto out_unlock; > > jiffies_to_timespec(p->policy & SCHED_FIFO ? > 0 : task_timeslice(p), &t); > read_unlock(&tasklist_lock); > retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; >out_nounlock: > return retval; >out_unlock: > read_unlock(&tasklist_lock); > return retval; >} > >static inline struct task_struct *eldest_child(struct task_struct *p) >{ > if (list_empty(&p->children)) return NULL; > return list_entry(p->children.next,struct task_struct,sibling); >} > >static inline struct task_struct *older_sibling(struct task_struct *p) >{ > if (p->sibling.prev==&p->parent->children) return NULL; > return list_entry(p->sibling.prev,struct task_struct,sibling); >} > >static inline struct task_struct *younger_sibling(struct task_struct *p) >{ > if (p->sibling.next==&p->parent->children) return NULL; > return list_entry(p->sibling.next,struct task_struct,sibling); >} > >static void show_task(task_t * p) >{ > task_t *relative; > unsigned state; > unsigned long free = 0; > static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" }; > > printk("%-13.13s ", p->comm); > state = p->state ? 
__ffs(p->state) + 1 : 0;
> if (state < ARRAY_SIZE(stat_nam))
> printk(stat_nam[state]);
> else
> printk("?");
>#if (BITS_PER_LONG == 32)
> if (state == TASK_RUNNING)
> printk(" running ");
> else
> printk(" %08lX ", thread_saved_pc(p));
>#else
> if (state == TASK_RUNNING)
> printk(" running task ");
> else
> printk(" %016lx ", thread_saved_pc(p));
>#endif
>#ifdef CONFIG_DEBUG_STACK_USAGE
> {
> unsigned long * n = (unsigned long *) (p->thread_info+1);
> while (!*n)
> n++;
> free = (unsigned long) n - (unsigned long)(p->thread_info+1);
> }
>#endif
> printk("%5lu %5d %6d ", free, p->pid, p->parent->pid);
> if ((relative = eldest_child(p)))
> printk("%5d ", relative->pid);
> else
> printk(" ");
> if ((relative = younger_sibling(p)))
> printk("%7d", relative->pid);
> else
> printk(" ");
> if ((relative = older_sibling(p)))
> printk(" %5d", relative->pid);
> else
> printk(" ");
> if (!p->mm)
> printk(" (L-TLB)\n");
> else
> printk(" (NOTLB)\n");
>
> if (state != TASK_RUNNING)
> show_stack(p, NULL);
>}
>
>void show_state(void)
>{
> task_t *g, *p;
>
>#if (BITS_PER_LONG == 32)
> printk("\n"
> " sibling\n");
> printk(" task PC pid father child younger older\n");
>#else
> printk("\n"
> " sibling\n");
> printk(" task PC pid father child younger older\n");
>#endif
> read_lock(&tasklist_lock);
> do_each_thread(g, p) {
> /*
> * reset the NMI-timeout, listing all files on a slow
> * console might take a lot of time:
> */
> touch_nmi_watchdog();
> show_task(p);
> } while_each_thread(g, p);
>
> read_unlock(&tasklist_lock);
>}
>
>void __devinit init_idle(task_t *idle, int cpu)
>{
> runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle));
> unsigned long flags;
>
> local_irq_save(flags);
> double_rq_lock(idle_rq, rq);
>
> idle_rq->curr = idle_rq->idle = idle;
> deactivate_task(idle, rq);
> /*
> * Initialize scheduling statistics counters as they may provide
> * valuable information about the CPU, e.g. avg_cpu_time_per_cycle
> * for the idle task will be an estimate of the average time the
> * CPU is idle
> */
> initialize_stats(idle);
> initialize_bonuses(idle);
> idle->sched_timestamp = rq->timestamp_last_tick;
> idle->state = TASK_RUNNING;
> idle->burst = 0;
> set_task_cpu(idle, cpu);
> /*
> * Putting the idle process onto a run queue simplifies the selection of
> * the next task to run in schedule().
> */
> list_add_tail(&idle->run_list, &idle_rq->queues[IDLE_PRIO].queue);
> /*
> * The idle task is the current task on idle_rq
> */
> idle_rq->current_prio_slot = idle_rq->queues + IDLE_PRIO;
> double_rq_unlock(idle_rq, rq);
> set_tsk_need_resched(idle);
> local_irq_restore(flags);
>
> /* Set the preempt count _outside_ the spinlocks! */
>#ifdef CONFIG_PREEMPT
> idle->thread_info->preempt_count = (idle->lock_depth >= 0);
>#else
> idle->thread_info->preempt_count = 0;
>#endif
>}
>
>/*
> * In a system that switches off the HZ timer nohz_cpu_mask
> * indicates which cpus entered this state. This is used
> * in the rcu update to wait only for active cpus. For systems
> * which do not switch off the HZ timer nohz_cpu_mask should
> * always be CPU_MASK_NONE.
> */
>cpumask_t nohz_cpu_mask = CPU_MASK_NONE;
>
>#ifdef CONFIG_SMP
>/*
> * This is how migration works:
> *
> * 1) we queue a migration_req_t structure in the source CPU's
> * runqueue and wake up that CPU's migration thread.
> * 2) we down() the locked semaphore => thread blocks.
> * 3) migration thread wakes up (implicitly it forces the migrated
> * thread off the CPU)
> * 4) it gets the migration request and checks whether the migrated
> * task is still in the wrong runqueue.
> * 5) if it's in the wrong runqueue then the migration thread removes
> * it and puts it into the right queue.
> * 6) migration thread up()s the semaphore.
> * 7) we wake up and the migration is done.
> */
>
>/*
> * Change a given task's CPU affinity. Migrate the thread to a
> * proper CPU and schedule it away if the CPU it's executing on
> * is removed from the allowed bitmask.
> *
> * NOTE: the caller must have a valid reference to the task, the
> * task must not exit() & deallocate itself prematurely. The
> * call is not atomic; no spinlocks may be held.
> */
>int set_cpus_allowed(task_t *p, cpumask_t new_mask)
>{
> unsigned long flags;
> int ret = 0;
> migration_req_t req;
> runqueue_t *rq;
>
> rq = task_rq_lock(p, &flags);
> if (any_online_cpu(new_mask) == NR_CPUS) {
> ret = -EINVAL;
> goto out;
> }
>
> p->cpus_allowed = new_mask;
> /* Can the task run on the task's current CPU? If so, we're done */
> if (cpu_isset(task_cpu(p), new_mask))
> goto out;
>
> if (migrate_task(p, any_online_cpu(new_mask), &req)) {
> /* Need help from migration thread: drop lock and wait. */
> task_rq_unlock(rq, &flags);
> wake_up_process(rq->migration_thread);
> wait_for_completion(&req.done);
> return 0;
> }
>out:
> task_rq_unlock(rq, &flags);
> return ret;
>}
>
>EXPORT_SYMBOL_GPL(set_cpus_allowed);
>
>/*
> * Move (not current) task off this cpu, onto dest cpu. We're doing
> * this because either it can't run here any more (set_cpus_allowed()
> * away from this CPU, or CPU going down), or because we're
> * attempting to rebalance this task on exec (sched_balance_exec).
> *
> * So we race with normal scheduler movements, but that's OK, as long
> * as the task is no longer on this CPU.
> */
>static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
>{
> runqueue_t *rq_dest, *rq_src;
>
> if (unlikely(cpu_is_offline(dest_cpu)))
> return;
>
> rq_src = cpu_rq(src_cpu);
> rq_dest = cpu_rq(dest_cpu);
>
> double_rq_lock(rq_src, rq_dest);
> /* Already moved. */
> if (task_cpu(p) != src_cpu)
> goto out;
> /* Affinity changed (again). */
> if (!cpu_isset(dest_cpu, p->cpus_allowed))
> goto out;
>
> if (task_queued(p)) {
> unsigned long long delta;
> /*
> * Sync timestamp with rq_dest's before activating.
> * The same thing could be achieved by doing this step
> * afterwards, and pretending it was a local activate.
> * This way is cleaner and logically correct.
> */
> p->timestamp = p->timestamp - rq_src->timestamp_last_tick
> + rq_dest->timestamp_last_tick;
> deactivate_task(p, rq_src);
> /*
> * Don't do set_task_cpu() until AFTER we dequeue the task,
> * since dequeue_task() relies on task_cpu() always being
> * accurate.
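> * (At this point p has been dequeued from rq_src; set_task_cpu()
> * below repoints task_cpu(p) at dest_cpu before the task is
> * re-activated on rq_dest.)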
> */ > set_task_cpu(p, dest_cpu); > delta = (rq_dest->timestamp_last_tick - p->sched_timestamp); > p->avg_delay_per_cycle += delta; > p->avg_delay_per_sub_cycle += delta; > p->total_delay += delta; > if (preemption_warranted(activate_task(p, rq_dest, 0), p, rq_dest)) > resched_task(rq_dest->curr); > } else { > unsigned long long delta; > > set_task_cpu(p, dest_cpu); > delta = (rq_dest->timestamp_last_tick - p->sched_timestamp); > p->avg_sleep_per_cycle += delta; > p->total_sleep += delta; > } > p->sched_timestamp = rq_dest->timestamp_last_tick; > >out: > double_rq_unlock(rq_src, rq_dest); >} > >/* > * migration_thread - this is a highprio system thread that performs > * thread migration by bumping thread off CPU then 'pushing' onto > * another runqueue. > */ >static int migration_thread(void * data) >{ > runqueue_t *rq; > int cpu = (long)data; > > rq = cpu_rq(cpu); > BUG_ON(rq->migration_thread != current); > > set_current_state(TASK_INTERRUPTIBLE); > while (!kthread_should_stop()) { > struct list_head *head; > migration_req_t *req; > > if (current->flags & PF_FREEZE) > refrigerator(PF_FREEZE); > > spin_lock_irq(&rq->lock); > > if (cpu_is_offline(cpu)) { > spin_unlock_irq(&rq->lock); > goto wait_to_die; > } > > if (rq->active_balance) { > active_load_balance(rq, cpu); > rq->active_balance = 0; > } > > head = &rq->migration_queue; > > if (list_empty(head)) { > spin_unlock_irq(&rq->lock); > schedule(); > set_current_state(TASK_INTERRUPTIBLE); > continue; > } > req = list_entry(head->next, migration_req_t, list); > list_del_init(head->next); > > if (req->type == REQ_MOVE_TASK) { > spin_unlock(&rq->lock); > __migrate_task(req->task, smp_processor_id(), > req->dest_cpu); > local_irq_enable(); > } else if (req->type == REQ_SET_DOMAIN) { > rq->sd = req->sd; > spin_unlock_irq(&rq->lock); > } else { > spin_unlock_irq(&rq->lock); > WARN_ON(1); > } > > complete(&req->done); > } > __set_current_state(TASK_RUNNING); > return 0; > >wait_to_die: > /* Wait for kthread_stop */ > set_current_state(TASK_INTERRUPTIBLE); > while (!kthread_should_stop()) { > schedule(); > set_current_state(TASK_INTERRUPTIBLE); > } > __set_current_state(TASK_RUNNING); > return 0; >} > >#ifdef CONFIG_HOTPLUG_CPU >/* migrate_all_tasks - function to migrate all tasks from the dead cpu. */ >static void migrate_all_tasks(int src_cpu) >{ > struct task_struct *tsk, *t; > int dest_cpu; > unsigned int node; > > write_lock_irq(&tasklist_lock); > > /* watch out for per node tasks, let's stay on this node */ > node = cpu_to_node(src_cpu); > > do_each_thread(t, tsk) { > cpumask_t mask; > if (tsk == current) > continue; > > if (task_cpu(tsk) != src_cpu) > continue; > > /* Figure out where this task should go (attempting to > * keep it on-node), and check if it can be migrated > * as-is. NOTE that kernel threads bound to more than > * one online cpu will be migrated. */ > mask = node_to_cpumask(node); > cpus_and(mask, mask, tsk->cpus_allowed); > dest_cpu = any_online_cpu(mask); > if (dest_cpu == NR_CPUS) > dest_cpu = any_online_cpu(tsk->cpus_allowed); > if (dest_cpu == NR_CPUS) { > cpus_clear(tsk->cpus_allowed); > cpus_complement(tsk->cpus_allowed); > dest_cpu = any_online_cpu(tsk->cpus_allowed); > > /* Don't tell them about moving exiting tasks > or kernel threads (both mm NULL), since > they never leave kernel. 
*/
> if (tsk->mm && printk_ratelimit())
> printk(KERN_INFO "process %d (%s) no "
> "longer affine to cpu%d\n",
> tsk->pid, tsk->comm, src_cpu);
> }
>
> __migrate_task(tsk, src_cpu, dest_cpu);
> } while_each_thread(t, tsk);
>
> write_unlock_irq(&tasklist_lock);
>}
>
>/* Schedules idle task to be the next runnable task on current CPU.
> * It does so by boosting its priority to highest possible and adding it to
> * the _front_ of runqueue. Used by CPU offline code.
> */
>void sched_idle_next(void)
>{
> int cpu = smp_processor_id();
> runqueue_t *rq = this_rq();
> struct task_struct *p = rq->idle;
> unsigned long flags;
>
> /* cpu has to be offline */
> BUG_ON(cpu_online(cpu));
>
> /* Strictly not necessary since rest of the CPUs are stopped by now
> * and interrupts disabled on current cpu.
> */
> spin_lock_irqsave(&rq->lock, flags);
>
> /* Add idle task to _front_ of its priority queue */
> dequeue_task(p);
> __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
> enqueue_task_head(p, rq, 0);
> rq->nr_running++;
>
> spin_unlock_irqrestore(&rq->lock, flags);
>}
>#endif /* CONFIG_HOTPLUG_CPU */
>
>/*
> * migration_call - callback that gets triggered when a CPU is added.
> * Here we can start up the necessary migration thread for the new CPU.
> */
>static int migration_call(struct notifier_block *nfb, unsigned long action,
> void *hcpu)
>{
> int cpu = (long)hcpu;
> struct task_struct *p;
> struct runqueue *rq;
> unsigned long flags;
>
> switch (action) {
> case CPU_UP_PREPARE:
> p = kthread_create(migration_thread, hcpu, "migration/%d",cpu);
> if (IS_ERR(p))
> return NOTIFY_BAD;
> kthread_bind(p, cpu);
> /* Must be high prio: stop_machine expects to yield to it. */
> rq = task_rq_lock(p, &flags);
> __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1);
> task_rq_unlock(rq, &flags);
> cpu_rq(cpu)->migration_thread = p;
> break;
> case CPU_ONLINE:
> /* Strictly unnecessary, as first user will wake it. */
> wake_up_process(cpu_rq(cpu)->migration_thread);
> break;
>#ifdef CONFIG_HOTPLUG_CPU
> case CPU_UP_CANCELED:
> /* Unbind it from offline cpu so it can run. Fall thru. */
> kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id());
> kthread_stop(cpu_rq(cpu)->migration_thread);
> cpu_rq(cpu)->migration_thread = NULL;
> break;
> case CPU_DEAD:
> migrate_all_tasks(cpu);
> rq = cpu_rq(cpu);
> kthread_stop(rq->migration_thread);
> rq->migration_thread = NULL;
> /* Idle task back to normal in IDLE_PRIO slot */
> rq = task_rq_lock(rq->idle, &flags);
> deactivate_task(rq->idle, rq);
> rq->idle->static_prio = IDLE_PRIO;
> __setscheduler(rq->idle, SCHED_NORMAL, 0);
> enqueue_task(rq->idle, rq, IDLE_PRIO);
> task_rq_unlock(rq, &flags);
> BUG_ON(rq->nr_running != 0);
>
> /* No need to migrate the tasks: it was best-effort if
> * they didn't do lock_cpu_hotplug(). Just wake up
> * the requestors. */
> spin_lock_irq(&rq->lock);
> while (!list_empty(&rq->migration_queue)) {
> migration_req_t *req;
> req = list_entry(rq->migration_queue.next,
> migration_req_t, list);
> BUG_ON(req->type != REQ_MOVE_TASK);
> list_del_init(&req->list);
> complete(&req->done);
> }
> spin_unlock_irq(&rq->lock);
> break;
>#endif
> }
> return NOTIFY_OK;
>}
>
>/* Register at highest priority so that task migration (migrate_all_tasks)
> * happens before everything else.
> */
>static struct notifier_block __devinitdata migration_notifier = {
> .notifier_call = migration_call,
> .priority = 10
>};
>
>int __init migration_init(void)
>{
> void *cpu = (void *)(long)smp_processor_id();
> /* Start one for boot CPU.
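> * The notifier is only registered afterwards, so secondary CPUs get
> * their threads via the CPU_UP_PREPARE/CPU_ONLINE callbacks instead.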
*/
> migration_call(&migration_notifier, CPU_UP_PREPARE, cpu);
> migration_call(&migration_notifier, CPU_ONLINE, cpu);
> register_cpu_notifier(&migration_notifier);
> return 0;
>}
>#endif
>
>/*
> * The 'big kernel lock'
> *
> * This spinlock is taken and released recursively by lock_kernel()
> * and unlock_kernel(). It is transparently dropped and reacquired
> * over schedule(). It is used to protect legacy code that hasn't
> * been migrated to a proper locking design yet.
> *
> * Don't use in new code.
> *
> * Note: spinlock debugging needs this even on !CONFIG_SMP.
> */
>spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED;
>EXPORT_SYMBOL(kernel_flag);
>
>#ifdef CONFIG_SMP
>/* Attach the domain 'sd' to 'cpu' as its base domain */
>void cpu_attach_domain(struct sched_domain *sd, int cpu)
>{
> migration_req_t req;
> unsigned long flags;
> runqueue_t *rq = cpu_rq(cpu);
> int local = 1;
>
> lock_cpu_hotplug();
>
> spin_lock_irqsave(&rq->lock, flags);
>
> if (cpu == smp_processor_id() || !cpu_online(cpu)) {
> rq->sd = sd;
> } else {
> init_completion(&req.done);
> req.type = REQ_SET_DOMAIN;
> req.sd = sd;
> list_add(&req.list, &rq->migration_queue);
> local = 0;
> }
>
> spin_unlock_irqrestore(&rq->lock, flags);
>
> if (!local) {
> wake_up_process(rq->migration_thread);
> wait_for_completion(&req.done);
> }
>
> unlock_cpu_hotplug();
>}
>
>#ifdef ARCH_HAS_SCHED_DOMAIN
>extern void __init arch_init_sched_domains(void);
>#else
>static struct sched_group sched_group_cpus[NR_CPUS];
>static DEFINE_PER_CPU(struct sched_domain, cpu_domains);
>#ifdef CONFIG_NUMA
>static struct sched_group sched_group_nodes[MAX_NUMNODES];
>static DEFINE_PER_CPU(struct sched_domain, node_domains);
>static void __init arch_init_sched_domains(void)
>{
> int i;
> struct sched_group *first_node = NULL, *last_node = NULL;
>
> /* Set up domains */
> for_each_cpu(i) {
> int node = cpu_to_node(i);
> cpumask_t nodemask = node_to_cpumask(node);
> struct sched_domain *node_sd = &per_cpu(node_domains, i);
> struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
>
> *node_sd = SD_NODE_INIT;
> node_sd->span = cpu_possible_map;
> node_sd->groups = &sched_group_nodes[cpu_to_node(i)];
>
> *cpu_sd = SD_CPU_INIT;
> cpus_and(cpu_sd->span, nodemask, cpu_possible_map);
> cpu_sd->groups = &sched_group_cpus[i];
> cpu_sd->parent = node_sd;
> }
>
> /* Set up groups */
> for (i = 0; i < MAX_NUMNODES; i++) {
> cpumask_t tmp = node_to_cpumask(i);
> cpumask_t nodemask;
> struct sched_group *first_cpu = NULL, *last_cpu = NULL;
> struct sched_group *node = &sched_group_nodes[i];
> int j;
>
> cpus_and(nodemask, tmp, cpu_possible_map);
>
> if (cpus_empty(nodemask))
> continue;
>
> node->cpumask = nodemask;
> node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask);
>
> for_each_cpu_mask(j, node->cpumask) {
> struct sched_group *cpu = &sched_group_cpus[j];
>
> cpus_clear(cpu->cpumask);
> cpu_set(j, cpu->cpumask);
> cpu->cpu_power = SCHED_LOAD_SCALE;
>
> if (!first_cpu)
> first_cpu = cpu;
> if (last_cpu)
> last_cpu->next = cpu;
> last_cpu = cpu;
> }
> last_cpu->next = first_cpu;
>
> if (!first_node)
> first_node = node;
> if (last_node)
> last_node->next = node;
> last_node = node;
> }
> last_node->next = first_node;
>
> mb();
> for_each_cpu(i) {
> struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i);
> cpu_attach_domain(cpu_sd, i);
> }
>}
>
>#else /* !CONFIG_NUMA */
>static void __init arch_init_sched_domains(void)
>{
> int i;
> struct sched_group *first_cpu = NULL, *last_cpu = NULL;
>
> /* Set up domains */
> for_each_cpu(i) {
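> /*
> * Editorial note: in the non-NUMA case every CPU shares one flat
> * domain level; each cpu_sd spans cpu_possible_map and points at
> * a per-CPU single-processor group, set up in the loop below.
> */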
> struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); > > *cpu_sd = SD_CPU_INIT; > cpu_sd->span = cpu_possible_map; > cpu_sd->groups = &sched_group_cpus[i]; > } > > /* Set up CPU groups */ > for_each_cpu_mask(i, cpu_possible_map) { > struct sched_group *cpu = &sched_group_cpus[i]; > > cpus_clear(cpu->cpumask); > cpu_set(i, cpu->cpumask); > cpu->cpu_power = SCHED_LOAD_SCALE; > > if (!first_cpu) > first_cpu = cpu; > if (last_cpu) > last_cpu->next = cpu; > last_cpu = cpu; > } > last_cpu->next = first_cpu; > > mb(); /* domains were modified outside the lock */ > for_each_cpu(i) { > struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); > cpu_attach_domain(cpu_sd, i); > } >} > >#endif /* CONFIG_NUMA */ >#endif /* ARCH_HAS_SCHED_DOMAIN */ > >#define SCHED_DOMAIN_DEBUG >#ifdef SCHED_DOMAIN_DEBUG >void sched_domain_debug(void) >{ > int i; > > for_each_cpu(i) { > runqueue_t *rq = cpu_rq(i); > struct sched_domain *sd; > int level = 0; > > sd = rq->sd; > > printk(KERN_DEBUG "CPU%d: %s\n", > i, (cpu_online(i) ? " online" : "offline")); > > do { > int j; > char str[NR_CPUS]; > struct sched_group *group = sd->groups; > cpumask_t groupmask, tmp; > > cpumask_scnprintf(str, NR_CPUS, sd->span); > cpus_clear(groupmask); > > printk(KERN_DEBUG); > for (j = 0; j < level + 1; j++) > printk(" "); > printk("domain %d: span %s\n", level, str); > > if (!cpu_isset(i, sd->span)) > printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); > if (!cpu_isset(i, group->cpumask)) > printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); > if (!group->cpu_power) > printk(KERN_DEBUG "ERROR domain->cpu_power not set\n"); > > printk(KERN_DEBUG); > for (j = 0; j < level + 2; j++) > printk(" "); > printk("groups:"); > do { > if (!group) { > printk(" ERROR: NULL"); > break; > } > > if (!cpus_weight(group->cpumask)) > printk(" ERROR empty group:"); > > cpus_and(tmp, groupmask, group->cpumask); > if (cpus_weight(tmp) > 0) > printk(" ERROR repeated CPUs:"); > > cpus_or(groupmask, groupmask, group->cpumask); > > cpumask_scnprintf(str, NR_CPUS, group->cpumask); > printk(" %s", str); > > group = group->next; > } while (group != sd->groups); > printk("\n"); > > if (!cpus_equal(sd->span, groupmask)) > printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); > > level++; > sd = sd->parent; > > if (sd) { > cpus_and(tmp, groupmask, sd->span); > if (!cpus_equal(tmp, groupmask)) > printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); > } > > } while (sd); > } >} >#else >#define sched_domain_debug() {} >#endif > >void __init sched_init_smp(void) >{ > arch_init_sched_domains(); > sched_domain_debug(); >} >#else >void __init sched_init_smp(void) >{ >} >#endif /* CONFIG_SMP */ > >int in_sched_functions(unsigned long addr) >{ > /* Linker adds these: start and end of __sched functions */ > extern char __sched_text_start[], __sched_text_end[]; > return addr >= (unsigned long)__sched_text_start > && addr < (unsigned long)__sched_text_end; >} > >void __init sched_init(void) >{ > runqueue_t *rq; > int i, k; > >#ifdef CONFIG_SMP > /* Set up an initial dummy domain for early boot */ > static struct sched_domain sched_domain_init; > static struct sched_group sched_group_init; > cpumask_t cpu_mask_all = CPU_MASK_ALL; > > memset(&sched_domain_init, 0, sizeof(struct sched_domain)); > sched_domain_init.span = cpu_mask_all; > sched_domain_init.groups = &sched_group_init; > sched_domain_init.last_balance = jiffies; > sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ > > 
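> /*
> * Editorial note: the dummy group below mirrors the dummy domain
> * above; both exist only so early-boot code can walk rq->sd safely
> * before arch_init_sched_domains() installs the real topology.
> */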
memset(&sched_group_init, 0, sizeof(struct sched_group)); > sched_group_init.cpumask = cpu_mask_all; > sched_group_init.next = &sched_group_init; > sched_group_init.cpu_power = SCHED_LOAD_SCALE; >#endif > > for (i = 0; i < NR_CPUS; i++) { > rq = cpu_rq(i); > spin_lock_init(&rq->lock); > > rq->cache_ticks = 0; > rq->preempted = 0; > >#ifdef CONFIG_SMP > rq->sd = &sched_domain_init; > rq->cpu_load = 0; > rq->active_balance = 0; > rq->push_cpu = 0; > rq->migration_thread = NULL; > INIT_LIST_HEAD(&rq->migration_queue); >#endif > atomic_set(&rq->nr_iowait, 0); > > for (k = 0; k <= IDLE_PRIO; k++) { > rq->queues[k].prio = k; > INIT_LIST_HEAD(&rq->queues[k].queue); > } > bitmap_zero(rq->bitmap, NUM_PRIO_SLOTS); > // delimiter for bitsearch > __set_bit(IDLE_PRIO, rq->bitmap); > rq->current_prio_slot = rq->queues + (IDLE_PRIO - 20); > rq->timestamp_last_tick = sched_clock(); > rq->next_prom_due = (jiffies + get_prom_interval(rq)); > rq->total_delay = 0; > rq->eb_yardstick = 0; > rq->eb_ticks_to_decay += time_slice_ticks; > } > /* > * We have to do a little magic to get the first > * thread right in SMP mode. > */ > rq = this_rq(); > rq->curr = current; > rq->idle = current; > set_task_cpu(current, smp_processor_id()); > wake_up_forked_process(current); > > /* > * The boot idle thread does lazy MMU switching as well: > */ > atomic_inc(&init_mm.mm_count); > enter_lazy_tlb(&init_mm, current); >} > >#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP >void __might_sleep(char *file, int line) >{ >#if defined(in_atomic) > static unsigned long prev_jiffy; /* ratelimiting */ > > if ((in_atomic() || irqs_disabled()) && > system_state == SYSTEM_RUNNING) { > if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) > return; > prev_jiffy = jiffies; > printk(KERN_ERR "Debug: sleeping function called from invalid" > " context at %s:%d\n", file, line); > printk("in_atomic():%d, irqs_disabled():%d\n", > in_atomic(), irqs_disabled()); > dump_stack(); > } >#endif >} >EXPORT_SYMBOL(__might_sleep); >#endif > > >#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) >/* > * This could be a long-held lock. If another CPU holds it for a long time, > * and that CPU is not asked to reschedule then *this* CPU will spin on the > * lock for a long time, even if *this* CPU is asked to reschedule. > * > * So what we do here, in the slow (contended) path is to spin on the lock by > * hand while permitting preemption. > * > * Called inside preempt_disable(). 
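> *
> * The loop below briefly re-enables preemption while it watches the
> * lock, then disables it again before retrying the trylock, so this
> * CPU can still be rescheduled while it waits.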
> */ >void __sched __preempt_spin_lock(spinlock_t *lock) >{ > if (preempt_count() > 1) { > _raw_spin_lock(lock); > return; > } > do { > preempt_enable(); > while (spin_is_locked(lock)) > cpu_relax(); > preempt_disable(); > } while (!_raw_spin_trylock(lock)); >} > >EXPORT_SYMBOL(__preempt_spin_lock); > >void __sched __preempt_write_lock(rwlock_t *lock) >{ > if (preempt_count() > 1) { > _raw_write_lock(lock); > return; > } > > do { > preempt_enable(); > while (rwlock_is_locked(lock)) > cpu_relax(); > preempt_disable(); > } while (!_raw_write_trylock(lock)); >} > >EXPORT_SYMBOL(__preempt_write_lock); >#endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */ > >#if defined(CONFIG_SYSCTL) >/* > * CPU scheduler control via /proc/sys/cpusched/xxx > */ >enum >{ > CPU_SCHED_END_OF_LIST=0, > CPU_SCHED_TIME_SLICE=1, > CPU_SCHED_BASE_PROMOTION_INTERVAL, > CPU_SCHED_MAX_IA_BONUS, > CPU_SCHED_MAX_TPT_BONUS, > CPU_SCHED_IA_THRESHOLD, > CPU_SCHED_CPU_HOG_THRESHOLD, > CPU_SCHED_LOG_AT_EXIT, > CPU_SCHED_INTERACTIVE, > CPU_SCHED_COMPUTE, > CPU_SCHED_MODE, > CPU_SCHED_INITIAL_IA_BONUS, > CPU_SCHED_HOG_SUB_CYCLE_THRESHOLD >}; > >static const unsigned int zero = 0; >static const unsigned int one = 1; >#define min_milli_value zero >static const unsigned int max_milli_value = 1000; >#define min_max_ia_bonus zero >static const unsigned int max_max_ia_bonus = MAX_MAX_IA_BONUS; >#define min_max_tpt_bonus zero >static const unsigned int max_max_tpt_bonus = MAX_MAX_TPT_BONUS; >static unsigned int time_slice_msecs = DEFAULT_TIME_SLICE_MSECS; >#define min_time_slice_msecs one >static const unsigned int max_time_slice_msecs = MAX_TIME_SLICE_MSECS; >static unsigned int base_prom_interval_msecs = BASE_PROM_INTERVAL_MSECS; >#define min_base_prom_interval_msecs one >static const unsigned int max_base_prom_interval_msecs = INT_MAX; >#define max_hog_sub_cycle_threshold max_base_prom_interval_msecs > >static int proc_time_slice_msecs(ctl_table *ctp, int write, struct file *fp, > void __user *buffer, size_t *lenp) >{ > int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); > > if ((res == 0) && write) > time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(time_slice_msecs); > > return res; >} > >static int proc_base_prom_interval_msecs(ctl_table *ctp, int write, struct file *fp, > void __user *buffer, size_t *lenp) >{ > int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); > > if ((res == 0) && write) > base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(base_prom_interval_msecs); > > return res; >} > >static int proc_cpu_hog_threshold(ctl_table *ctp, int write, struct file *fp, > void __user *buffer, size_t *lenp) >{ > int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); > > if ((res == 0) && write) > cpu_hog_threshold = calc_proportion(cpu_hog_threshold_ppt, 1000); > > return res; >} > >static int proc_ia_threshold(ctl_table *ctp, int write, struct file *fp, > void __user *buffer, size_t *lenp) >{ > int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); > > if ((res == 0) && write) > ia_threshold = calc_proportion(ia_threshold_ppt, 1000); > > return res; >} > >#define SCHED_MODE_BUFFER_LEN 16 >static char current_sched_mode[SCHED_MODE_BUFFER_LEN] = ""; >static int proc_sched_mode(ctl_table *ctp, int write, struct file *fp, > void __user *buffer, size_t *lenp) >{ > int res; > > strcpy(current_sched_mode, sched_mode_names[sched_mode]); > res = proc_dostring(ctp, write, fp, buffer, lenp); > > if ((res == 0) && write) { > int i; > > for (i = 0; sched_mode_names[i] != NULL; i++) > if (strcmp(current_sched_mode, 
sched_mode_names[i]) == 0)
>				break;
>		if (sched_mode_names[i] == NULL)
>			res = -EINVAL;
>		else	/* set the scheduling mode */
>			sched_mode = i;
>	}
>
>	return res;
>}
>
>ctl_table cpu_sched_table[] = {
>	{
>		.ctl_name	= CPU_SCHED_TIME_SLICE,
>		.procname	= "time_slice",
>		.data		= &time_slice_msecs,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_time_slice_msecs,
>		.extra1		= (void *)&min_time_slice_msecs,
>		.extra2		= (void *)&max_time_slice_msecs
>	},
>	{
>		.ctl_name	= CPU_SCHED_BASE_PROMOTION_INTERVAL,
>		.procname	= "base_promotion_interval",
>		.data		= &base_prom_interval_msecs,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_base_prom_interval_msecs,
>		.extra1		= (void *)&min_base_prom_interval_msecs,
>		.extra2		= (void *)&max_base_prom_interval_msecs
>	},
>	{
>		.ctl_name	= CPU_SCHED_MAX_IA_BONUS,
>		.procname	= "max_ia_bonus",
>		.data		= &max_ia_bonus,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_dointvec_minmax,
>		.extra1		= (void *)&min_max_ia_bonus,
>		.extra2		= (void *)&max_max_ia_bonus
>	},
>	{
>		.ctl_name	= CPU_SCHED_INITIAL_IA_BONUS,
>		.procname	= "initial_ia_bonus",
>		.data		= &initial_ia_bonus,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_dointvec_minmax,
>		.extra1		= (void *)&min_max_ia_bonus,
>		.extra2		= (void *)&max_max_ia_bonus
>	},
>	{
>		.ctl_name	= CPU_SCHED_MAX_TPT_BONUS,
>		.procname	= "max_tpt_bonus",
>		.data		= &max_tpt_bonus,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_dointvec_minmax,
>		.extra1		= (void *)&min_max_tpt_bonus,
>		.extra2		= (void *)&max_max_tpt_bonus
>	},
>	{
>		.ctl_name	= CPU_SCHED_HOG_SUB_CYCLE_THRESHOLD,
>		.procname	= "hog_sub_cycle_threshold",
>		.data		= &hog_sub_cycle_threshold,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_dointvec_minmax,
>		.extra1		= (void *)&zero,
>		.extra2		= (void *)&max_hog_sub_cycle_threshold
>	},
>	{
>		.ctl_name	= CPU_SCHED_IA_THRESHOLD,
>		.procname	= "ia_threshold",
>		.data		= &ia_threshold_ppt,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_ia_threshold,
>		.extra1		= (void *)&min_milli_value,
>		.extra2		= (void *)&max_milli_value
>	},
>	{
>		.ctl_name	= CPU_SCHED_CPU_HOG_THRESHOLD,
>		.procname	= "cpu_hog_threshold",
>		.data		= &cpu_hog_threshold_ppt,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_cpu_hog_threshold,
>		.extra1		= (void *)&min_milli_value,
>		.extra2		= (void *)&max_milli_value
>	},
>	{
>		.ctl_name	= CPU_SCHED_LOG_AT_EXIT,
>		.procname	= "log_at_exit",
>		.data		= &log_at_exit,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_dointvec_minmax,
>		.extra1		= (void *)&zero,
>		.extra2		= (void *)&one
>	},
>	{
>		.ctl_name	= CPU_SCHED_INTERACTIVE,
>		.procname	= "interactive",
>		.data		= &sched_interactive,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_dointvec_minmax,
>		.extra1		= (void *)&zero,
>		.extra2		= (void *)&one
>	},
>	{
>		.ctl_name	= CPU_SCHED_COMPUTE,
>		.procname	= "compute",
>		.data		= &sched_compute,
>		.maxlen		= sizeof (unsigned int),
>		.mode		= 0644,
>		.proc_handler	= &proc_dointvec_minmax,
>		.extra1		= (void *)&zero,
>		.extra2		= (void *)&one
>	},
>	{
>		.ctl_name	= CPU_SCHED_MODE,
>		.procname	= "mode",
>		.data		= &current_sched_mode,
>		.maxlen		= SCHED_MODE_BUFFER_LEN,
>		.mode		= 0644,
>		.proc_handler	= &proc_sched_mode,
>	},
>	{ .ctl_name = CPU_SCHED_END_OF_LIST }
>};
>#endif
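
For reference, the table above is registered under /proc/sys/cpusched, so these knobs can be driven from userspace through procfs. A minimal sketch of switching the scheduling mode follows; the helper is hypothetical and not part of the patch, and it assumes the mode strings registered in sched_mode_names[] earlier in this file.

/* Hypothetical userspace helper, not part of the patch: select a
 * scheduling mode by writing one of the strings registered in
 * sched_mode_names[] to /proc/sys/cpusched/mode. */
#include <stdio.h>

static int set_sched_mode(const char *mode)
{
	FILE *f = fopen("/proc/sys/cpusched/mode", "w");

	if (!f)
		return -1;
	/* proc_sched_mode() compares the written string against
	 * sched_mode_names[] and fails with -EINVAL on unknown names. */
	if (fputs(mode, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f) == 0 ? 0 : -1;
}

int main(void)
{
	/* "eb" is one of the mode strings this patch registers. */
	return set_sched_mode("eb") ? 1 : 0;
}

Because writes are validated in proc_sched_mode(), an unrecognized string is rejected and the current mode is left untouched.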