/* * kernel/sched.c * * Kernel scheduler and related syscalls * * Copyright (C) 1991-2002 Linus Torvalds * * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and * make semaphores SMP safe * 1998-11-19 Implemented schedule_timeout() and related stuff * by Andrea Arcangeli * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: * hybrid priority-list and round-robin design with * an array-switch method of distributing timeslices * and per-CPU runqueues. Cleanups and useful suggestions * by Davide Libenzi, preemptible kernel bits by Robert Love. * 2003-09-03 Interactivity tuning by Con Kolivas. * 2004-04-02 Scheduler domains code by Nick Piggin * 2004-06-11 New staircase scheduling policy by Con Kolivas with help * from William Lee Irwin III, Zwane Mwaikambo & Peter Williams. * 2004-06-03 Single priority array, simplified interactive bonus * mechanism and throughput bonus mechanism by Peter Williams * (Courtesy of Aurema Pty Ltd, www.aurema.com) */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include enum sched_mode_enum { SCHED_MODE_STAIRCASE, SCHED_MODE_PRIORITY_BASED, SCHED_MODE_ENTITLEMENT_BASED }; static enum sched_mode_enum sched_mode = SCHED_MODE_STAIRCASE; #ifdef CONFIG_SYSCTL static const char *sched_mode_names[] = { "sc", /* SCHED_MODE_STAIRCASE */ "pb", /* SCHED_MODE_PRIORITY_BASED */ "eb", /* SCHED_MODE_ENTITLEMENT_BASED */ NULL /* end of list marker */ }; #endif /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], * and back. */ #define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) #define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) #define TASK_NICE(p) PRIO_TO_NICE((p)->static_prio) /* * 'User priority' is the nice value converted to something we * can work with better when scaling various scheduler parameters, * it's a [ 0 ... 39 ] range. */ #define USER_PRIO(p) ((p)-MAX_RT_PRIO) #define TASK_USER_PRIO(p) USER_PRIO((p)->static_prio) #define MAX_USER_PRIO (USER_PRIO(MAX_PRIO)) /* * Some helpers for converting nanosecond timing to jiffy resolution */ #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) static int sched_compute = 0; /* *This is the time all tasks within the same priority round robin. *compute setting is reserved for dedicated computational scheduling *and has ten times larger intervals. */ #define _RR_INTERVAL ((10 * HZ / 1000) ? : 1) #define RR_INTERVAL() (_RR_INTERVAL * (1 + 9 * sched_compute)) /* * These are the 'tuning knobs' of the scheduler: * Making MAX_TOTAL_BONUS bigger than 19 causes mysterious crashes during boot * this causes the number of longs in the bitmap to increase from 5 to 6 * and that's a limit on bit map size P.W. */ #define MAX_TOTAL_BONUS 19 #define MAX_MAX_IA_BONUS 10 #define MAX_MAX_TPT_BONUS (MAX_TOTAL_BONUS - MAX_MAX_IA_BONUS) #define DEFAULT_MAX_IA_BONUS MAX_MAX_IA_BONUS #define DEFAULT_MAX_TPT_BONUS ((DEFAULT_MAX_IA_BONUS) / 2) static unsigned int max_ia_bonus = DEFAULT_MAX_IA_BONUS; static unsigned int initial_ia_bonus = 1; static unsigned int max_tpt_bonus = DEFAULT_MAX_TPT_BONUS; /* * Define some mini Kalman filter for estimating various averages, etc. * To make it more efficient the denominator of the fixed point rational * numbers used to store the averages and the response half life will * be chosen so that the fixed point rational number reperesentation * of (1 - alpha) * i (where i is an integer) will be i. * Some of this is defined in linux/sched.h */ /* * Fixed denominator rational numbers for use by the CPU scheduler */ #define SCHED_AVG_OFFSET 4 /* * Get the rounded integer value of a scheduling statistic average field * i.e. those fields whose names begin with avg_ */ #define SCHED_AVG_RND(x) \ (((x) + (1 << (SCHED_AVG_OFFSET - 1))) >> (SCHED_AVG_OFFSET)) #define SCHED_AVG_ALPHA ((1 << SCHED_AVG_OFFSET) - 1) #define SCHED_AVG_MUL(a, b) (((a) * (b)) >> SCHED_AVG_OFFSET) #define SCHED_AVG_REAL(a) ((a) << SCHED_AVG_OFFSET) /* * Convert nice to shares * Proportional symmetry is aimed for: i.e. * (nice_to_shares(0) / nice_to_shares(19)) == (nice_to_shares(-20) / nice_to_shares(0)) * Make sure that this function is robust for variations of EB_SHARES_PER_NICE */ static inline unsigned int nice_to_shares(int nice) { unsigned int result = DEFAULT_EB_SHARES; if (nice > 0) result -= (nice * (20 * EB_SHARES_PER_NICE - 1)) / 19; else if (nice < 0) result += (nice * nice * ((20 * EB_SHARES_PER_NICE - 1) * EB_SHARES_PER_NICE)) / 20; return result; } #define SCHED_IA_BONUS_OFFSET 8 #define SCHED_IA_BONUS_ALPHA ((1 << SCHED_IA_BONUS_OFFSET) - 1) #define SCHED_IA_BONUS_MUL(a, b) (((a) * (b)) >> SCHED_IA_BONUS_OFFSET) /* * Get the rounded integer value of the interactive bonus */ #define SCHED_IA_BONUS_RND(x) \ (((x) + (1 << (SCHED_IA_BONUS_OFFSET - 1))) >> (SCHED_IA_BONUS_OFFSET)) static inline void apply_sched_avg_decay(unsigned long long *valp) { *valp = SCHED_AVG_MUL(*valp, SCHED_AVG_ALPHA); } static inline void update_sched_ia_bonus(struct task_struct *p, unsigned long long incr) { p->interactive_bonus = SCHED_AVG_MUL(p->interactive_bonus, SCHED_AVG_ALPHA); p->interactive_bonus += incr; } static inline unsigned long long sched_div_64(unsigned long long a, unsigned long long b) { #if BITS_PER_LONG < 64 /* * Assume that there's no 64 bit divide available */ if (a < b) return 0; /* * Scale down until b less than 32 bits so that we can do * a divide using do_div() */ while (b > ULONG_MAX) { a >>= 1; b >>= 1; } (void)do_div(a, (unsigned long)b); return a; #else return a / b; #endif } #define PROPORTION_OFFSET 32 #define PROPORTION_ONE ((unsigned long long)1 << PROPORTION_OFFSET) #define PROPORTION_OVERFLOW (((unsigned long long)1 << (64 - PROPORTION_OFFSET)) - 1) #define PROP_FM_PPT(a) (((unsigned long long)(a) * PROPORTION_ONE) / 1000) /* * Convert a / b to a proportion in the range 0 to PROPORTION_ONE * Requires a <= b or may get a divide by zero exception */ static inline unsigned long long calc_proportion(unsigned long long a, unsigned long long b) { if (unlikely(a == b)) return PROPORTION_ONE; while (a > PROPORTION_OVERFLOW) { a >>= 1; b >>= 1; } return sched_div_64(a << PROPORTION_OFFSET, b); } /* * Map the given proportion to an unsigned long long in the specified range * Requires range < PROPORTION_ONE to avoid overflow */ static inline unsigned long long map_proportion(unsigned long long prop, unsigned long long range) { return (prop * range) >> PROPORTION_OFFSET; } static inline unsigned long long map_proportion_rnd(unsigned long long prop, unsigned long long range) { return map_proportion((prop >> 1), (range * 2 + 1)); } /* * Tasks that have a CPU usage rate greater than this threshold (in parts per * thousand) are considered to be CPU bound and start to lose interactive bonus * points */ #define DEFAULT_CPU_HOG_THRESHOLD 900 static unsigned int cpu_hog_threshold_ppt = DEFAULT_CPU_HOG_THRESHOLD; static unsigned long long cpu_hog_threshold = PROP_FM_PPT(DEFAULT_CPU_HOG_THRESHOLD); /* * Tasks that would sleep for more than 900 parts per thousand of the time if * they had the CPU to themselves are considered to be interactive provided * that their average sleep duration per scheduling cycle isn't too long */ #define DEFAULT_IA_THRESHOLD 900 static unsigned int ia_threshold_ppt = DEFAULT_IA_THRESHOLD; static unsigned long long ia_threshold = PROP_FM_PPT(DEFAULT_IA_THRESHOLD); #define LOWER_MAX_IA_SLEEP SCHED_AVG_REAL(15 * 60LL * NSEC_PER_SEC) #define UPPER_MAX_IA_SLEEP SCHED_AVG_REAL(2 * 60 * 60LL * NSEC_PER_SEC) /* * What "base time slice" for nice 0 and "average time slice" evaluated to */ #define MSECS_TO_JIFFIES(x) (((x) * (HZ * 2 + 1)) / 2000) #define MSECS_TO_JIFFIES_MIN_1(x) (MSECS_TO_JIFFIES(x) ? MSECS_TO_JIFFIES(x) : 1) #define DEFAULT_TIME_SLICE_MSECS 100 #define MAX_TIME_SLICE_MSECS 1000 static unsigned int time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(DEFAULT_TIME_SLICE_MSECS); static unsigned int slice(const task_t *p); static inline unsigned int task_timeslice(const task_t *p) { if (sched_mode == SCHED_MODE_STAIRCASE) return slice(p); return time_slice_ticks; } #define task_hot(p, now, sd) ((now) - (p)->timestamp < (sd)->cache_hot_time) /* * These are the runqueue data structures: */ #define IDLE_PRIO (MAX_PRIO + MAX_TOTAL_BONUS) #define NUM_PRIO_SLOTS (IDLE_PRIO + 1) /* * Is the run queue idle? */ #define RUNQUEUE_IDLE(rq) ((rq)->curr == (rq)->idle) /* * Control values for niceness */ #define PROSPECTIVE_BASE_PROM_INTERVAL_MSECS ((DEFAULT_TIME_SLICE_MSECS * 110) / 100) #if (PROSPECTIVE_BASE_PROM_INTERVAL_MSECS > 0) #define BASE_PROM_INTERVAL_MSECS PROSPECTIVE_BASE_PROM_INTERVAL_MSECS #else #define BASE_PROM_INTERVAL_MSECS DEFAULT_TIME_SLICE_MSECS #endif static unsigned int base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(BASE_PROM_INTERVAL_MSECS); typedef struct runqueue runqueue_t; struct prio_slot { unsigned int prio; struct list_head queue; }; /* * This is the main, per-CPU runqueue data structure. * * Locking rule: those places that want to lock multiple runqueues * (such as the load balancing or the thread migration code), lock * acquire operations must be ordered by ascending &runqueue. */ struct runqueue { spinlock_t lock; /* * nr_running and cpu_load should be in the same cacheline because * remote CPUs use both these fields when doing load calculation. */ unsigned long nr_running; #ifdef CONFIG_SMP unsigned long cpu_load; #endif unsigned long long nr_switches; unsigned long nr_uninterruptible; unsigned long long timestamp_last_tick; unsigned long long total_delay; unsigned int cache_ticks, preempted; task_t *curr, *idle; struct mm_struct *prev_mm; DECLARE_BITMAP(bitmap, NUM_PRIO_SLOTS); struct prio_slot queues[NUM_PRIO_SLOTS]; struct prio_slot *current_prio_slot; unsigned long next_prom_due; atomic_t nr_iowait; unsigned long long eb_yardstick; unsigned long long eb_ticks_to_decay; #ifdef CONFIG_SMP struct sched_domain *sd; /* For active balancing */ int active_balance; int push_cpu; task_t *migration_thread; struct list_head migration_queue; #endif }; static DEFINE_PER_CPU(struct runqueue, runqueues); #define for_each_domain(cpu, domain) \ for (domain = cpu_rq(cpu)->sd; domain; domain = domain->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) /* * Default context-switch locking: */ #ifndef prepare_arch_switch # define prepare_arch_switch(rq, next) do { } while (0) # define finish_arch_switch(rq, next) spin_unlock_irq(&(rq)->lock) # define task_running(rq, p) ((rq)->curr == (p)) #endif static inline unsigned long get_prom_interval(const struct runqueue *rq) { if (rq->nr_running < 2) return base_prom_interval_ticks; return (rq->nr_running - 1) * base_prom_interval_ticks; } static inline void decay_eb_yardstick(runqueue_t *rq) { static const unsigned long long decay_per_interval = PROP_FM_PPT(990); rq->eb_yardstick = map_proportion(decay_per_interval, rq->eb_yardstick); rq->eb_ticks_to_decay = time_slice_ticks; } #define EB_PAR 19 static inline void set_eb_yardstick(runqueue_t *rq, task_t *p) { rq->eb_yardstick = p->cpu_usage_rate_per_share; p->eb_priority = MAX_RT_PRIO + EB_PAR; rq->eb_ticks_to_decay = time_slice_ticks; } static inline int task_should_be_yardstick(const task_t *p, const runqueue_t *rq) { return (p->cpu_usage_rate_per_share > rq->eb_yardstick); } static inline void update_eb_yardstick(task_t *p, runqueue_t *rq) { if (unlikely(rt_task(p))) return; if (task_should_be_yardstick(p, rq)) set_eb_yardstick(rq, p); } /* * task_rq_lock - lock the runqueue a given task resides on and disable * interrupts. Note the ordering: we can safely lookup the task_rq without * explicitly disabling preemption. */ static runqueue_t *task_rq_lock(task_t *p, unsigned long *flags) { struct runqueue *rq; repeat_lock_task: local_irq_save(*flags); rq = task_rq(p); spin_lock(&rq->lock); if (unlikely(rq != task_rq(p))) { spin_unlock_irqrestore(&rq->lock, *flags); goto repeat_lock_task; } return rq; } static inline void task_rq_unlock(runqueue_t *rq, unsigned long *flags) { spin_unlock_irqrestore(&rq->lock, *flags); } /* * rq_lock - lock a given runqueue and disable interrupts. */ static runqueue_t *this_rq_lock(void) { runqueue_t *rq; local_irq_disable(); rq = this_rq(); spin_lock(&rq->lock); return rq; } static inline void rq_unlock(runqueue_t *rq) { spin_unlock_irq(&rq->lock); } static inline int preemption_warranted(unsigned int prio, const struct task_struct *p, runqueue_t *rq) { if (prio < rq->current_prio_slot->prio) { if (rt_task(p) || !sched_compute || rq->cache_ticks >= cache_decay_ticks || !p->mm || rq->curr == rq->idle) return 1; rq->preempted = 1; } return 0; } static inline int task_queued(const task_t *task) { return !list_empty(&task->run_list); } /* * Adding/removing a task to/from a runqueue: */ static void dequeue_task(struct task_struct *p) { /* * If p is the last task in this priority slot then slotp will be * a pointer to the head of the list in the sunqueue structure */ struct list_head *slotp = p->run_list.next; /* * Initialize after removal from the list so that list_empty() works * as a means for testing whether the task is runnable */ list_del_init(&p->run_list); if (list_empty(slotp)) __clear_bit(list_entry(slotp, struct prio_slot, queue)->prio, task_rq(p)->bitmap); } static void enqueue_task(struct task_struct *p, runqueue_t *rq, int prio) { list_add_tail(&p->run_list, &rq->queues[prio].queue); __set_bit(prio, rq->bitmap); } /* * Used by the migration code - we pull tasks from the head of the * remote queue so we want these tasks to show up at the head of the * local queue: */ static inline void enqueue_task_head(struct task_struct *p, runqueue_t *rq, int prio) { list_add(&p->run_list, &rq->queues[prio].queue); __set_bit(prio, rq->bitmap); } /* * __activate_task - move a task to the runqueue. */ static inline void __activate_task(task_t *p, runqueue_t *rq, int prio) { if (sched_mode != SCHED_MODE_STAIRCASE) p->time_slice = task_timeslice(p); enqueue_task(p, rq, prio); rq->nr_running++; } /* * burst - extra intervals an interactive task can run for at best priority * instead of descending priorities. */ static unsigned int burst(const task_t *p) { unsigned int task_user_prio; if (rt_task(p)) return p->burst; task_user_prio = TASK_USER_PRIO(p); if (likely(task_user_prio < 40)) return 39 - task_user_prio; else return 0; } static void inc_burst(task_t *p) { unsigned int best_burst; best_burst = burst(p); if (p->burst < best_burst) p->burst++; } static void dec_burst(task_t *p) { if (p->burst) p->burst--; } /* * slice - the duration a task runs before getting requeued at it's best * priority and has it's burst decremented. */ static unsigned int slice(const task_t *p) { unsigned int slice = RR_INTERVAL(); if (!rt_task(p)) slice += burst(p) * RR_INTERVAL(); return slice; } /* * sched_interactive - sysctl which allows interactive tasks to have bursts */ static int sched_interactive = 1; static int hog_sub_cycle_threshold = 10; /* * Calculate CPU usage rate and sleepiness. * This never gets called on real time tasks */ static void calculate_rates(task_t *p) { unsigned long long bl = p->avg_sleep_per_cycle + p->avg_cpu_per_cycle; if (unlikely(bl == 0)) { p->sleepiness = PROPORTION_ONE; p->cpu_usage_rate = 0; } else { unsigned long long edpss = p->avg_delay_per_cycle; p->sleepiness = calc_proportion(p->avg_sleep_per_cycle, bl); p->cpu_usage_rate = calc_proportion(p->avg_cpu_per_cycle, edpss + bl); if (unlikely(p->sub_cycle_count > hog_sub_cycle_threshold)) { unsigned long long scu; unsigned long long sbl; sbl = p->avg_delay_per_sub_cycle + p->avg_cpu_per_sub_cycle; scu = calc_proportion(p->avg_cpu_per_sub_cycle, sbl); /* if (scu > p->cpu_usage_rate) */ p->cpu_usage_rate = scu; } } p->cpu_usage_rate_per_share = sched_div_64(p->cpu_usage_rate, p->eb_shares); } /* * Calculate entitlement based priority. * This never gets called on real time tasks */ static void calculate_eb_priority(task_t *p, const runqueue_t *rq) { /* * Prevent possible divide by zero and take shortcut */ if (unlikely(p->cpu_usage_rate_per_share == 0)) { p->eb_priority = MAX_RT_PRIO; } else if (unlikely(p->cpu_usage_rate_per_share > rq->eb_yardstick)) { unsigned long long prop = calc_proportion(rq->eb_yardstick, p->cpu_usage_rate_per_share); p->eb_priority = MAX_PRIO - map_proportion_rnd(prop, EB_PAR + 1); } else { unsigned long long prop = calc_proportion(p->cpu_usage_rate_per_share, rq->eb_yardstick); p->eb_priority = MAX_RT_PRIO + map_proportion_rnd(prop, EB_PAR); } } /* * Update various statistics for the end of a * ((on_run_queue :-> on_cpu)* :-> sleep) cycle. * We can't just do this in activate_task() as every invocation of that * function is not the genuine end of a cycle. */ static void update_stats_for_cycle(task_t *p, const runqueue_t *rq) { unsigned long long delta; apply_sched_avg_decay(&p->avg_delay_per_cycle); apply_sched_avg_decay(&p->avg_cpu_per_cycle); delta = (rq->timestamp_last_tick - p->sched_timestamp); p->avg_sleep_per_cycle += delta; p->total_sleep += delta; /* * Do this second so that averages for all measures are for * the current cycle */ apply_sched_avg_decay(&p->avg_sleep_per_cycle); p->sched_timestamp = rq->timestamp_last_tick; p->sub_cycle_count = 0; p->cycle_count++; if (!rt_task(p)) { /* we con't care about these for real time tasks */ apply_sched_avg_decay(&p->avg_delay_per_sub_cycle); apply_sched_avg_decay(&p->avg_cpu_per_sub_cycle); if (sched_mode != SCHED_MODE_STAIRCASE) { calculate_rates(p); if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED) calculate_eb_priority(p, rq); } } } /* * Check whether a task with an interactive bonus still qualifies and if not * decrease its bonus * This never gets called on real time tasks */ static void reassess_cpu_boundness(task_t *p) { /* * No point going any further if there's no bonus to lose */ if (p->interactive_bonus == 0) return; if (p->cpu_usage_rate > cpu_hog_threshold) update_sched_ia_bonus(p, 0); } /* * Check whether a task qualifies for an interactive bonus and if it does * increase its bonus * This never gets called on real time tasks */ static void reassess_interactiveness(task_t *p) { /* * No sleep means not interactive (in most cases), but */ if (p->avg_sleep_per_cycle > LOWER_MAX_IA_SLEEP) { /* * Really long sleeps mean it's probably not interactive */ if (p->avg_sleep_per_cycle > UPPER_MAX_IA_SLEEP) update_sched_ia_bonus(p, 0); return; } if (p->sleepiness > ia_threshold) update_sched_ia_bonus(p, p->sleepiness); else if (p->sub_cycle_count == 0) reassess_cpu_boundness(p); } /* * Check whether a task qualifies for a throughput bonus and if it does * give it one * This never gets called on real time tasks */ static void recalc_throughput_bonus(task_t *p, unsigned long long load) { if (unlikely(p->sub_cycle_count > hog_sub_cycle_threshold)) { /* * No delay means no bonus, but * NB this test also avoids a possible divide by zero error if * cpu is also zero */ if (p->avg_delay_per_sub_cycle == 0) { p->throughput_bonus = 0; return; } p->throughput_bonus = calc_proportion(p->avg_delay_per_sub_cycle, p->avg_delay_per_sub_cycle + load * p->avg_cpu_per_sub_cycle); return; } /* * No delay means no bonus, but * NB this test also avoids a possible divide by zero error if * cpu is also zero */ if (p->avg_delay_per_cycle == 0) { p->throughput_bonus = 0; return; } p->throughput_bonus = calc_proportion(p->avg_delay_per_cycle, p->avg_delay_per_cycle + load * p->avg_cpu_per_cycle); } /* * effective_prio - dynamic priority dependent on burst. * The priority normally decreases by one each RR_INTERVAL. * As the burst increases the priority stays at the top "stair" or * priority for longer. */ static int effective_prio(task_t *p) { int prio; unsigned int full_slice, used_slice, first_slice; unsigned int best_burst; unsigned int miabl, mtpbl, bonus_factor; if (rt_task(p)) return (MAX_USER_RT_PRIO - 1) - p->rt_priority; switch (sched_mode) { case SCHED_MODE_STAIRCASE: goto staircase_prio; case SCHED_MODE_ENTITLEMENT_BASED: prio = p->eb_priority; break; default: prio = p->static_prio; } /* * kernel threads get maximum bonuses */ if (p->mm == NULL) return prio; miabl = max_ia_bonus; mtpbl = max_tpt_bonus; bonus_factor = (miabl + mtpbl); bonus_factor -= map_proportion_rnd(SCHED_IA_BONUS_RND(p->interactive_bonus), miabl); bonus_factor -= map_proportion_rnd(p->throughput_bonus, mtpbl); return prio + bonus_factor; staircase_prio: best_burst = burst(p); full_slice = slice(p); used_slice = full_slice - p->slice; if (p->burst > best_burst) p->burst = best_burst; first_slice = RR_INTERVAL(); if (sched_interactive && !sched_compute) first_slice *= (p->burst + 1); prio = MAX_PRIO - 1 - best_burst; if (used_slice < first_slice) return prio; prio += 1 + (used_slice - first_slice) / RR_INTERVAL(); if (prio > MAX_PRIO - 1) prio = MAX_PRIO - 1; return prio; } /* * recalc_task_prio - this checks for tasks that run ultra short timeslices * or have just forked a thread/process and make them continue their old * slice instead of starting a new one at high priority. * This is not called on real time tasks */ static void recalc_task_prio(task_t *p, unsigned long long now) { unsigned long sleep_time = now - p->timestamp; unsigned long ns_totalrun = p->totalrun + p->runtime; unsigned long total_run = NS_TO_JIFFIES(ns_totalrun); if (p->flags & PF_FORKED || ((!(NS_TO_JIFFIES(p->runtime)) || !sched_interactive || sched_compute) && NS_TO_JIFFIES(p->runtime + sleep_time) < RR_INTERVAL())) { p->flags &= ~PF_FORKED; if (p->slice - total_run < 1) { p->totalrun = 0; dec_burst(p); } else { p->totalrun = ns_totalrun; p->slice -= total_run; } } else { if (!(p->flags & PF_UISLEEP)) inc_burst(p); p->runtime = 0; p->totalrun = 0; } } /* * activate_task - move a task to the runqueue and do priority recalculation * return prio to allow preemption testing */ static int activate_task(task_t *p, runqueue_t *rq, int local) { int prio; unsigned long long now = sched_clock(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ runqueue_t *this_rq = this_rq(); now = (now - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; } #endif if (sched_mode == SCHED_MODE_STAIRCASE) { if (!rt_task(p)) { p->slice = slice(p); recalc_task_prio(p, now); } p->time_slice = RR_INTERVAL(); } p->flags &= ~PF_UISLEEP; prio = effective_prio(p); p->timestamp = now; __activate_task(p, rq, prio); return prio; } /* * deactivate_task - remove a task from the runqueue. */ static void deactivate_task(struct task_struct *p, runqueue_t *rq) { rq->nr_running--; if (p->state == TASK_UNINTERRUPTIBLE) { p->flags |= PF_UISLEEP; rq->nr_uninterruptible++; } dequeue_task(p); } /* * resched_task - mark a task 'to be rescheduled now'. * * On UP this means the setting of the need_resched flag, on SMP it * might also involve a cross-CPU call to trigger the scheduler on * the target CPU. */ #ifdef CONFIG_SMP static void resched_task(task_t *p) { int need_resched, nrpolling; preempt_disable(); /* minimise the chance of sending an interrupt to poll_idle() */ nrpolling = test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); need_resched = test_and_set_tsk_thread_flag(p,TIF_NEED_RESCHED); nrpolling |= test_tsk_thread_flag(p,TIF_POLLING_NRFLAG); if (!need_resched && !nrpolling && (task_cpu(p) != smp_processor_id())) smp_send_reschedule(task_cpu(p)); preempt_enable(); } #else static inline void resched_task(task_t *p) { set_tsk_need_resched(p); } #endif /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. */ inline int task_curr(task_t *p) { return cpu_curr(task_cpu(p)) == p; } #ifdef CONFIG_SMP enum request_type { REQ_MOVE_TASK, REQ_SET_DOMAIN, }; typedef struct { struct list_head list; enum request_type type; /* For REQ_MOVE_TASK */ task_t *task; int dest_cpu; /* For REQ_SET_DOMAIN */ struct sched_domain *sd; struct completion done; } migration_req_t; /* * The task's runqueue lock must be held. * Returns true if you have to wait for migration thread. */ static int migrate_task(task_t *p, int dest_cpu, migration_req_t *req) { runqueue_t *rq = task_rq(p); /* * If the task is not on a runqueue (and not running), then * it is sufficient to simply update the task's cpu field. */ if (!task_queued(p) && !task_running(rq, p)) { set_task_cpu(p, dest_cpu); return 0; } init_completion(&req->done); req->type = REQ_MOVE_TASK; req->task = p; req->dest_cpu = dest_cpu; list_add(&req->list, &rq->migration_queue); return 1; } /* * wait_task_inactive - wait for a thread to unschedule. * * The caller must ensure that the task *will* unschedule sometime soon, * else this function might spin for a *long* time. This function can't * be called with interrupts off, or it may introduce deadlock with * smp_call_function() if an IPI is sent by the same process we are * waiting to become inactive. */ void wait_task_inactive(task_t * p) { unsigned long flags; runqueue_t *rq; int preempted; repeat: rq = task_rq_lock(p, &flags); /* Must be off runqueue entirely, not preempted. */ if (unlikely(task_queued(p))) { /* If it's preempted, we yield. It could be a while. */ preempted = !task_running(rq, p); task_rq_unlock(rq, &flags); cpu_relax(); if (preempted) yield(); goto repeat; } task_rq_unlock(rq, &flags); } /*** * kick_process - kick a running thread to enter/exit the kernel * @p: the to-be-kicked thread * * Cause a process which is running on another CPU to enter * kernel-mode, without any delay. (to get signals handled.) */ void kick_process(task_t *p) { int cpu; preempt_disable(); cpu = task_cpu(p); if ((cpu != smp_processor_id()) && task_curr(p)) smp_send_reschedule(cpu); preempt_enable(); } EXPORT_SYMBOL_GPL(kick_process); /* * Return a low guess at the load of a migration-source cpu. * * We want to under-estimate the load of migration sources, to * balance conservatively. */ static inline unsigned long source_load(int cpu) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; return min(rq->cpu_load, load_now); } /* * Return a high guess at the load of a migration-target cpu */ static inline unsigned long target_load(int cpu) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; return max(rq->cpu_load, load_now); } #endif /* * wake_idle() is useful especially on SMT architectures to wake a * task onto an idle sibling if we would otherwise wake it onto a * busy sibling. * * Returns the CPU we should wake onto. */ #if defined(ARCH_HAS_SCHED_WAKE_IDLE) static int wake_idle(int cpu, task_t *p) { cpumask_t tmp; runqueue_t *rq = cpu_rq(cpu); struct sched_domain *sd; int i; if (idle_cpu(cpu)) return cpu; sd = rq->sd; if (!(sd->flags & SD_WAKE_IDLE)) return cpu; cpus_and(tmp, sd->span, cpu_online_map); for_each_cpu_mask(i, tmp) { if (!cpu_isset(i, p->cpus_allowed)) continue; if (idle_cpu(i)) return i; } return cpu; } #else static inline int wake_idle(int cpu, task_t *p) { return cpu; } #endif /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread * @state: the mask of task states that can be woken * @sync: do a synchronous wakeup? * * Put it on the run-queue if it's not already there. The "current" * thread is always on the run-queue (except when the actual * re-schedule is in progress), and as such you're allowed to do * the simpler "current->state = TASK_RUNNING" to mark yourself * runnable without the overhead of this. * * returns failure only if the task is already active. */ static int try_to_wake_up(task_t * p, unsigned int state, int sync) { int cpu, this_cpu, success = 0; unsigned long flags; long old_state; runqueue_t *rq; int prio; #ifdef CONFIG_SMP unsigned long load, this_load; struct sched_domain *sd; int new_cpu; #endif rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) goto out; if (task_queued(p)) goto out_running; cpu = task_cpu(p); this_cpu = smp_processor_id(); #ifdef CONFIG_SMP if (unlikely(task_running(rq, p))) goto out_activate; new_cpu = cpu; if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; load = source_load(cpu); this_load = target_load(this_cpu); /* * If sync wakeup then subtract the (maximum possible) effect of * the currently running task from the load of the current CPU: */ if (sync) this_load -= SCHED_LOAD_SCALE; /* Don't pull the task off an idle CPU to a busy one */ if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) goto out_set_cpu; new_cpu = this_cpu; /* Wake to this CPU if we can */ /* * Scan domains for affine wakeup and passive balancing * possibilities. */ for_each_domain(this_cpu, sd) { unsigned int imbalance; /* * Start passive balancing when half the imbalance_pct * limit is reached. */ imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; if ( ((sd->flags & SD_WAKE_AFFINE) && !task_hot(p, rq->timestamp_last_tick, sd)) || ((sd->flags & SD_WAKE_BALANCE) && imbalance*this_load <= 100*load) ) { /* * Now sd has SD_WAKE_AFFINE and p is cache cold in sd * or sd has SD_WAKE_BALANCE and there is an imbalance */ if (cpu_isset(cpu, sd->span)) goto out_set_cpu; } } new_cpu = cpu; /* Could not wake to this_cpu. Wake to cpu instead */ out_set_cpu: new_cpu = wake_idle(new_cpu, p); if (new_cpu != cpu && cpu_isset(new_cpu, p->cpus_allowed)) { set_task_cpu(p, new_cpu); task_rq_unlock(rq, &flags); /* might preempt at this point */ rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) goto out; if (task_queued(p)) goto out_running; this_cpu = smp_processor_id(); cpu = task_cpu(p); } out_activate: #endif /* CONFIG_SMP */ if (old_state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; /* * This is the end of one scheduling cycle and the start * of the next */ update_stats_for_cycle(p, rq); if (!rt_task(p) && (sched_mode != SCHED_MODE_STAIRCASE)) { recalc_throughput_bonus(p, rq->nr_running + 1); reassess_interactiveness(p); } /* * Sync wakeups (i.e. those types of wakeups where the waker * has indicated that it will leave the CPU in short order) * don't trigger a preemption, if the woken up task will run on * this cpu. (in this case the 'I will reschedule' promise of * the waker guarantees that the freshly woken up task is going * to be considered on this CPU.) */ prio = activate_task(p, rq, cpu == this_cpu); if (!sync || cpu != this_cpu) { if (preemption_warranted(prio, p, rq)) resched_task(rq->curr); } success = 1; out_running: p->state = TASK_RUNNING; out: task_rq_unlock(rq, &flags); return success; } int fastcall wake_up_process(task_t * p) { return try_to_wake_up(p, TASK_STOPPED | TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0); } EXPORT_SYMBOL(wake_up_process); int fastcall wake_up_state(task_t *p, unsigned int state) { return try_to_wake_up(p, state, 0); } /* * Initialize the scheduling statistics counters */ static inline void initialize_stats(task_t *p) { p->avg_sleep_per_cycle = 0; p->avg_delay_per_cycle = 0; p->avg_delay_per_sub_cycle = 0; p->avg_cpu_per_cycle = 0; p->avg_cpu_per_sub_cycle = 0; p->total_sleep = 0; p->total_delay = 0; p->total_cpu = 0; p->cycle_count = 0; p->sched_timestamp = 0 /* set this to current time later */; } /* * Initialize the scheduling bonuses */ static inline void initialize_bonuses(task_t *p) { p->interactive_bonus = (max_ia_bonus >= initial_ia_bonus) ? initial_ia_bonus : max_ia_bonus; p->throughput_bonus = 0; p->sub_cycle_count = 0; } /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ void fastcall sched_fork(task_t *p) { /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that * nobody will actually run it, and a signal or other external * event cannot wake it up and insert it on the runqueue either. */ p->state = TASK_RUNNING; INIT_LIST_HEAD(&p->run_list); spin_lock_init(&p->switch_lock); #ifdef CONFIG_PREEMPT /* * During context-switch we hold precisely one spinlock, which * schedule_tail drops. (in the common case it's this_rq()->lock, * but it also can be p->switch_lock.) So we compensate with a count * of 1. Also, we want to start with kernel preemption disabled. */ p->thread_info->preempt_count = 1; #endif /* * Give the child a new timeslice */ if (sched_mode != SCHED_MODE_STAIRCASE) p->time_slice = task_timeslice(p); /* * Initialize the scheduling statistics and bonus counters */ initialize_stats(p); initialize_bonuses(p); } /* * wake_up_forked_process - wake up a freshly forked process. * * This function will do some initial scheduler statistics housekeeping * that must be done for every newly created process. */ void fastcall wake_up_forked_process(task_t * p) { unsigned long flags; runqueue_t *rq = task_rq_lock(current, &flags); /* * Forked process gets no burst to prevent fork bombs. */ p->burst = 0; BUG_ON(p->state != TASK_RUNNING); set_task_cpu(p, smp_processor_id()); /* * Scheduling statistics compilation starts now */ p->sched_timestamp = rq->timestamp_last_tick; /* * Now that the idle task is back on the run queue we need extra care * to make sure that its one and only fork() doesn't end up in the idle * priority slot. Just testing for empty run list is no longer adequate. */ if (unlikely(!task_queued(current) || RUNQUEUE_IDLE(rq))) __activate_task(p, rq, effective_prio(p)); else { /* * Put the child on the same list(s) as (but ahead of) the parent */ list_add_tail(&p->run_list, ¤t->run_list); rq->nr_running++; } current->flags |= PF_FORKED; task_rq_unlock(rq, &flags); } /** * (Optionally) log scheduler statistics at exit. */ static int log_at_exit = 0; void fastcall sched_exit(task_t * p) { struct task_sched_stats stats; if (!log_at_exit) return; get_task_sched_stats(p, &stats); printk("SCHED_EXIT[%d] (%s) %llu %llu %llu %llu %lu %lu %lu %lu\n", p->pid, p->comm, stats.total_sleep, stats.total_cpu, stats.total_delay, stats.cycle_count, p->nvcsw, p->nivcsw, p->cnvcsw, p->cnivcsw); } /** * finish_task_switch - clean up after a task-switch * @prev: the thread we just switched away from. * * We enter this with the runqueue still locked, and finish_arch_switch() * will unlock it along with doing any other architecture-specific cleanup * actions. * * Note that we may have delayed dropping an mm in context_switch(). If * so, we finish that here outside of the runqueue lock. (Doing it * with the lock held can cause deadlocks; see schedule() for * details.) */ static void finish_task_switch(task_t *prev) { runqueue_t *rq = this_rq(); struct mm_struct *mm = rq->prev_mm; unsigned long prev_task_flags; rq->prev_mm = NULL; /* * A task struct has one reference for the use as "current". * If a task dies, then it sets TASK_ZOMBIE in tsk->state and calls * schedule one last time. The schedule call will never return, * and the scheduled task must drop that reference. * The test for TASK_ZOMBIE must occur while the runqueue locks are * still held, otherwise prev could be scheduled on another cpu, die * there before we look at prev->state, and then the reference would * be dropped twice. * Manfred Spraul */ prev_task_flags = prev->flags; finish_arch_switch(rq, prev); if (mm) mmdrop(mm); if (unlikely(prev_task_flags & PF_DEAD)) put_task_struct(prev); } /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. */ asmlinkage void schedule_tail(task_t *prev) { finish_task_switch(prev); if (current->set_child_tid) put_user(current->pid, current->set_child_tid); } /* * context_switch - switch to the new MM and the new * thread's register state. */ static inline task_t * context_switch(runqueue_t *rq, task_t *prev, task_t *next) { struct mm_struct *mm = next->mm; struct mm_struct *oldmm = prev->active_mm; if (unlikely(!mm)) { next->active_mm = oldmm; atomic_inc(&oldmm->mm_count); enter_lazy_tlb(oldmm, next); } else switch_mm(oldmm, mm, next); if (unlikely(!prev->mm)) { prev->active_mm = NULL; WARN_ON(rq->prev_mm); rq->prev_mm = oldmm; } /* Here we just switch the register state and the stack. */ switch_to(prev, next, prev); return prev; } /* * nr_running, nr_uninterruptible and nr_context_switches: * * externally visible scheduler statistics: current number of runnable * threads, current number of uninterruptible-sleeping threads, total * number of context switches performed since bootup. */ unsigned long nr_running(void) { unsigned long i, sum = 0; for_each_cpu(i) sum += cpu_rq(i)->nr_running; return sum; } unsigned long nr_uninterruptible(void) { unsigned long i, sum = 0; for_each_online_cpu(i) sum += cpu_rq(i)->nr_uninterruptible; return sum; } unsigned long long nr_context_switches(void) { unsigned long long i, sum = 0; for_each_online_cpu(i) sum += cpu_rq(i)->nr_switches; return sum; } unsigned long nr_iowait(void) { unsigned long i, sum = 0; for_each_online_cpu(i) sum += atomic_read(&cpu_rq(i)->nr_iowait); return sum; } /* * double_rq_lock - safely lock two runqueues * * Note this does not disable interrupts like task_rq_lock, * you need to do so manually before calling. */ static void double_rq_lock(runqueue_t *rq1, runqueue_t *rq2) { if (rq1 == rq2) spin_lock(&rq1->lock); else { if (rq1 < rq2) { spin_lock(&rq1->lock); spin_lock(&rq2->lock); } else { spin_lock(&rq2->lock); spin_lock(&rq1->lock); } } } /* * double_rq_unlock - safely unlock two runqueues * * Note this does not restore interrupts like task_rq_unlock, * you need to do so manually after calling. */ static void double_rq_unlock(runqueue_t *rq1, runqueue_t *rq2) { spin_unlock(&rq1->lock); if (rq1 != rq2) spin_unlock(&rq2->lock); } enum idle_type { IDLE, NOT_IDLE, NEWLY_IDLE, }; #ifdef CONFIG_SMP /* * find_idlest_cpu - find the least busy runqueue. */ static int find_idlest_cpu(const struct task_struct *p, int this_cpu, struct sched_domain *sd) { unsigned long load, min_load, this_load; int i, min_cpu; cpumask_t mask; min_cpu = UINT_MAX; min_load = ULONG_MAX; cpus_and(mask, sd->span, cpu_online_map); cpus_and(mask, mask, p->cpus_allowed); for_each_cpu_mask(i, mask) { load = target_load(i); if (load < min_load) { min_cpu = i; min_load = load; /* break out early on an idle CPU: */ if (!min_load) break; } } /* add +1 to account for the new task */ this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; /* * Would with the addition of the new task to the * current CPU there be an imbalance between this * CPU and the idlest CPU? * * Use half of the balancing threshold - new-context is * a good opportunity to balance. */ if (min_load*(100 + (sd->imbalance_pct-100)/2) < this_load*100) return min_cpu; return this_cpu; } /* * wake_up_forked_thread - wake up a freshly forked thread. * * This function will do some initial scheduler statistics housekeeping * that must be done for every newly created context, and it also does * runqueue balancing. */ void fastcall wake_up_forked_thread(task_t * p) { unsigned long flags; int this_cpu = get_cpu(), cpu; struct sched_domain *tmp, *sd = NULL; runqueue_t *this_rq = cpu_rq(this_cpu), *rq; /* * Find the largest domain that this CPU is part of that * is willing to balance on clone: */ for_each_domain(this_cpu, tmp) if (tmp->flags & SD_BALANCE_CLONE) sd = tmp; if (sd) cpu = find_idlest_cpu(p, this_cpu, sd); else cpu = this_cpu; local_irq_save(flags); lock_again: rq = cpu_rq(cpu); double_rq_lock(this_rq, rq); BUG_ON(p->state != TASK_RUNNING); /* * We did find_idlest_cpu() unlocked, so in theory * the mask could have changed - just dont migrate * in this case: */ if (unlikely(!cpu_isset(cpu, p->cpus_allowed))) { cpu = this_cpu; double_rq_unlock(this_rq, rq); goto lock_again; } set_task_cpu(p, cpu); /* * Scheduling statistics compilation starts now */ p->sched_timestamp = rq->timestamp_last_tick; if (cpu == this_cpu) { /* * Now that the idle task is back on the run queue we need * extra care to make sure that its one and only fork() doesn't * end up in the idle priority slot. Just testing for empty * run list is no longer adequate. */ if (unlikely(!task_queued(current) || RUNQUEUE_IDLE(rq))) __activate_task(p, rq, effective_prio(p)); else { list_add_tail(&p->run_list, ¤t->run_list); rq->nr_running++; } } else { int prio = effective_prio(p); /* Not the local CPU - must adjust timestamp */ p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) + rq->timestamp_last_tick; __activate_task(p, rq, prio); if (preemption_warranted(prio, p, rq)) resched_task(rq->curr); } double_rq_unlock(this_rq, rq); local_irq_restore(flags); put_cpu(); } /* * If dest_cpu is allowed for this process, migrate the task to it. * This is accomplished by forcing the cpu_allowed mask to only * allow dest_cpu, which will force the cpu onto dest_cpu. Then * the cpu_allowed mask is restored. */ static void sched_migrate_task(task_t *p, int dest_cpu) { migration_req_t req; runqueue_t *rq; unsigned long flags; rq = task_rq_lock(p, &flags); if (!cpu_isset(dest_cpu, p->cpus_allowed) || unlikely(cpu_is_offline(dest_cpu))) goto out; /* force the process onto the specified CPU */ if (migrate_task(p, dest_cpu, &req)) { /* Need to wait for migration thread (might exit: take ref). */ struct task_struct *mt = rq->migration_thread; get_task_struct(mt); task_rq_unlock(rq, &flags); wake_up_process(mt); put_task_struct(mt); wait_for_completion(&req.done); return; } out: task_rq_unlock(rq, &flags); } /* * sched_balance_exec(): find the highest-level, exec-balance-capable * domain and try to migrate the task to the least loaded CPU. * * execve() is a valuable balancing opportunity, because at this point * the task has the smallest effective memory and cache footprint. */ void sched_balance_exec(void) { struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); /* Prefer the current CPU if there's only this task running */ if (this_rq()->nr_running <= 1) goto out; for_each_domain(this_cpu, tmp) if (tmp->flags & SD_BALANCE_EXEC) sd = tmp; if (sd) { new_cpu = find_idlest_cpu(current, this_cpu, sd); if (new_cpu != this_cpu) { put_cpu(); sched_migrate_task(current, new_cpu); return; } } out: put_cpu(); } /* * double_lock_balance - lock the busiest runqueue, this_rq is locked already. */ static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest) { if (unlikely(!spin_trylock(&busiest->lock))) { if (busiest < this_rq) { spin_unlock(&this_rq->lock); spin_lock(&busiest->lock); spin_lock(&this_rq->lock); } else spin_lock(&busiest->lock); } } /* * pull_task - move a task from a remote runqueue to the local runqueue. * Both runqueues must be locked. */ static inline void pull_task(runqueue_t *src_rq, task_t *p, runqueue_t *this_rq, int this_cpu, int prio) { unsigned long long delta; dequeue_task(p); src_rq->nr_running--; delta = (src_rq->timestamp_last_tick - p->sched_timestamp); p->avg_delay_per_cycle += delta; p->avg_delay_per_sub_cycle += delta; p->total_delay += delta; set_task_cpu(p, this_cpu); this_rq->nr_running++; p->sched_timestamp = this_rq->timestamp_last_tick; enqueue_task(p, this_rq, prio); p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) + this_rq->timestamp_last_tick; /* * Note that idle threads have a prio of IDLE_PRIO, for this test * to be always true for them. */ if (preemption_warranted(prio, p, this_rq)) resched_task(this_rq->curr); } /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ static inline int can_migrate_task(const task_t *p, runqueue_t *rq, int this_cpu, struct sched_domain *sd, enum idle_type idle) { /* * We do not migrate tasks that are: * 1) running (obviously), or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ if (task_running(rq, p)) return 0; if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; /* Aggressive migration if we've failed balancing */ if (idle == NEWLY_IDLE || sd->nr_balance_failed < sd->cache_nice_tries) { if (task_hot(p, rq->timestamp_last_tick, sd)) return 0; } return 1; } /* * move_tasks tries to move up to max_nr_move tasks from busiest to this_rq, * as part of a balancing operation within "domain". Returns the number of * tasks moved. * * Called with both runqueues locked. */ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, unsigned long max_nr_move, struct sched_domain *sd, enum idle_type idle) { struct list_head *head, *curr; int idx, pulled = 0; task_t *tmp; if (max_nr_move <= 0 || busiest->nr_running <= 1) goto out; /* Start searching at priority 0: */ idx = 0; skip_bitmap: if (!idx) idx = sched_find_first_bit(busiest->bitmap); else idx = find_next_bit(busiest->bitmap, IDLE_PRIO, idx); if (idx >= IDLE_PRIO) goto out; head = &busiest->queues[idx].queue; curr = head->prev; skip_queue: tmp = list_entry(curr, task_t, run_list); curr = curr->prev; if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } pull_task(busiest, tmp, this_rq, this_cpu, idx); pulled++; /* We only want to steal up to the prescribed number of tasks. */ if (pulled < max_nr_move) { if (curr != head) goto skip_queue; idx++; goto skip_bitmap; } out: return pulled; } /* * find_busiest_group finds and returns the busiest CPU group within the * domain. It calculates and returns the number of tasks which should be * moved to restore balance via the imbalance parameter. */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum idle_type idle) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; max_load = this_load = total_load = total_pwr = 0; do { cpumask_t tmp; unsigned long load; int local_group; int i, nr_cpus = 0; local_group = cpu_isset(this_cpu, group->cpumask); /* Tally up the load of all CPUs in the group */ avg_load = 0; cpus_and(tmp, group->cpumask, cpu_online_map); if (unlikely(cpus_empty(tmp))) goto nextgroup; for_each_cpu_mask(i, tmp) { /* Bias balancing toward cpus of our domain */ if (local_group) load = target_load(i); else load = source_load(i); nr_cpus++; avg_load += load; } if (!nr_cpus) goto nextgroup; total_load += avg_load; total_pwr += group->cpu_power; /* Adjust by relative CPU power of the group */ avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; if (local_group) { this_load = avg_load; this = group; goto nextgroup; } else if (avg_load > max_load) { max_load = avg_load; busiest = group; } nextgroup: group = group->next; } while (group != sd->groups); if (!busiest || this_load >= max_load) goto out_balanced; avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr; if (this_load >= avg_load || 100*max_load <= sd->imbalance_pct*this_load) goto out_balanced; /* * We're trying to get all the cpus to the average_load, so we don't * want to push ourselves above the average load, nor do we wish to * reduce the max loaded cpu below the average load, as either of these * actions would just result in more rebalancing later, and ping-pong * tasks around. Thus we look for the minimum possible imbalance. * Negative imbalances (*we* are more loaded than anyone else) will * be counted as no imbalance for these purposes -- we can't fix that * by pulling tasks to us. Be careful of negative numbers as they'll * appear as very large values with unsigned longs. */ *imbalance = min(max_load - avg_load, avg_load - this_load); /* How much load to actually move to equalise the imbalance */ *imbalance = (*imbalance * min(busiest->cpu_power, this->cpu_power)) / SCHED_LOAD_SCALE; if (*imbalance < SCHED_LOAD_SCALE - 1) { unsigned long pwr_now = 0, pwr_move = 0; unsigned long tmp; if (max_load - this_load >= SCHED_LOAD_SCALE*2) { *imbalance = 1; return busiest; } /* * OK, we don't have enough imbalance to justify moving tasks, * however we may be able to increase total CPU power used by * moving them. */ pwr_now += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load); pwr_now += this->cpu_power*min(SCHED_LOAD_SCALE, this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/busiest->cpu_power; if (max_load > tmp) pwr_move += busiest->cpu_power*min(SCHED_LOAD_SCALE, max_load - tmp); /* Amount of load we'd add */ tmp = SCHED_LOAD_SCALE*SCHED_LOAD_SCALE/this->cpu_power; if (max_load < tmp) tmp = max_load; pwr_move += this->cpu_power*min(SCHED_LOAD_SCALE, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain another 8th of a CPU worth of throughput */ if (pwr_move < pwr_now + SCHED_LOAD_SCALE / 8) goto out_balanced; *imbalance = 1; return busiest; } /* Get rid of the scaling factor, rounding down as we divide */ *imbalance = (*imbalance + 1) / SCHED_LOAD_SCALE; return busiest; out_balanced: if (busiest && (idle == NEWLY_IDLE || (idle == IDLE && max_load > SCHED_LOAD_SCALE)) ) { *imbalance = 1; return busiest; } *imbalance = 0; return NULL; } /* * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static runqueue_t *find_busiest_queue(const struct sched_group *group) { cpumask_t tmp; unsigned long load, max_load = 0; runqueue_t *busiest = NULL; int i; cpus_and(tmp, group->cpumask, cpu_online_map); for_each_cpu_mask(i, tmp) { load = source_load(i); if (load > max_load) { max_load = load; busiest = cpu_rq(i); } } return busiest; } /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * * Called with this_rq unlocked. */ static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd, enum idle_type idle) { struct sched_group *group; runqueue_t *busiest; unsigned long imbalance; int nr_moved; spin_lock(&this_rq->lock); group = find_busiest_group(sd, this_cpu, &imbalance, idle); if (!group) goto out_balanced; busiest = find_busiest_queue(group); if (!busiest) goto out_balanced; /* * This should be "impossible", but since load * balancing is inherently racy and statistical, * it could happen in theory. */ if (unlikely(busiest == this_rq)) { WARN_ON(1); goto out_balanced; } nr_moved = 0; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is * still unbalanced. nr_moved simply stays zero, so it is * correctly treated as an imbalance. */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle); spin_unlock(&busiest->lock); } spin_unlock(&this_rq->lock); if (!nr_moved) { sd->nr_balance_failed++; if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { int wake = 0; spin_lock(&busiest->lock); if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; wake = 1; } spin_unlock(&busiest->lock); if (wake) wake_up_process(busiest->migration_thread); /* * We've kicked active balancing, reset the failure * counter. */ sd->nr_balance_failed = sd->cache_nice_tries; } } else sd->nr_balance_failed = 0; /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; return nr_moved; out_balanced: spin_unlock(&this_rq->lock); /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; return 0; } /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). * this_rq is locked. */ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, struct sched_domain *sd) { struct sched_group *group; runqueue_t *busiest = NULL; unsigned long imbalance; int nr_moved = 0; group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) goto out; busiest = find_busiest_queue(group); if (!busiest || busiest == this_rq) goto out; /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, NEWLY_IDLE); spin_unlock(&busiest->lock); out: return nr_moved; } /* * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) { struct sched_domain *sd; for_each_domain(this_cpu, sd) { if (sd->flags & SD_BALANCE_NEWIDLE) { if (load_balance_newidle(this_cpu, this_rq, sd)) { /* We've pulled tasks over so stop searching */ break; } } } } /* * active_load_balance is run by migration threads. It pushes a running * task off the cpu. It can be required to correctly have at least 1 task * running on each physical CPU where possible, and not have a physical / * logical imbalance. * * Called with busiest locked. */ static void active_load_balance(runqueue_t *busiest, int busiest_cpu) { struct sched_domain *sd; struct sched_group *group, *busy_group; int i; if (busiest->nr_running <= 1) return; for_each_domain(busiest_cpu, sd) if (cpu_isset(busiest->push_cpu, sd->span)) break; if (!sd) { WARN_ON(1); return; } group = sd->groups; while (!cpu_isset(busiest_cpu, group->cpumask)) group = group->next; busy_group = group; group = sd->groups; do { cpumask_t tmp; runqueue_t *rq; int push_cpu = 0; if (group == busy_group) goto next_group; cpus_and(tmp, group->cpumask, cpu_online_map); if (!cpus_weight(tmp)) goto next_group; for_each_cpu_mask(i, tmp) { if (!idle_cpu(i)) goto next_group; push_cpu = i; } rq = cpu_rq(push_cpu); /* * This condition is "impossible", but since load * balancing is inherently a bit racy and statistical, * it can trigger.. Reported by Bjorn Helgaas on a * 128-cpu setup. */ if (unlikely(busiest == rq)) goto next_group; double_lock_balance(busiest, rq); move_tasks(rq, push_cpu, busiest, 1, sd, IDLE); spin_unlock(&rq->lock); next_group: group = group->next; } while (group != sd->groups); } /* * rebalance_tick will get called every timer tick, on every CPU. * unless the current task is SCHED_FIFO * * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in arch_init_sched_domains. */ /* Don't have all balancing operations going off at once */ #define CPU_OFFSET(cpu) (HZ * cpu / NR_CPUS) static void rebalance_tick(int this_cpu, runqueue_t *this_rq, enum idle_type idle) { unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; /* Update our load */ old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; /* * Round up the averaging division if load is increasing. This * prevents us from getting stuck on 9 if the load is 10, for * example. */ if (this_load > old_load) old_load++; this_rq->cpu_load = (old_load + this_load) / 2; for_each_domain(this_cpu, sd) { unsigned long interval = sd->balance_interval; if (idle != IDLE) interval *= sd->busy_factor; /* scale ms to jiffies */ interval = msecs_to_jiffies(interval); if (unlikely(!interval)) interval = 1; if (j - sd->last_balance >= interval) { if (load_balance(this_cpu, this_rq, sd, idle)) { /* We've pulled tasks over so no longer idle */ idle = NOT_IDLE; } sd->last_balance += interval; } } } static inline int needs_idle_balance(const runqueue_t *rq) { return rq->nr_running == 0; } #else /* * on UP we do not need to balance between CPUs: */ static inline void rebalance_tick(int cpu, runqueue_t *rq, enum idle_type idle) { } static inline void idle_balance(int cpu, runqueue_t *rq) { } static inline int needs_idle_balance(const runqueue_t *rq) { return 0; } #endif static inline int wake_priority_sleeper(runqueue_t *rq) { #ifdef CONFIG_SCHED_SMT /* * If an SMT sibling task has been put to sleep for priority * reasons reschedule the idle task to see if it can now run. */ if (rq->nr_running) { resched_task(rq->idle); return 1; } #endif return 0; } /* * Are promotions due? */ static inline int promotions_due(const runqueue_t *rq) { return time_after_eq(jiffies, rq->next_prom_due); } /* * Assume runqueue lock is NOT already held. * This is not executed when current task is SCHED_FIFO */ static void do_promotions(runqueue_t *rq) { int idx = MAX_RT_PRIO; spin_lock(&rq->lock); for (;;) { int new_prio; idx = find_next_bit(rq->bitmap, IDLE_PRIO, idx + 1); if (idx > (IDLE_PRIO - 1)) break; new_prio = idx - 1; __list_splice(&rq->queues[idx].queue, rq->queues[new_prio].queue.prev); INIT_LIST_HEAD(&rq->queues[idx].queue); __clear_bit(idx, rq->bitmap); __set_bit(new_prio, rq->bitmap); /* * If promotion occurs from the slot * associated with rq->current_prio_slot then the * current task will be one of those promoted * so we should update rq->current_prio_slot * This will only be true for at most one slot. */ if (unlikely(idx == rq->current_prio_slot->prio)) rq->current_prio_slot = rq->queues + new_prio; } rq->next_prom_due = (jiffies + get_prom_interval(rq)); spin_unlock(&rq->lock); } DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* * This function gets called by the timer code, with HZ frequency. * We call it with interrupts disabled. * * It also gets called by the fork code, when changing the parent's * timeslices. */ void scheduler_tick(int user_ticks, int sys_ticks) { int cpu = smp_processor_id(); struct cpu_usage_stat *cpustat = &kstat_this_cpu.cpustat; runqueue_t *rq = this_rq(); task_t *p = current; rq->timestamp_last_tick = sched_clock(); if (rcu_pending(cpu)) rcu_check_callbacks(cpu, user_ticks); /* note: this timer irq context must be accounted for as well */ if (hardirq_count() - HARDIRQ_OFFSET) { cpustat->irq += sys_ticks; sys_ticks = 0; } else if (softirq_count()) { cpustat->softirq += sys_ticks; sys_ticks = 0; } if (p == rq->idle) { if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED) { spin_lock(&rq->lock); if (!--rq->eb_ticks_to_decay) decay_eb_yardstick(rq); spin_unlock(&rq->lock); } if (atomic_read(&rq->nr_iowait) > 0) cpustat->iowait += sys_ticks; else cpustat->idle += sys_ticks; if (wake_priority_sleeper(rq)) goto out; rebalance_tick(cpu, rq, IDLE); return; } if (TASK_NICE(p) > 0) cpustat->nice += user_ticks; else cpustat->user += user_ticks; cpustat->system += sys_ticks; /* * SCHED_FIFO tasks never run out of timeslice. * and should not be burdened with the overhead of promotion or * a tick rebalance */ if (unlikely(p->policy == SCHED_FIFO)) return; spin_lock(&rq->lock); rq->cache_ticks++; if (sched_mode == SCHED_MODE_STAIRCASE) goto sched_staircase; if ((sched_mode == SCHED_MODE_ENTITLEMENT_BASED) && (!--rq->eb_ticks_to_decay)) decay_eb_yardstick(rq); /* * The task was running during this tick - update the * time slice counter. Note: we do not update a thread's * priority until it either goes to sleep or uses up its * timeslice. */ if (unlikely(p->policy == SCHED_RR)) { /* * RR tasks need a special form of timeslice management. */ if (!--p->time_slice) { p->time_slice = task_timeslice(p); set_tsk_need_resched(p); /* put it at the end of the queue with a minimum of fuss */ list_del_init(&p->run_list); list_add_tail(&p->run_list, &rq->current_prio_slot->queue); } goto out_unlock; } if (!--p->time_slice) { unsigned long long delta; dequeue_task(p); set_tsk_need_resched(p); p->time_slice = task_timeslice(p); apply_sched_avg_decay(&p->avg_delay_per_sub_cycle); apply_sched_avg_decay(&p->avg_cpu_per_sub_cycle); delta = (rq->timestamp_last_tick - p->sched_timestamp); p->sub_cycle_count++; p->avg_cpu_per_cycle += delta; p->avg_cpu_per_sub_cycle += delta; p->total_cpu += delta; p->sched_timestamp = rq->timestamp_last_tick; calculate_rates(p); recalc_throughput_bonus(p, rq->nr_running); reassess_cpu_boundness(p); /* * Arguably the interactive bonus should be updated here * as well. But it depends on whether we wish to encourage * interactive tasks to maintain a high bonus or CPU bound * tasks to lose some of there bonus? */ if (sched_mode == SCHED_MODE_ENTITLEMENT_BASED) calculate_eb_priority(p, rq); rq->current_prio_slot = rq->queues + effective_prio(p); enqueue_task(p, rq, rq->current_prio_slot->prio); goto out_unlock; } if (task_should_be_yardstick(p, rq)) set_eb_yardstick(rq, p); goto check_preempt; sched_staircase: if (unlikely(p->policy == SCHED_RR)) { /* * RR tasks need a special form of timeslice management. */ if (!--p->time_slice) { p->time_slice = RR_INTERVAL(); set_tsk_need_resched(p); /* put it at the end of the queue with a minimum of fuss */ list_del_init(&p->run_list); list_add_tail(&p->run_list, &rq->current_prio_slot->queue); } goto out_unlock; } /* * Tasks lose burst each time they use up a full slice(). */ if (!--p->slice) { set_tsk_need_resched(p); dequeue_task(p); dec_burst(p); p->slice = slice(p); rq->current_prio_slot = rq->queues + effective_prio(p); p->time_slice = RR_INTERVAL(); enqueue_task(p, rq, rq->current_prio_slot->prio); goto out_unlock; } /* * Tasks that run out of time_slice but still have slice left get * requeued with a lower priority && RR_INTERVAL time_slice. */ if (!--p->time_slice) { dequeue_task(p); set_tsk_need_resched(p); p->time_slice = RR_INTERVAL(); rq->current_prio_slot = rq->queues + effective_prio(p); enqueue_task(p, rq, rq->current_prio_slot->prio); goto out_unlock; } check_preempt: if (rq->preempted && rq->cache_ticks >= cache_decay_ticks) set_tsk_need_resched(p); out_unlock: spin_unlock(&rq->lock); out: rebalance_tick(cpu, rq, NOT_IDLE); if (unlikely(promotions_due(rq))) { /* * If there's less than 2 SCHED_OTHER tasks defer the next promotion */ if ((rt_task(p) ? rq->nr_running - 1 : rq->nr_running) < 2) rq->next_prom_due = (jiffies + get_prom_interval(rq)); else do_promotions(rq); } } #ifdef CONFIG_SCHED_SMT static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) { int i; struct sched_domain *sd = rq->sd; cpumask_t sibling_map; if (!(sd->flags & SD_SHARE_CPUPOWER)) return; cpus_and(sibling_map, sd->span, cpu_online_map); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq; if (i == cpu) continue; smt_rq = cpu_rq(i); /* * If an SMT sibling task is sleeping due to priority * reasons wake it up now. */ if (smt_rq->curr == smt_rq->idle && smt_rq->nr_running) resched_task(smt_rq->idle); } } static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) { struct sched_domain *sd = rq->sd; cpumask_t sibling_map; int ret = 0, i; if (!(sd->flags & SD_SHARE_CPUPOWER)) return 0; cpus_and(sibling_map, sd->span, cpu_online_map); for_each_cpu_mask(i, sibling_map) { runqueue_t *smt_rq; task_t *smt_curr; if (i == cpu) continue; smt_rq = cpu_rq(i); smt_curr = smt_rq->curr; /* * If a user task with lower static priority than the * running task on the SMT sibling is trying to schedule, * delay it till there is proportionately less timeslice * left of the sibling task to prevent a lower priority * task from using an unfair proportion of the * physical cpu's resources. -ck */ if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(p) || rt_task(smt_curr)) && p->mm && smt_curr->mm && !rt_task(p)) ret = 1; /* * Reschedule a lower priority task on the SMT sibling, * or wake it up if it has been put to sleep for priority * reasons. */ if ((((p->time_slice * (100 - sd->per_cpu_gain) / 100) > task_timeslice(smt_curr) || rt_task(p)) && smt_curr->mm && p->mm && !rt_task(smt_curr)) || (smt_curr == smt_rq->idle && smt_rq->nr_running)) resched_task(smt_curr); } return ret; } static inline int dependent_idle(const runqueue_t *rq, const task_t *p) { return p == rq->idle; } #else static inline void wake_sleeping_dependent(int cpu, runqueue_t *rq) { } static inline int dependent_sleeper(int cpu, runqueue_t *rq, task_t *p) { return 0; } static inline int dependent_idle(const runqueue_t *rq, const task_t *p) { return 0; } #endif /* * schedule() is the main scheduler function. */ asmlinkage void __sched schedule(void) { long *switch_count; task_t *prev, *next; runqueue_t *rq; int cpu; unsigned long long delta; /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ if (likely(!(current->state & (TASK_DEAD | TASK_ZOMBIE)))) { if (unlikely(in_atomic())) { printk(KERN_ERR "bad: scheduling while atomic!\n"); dump_stack(); } } need_resched: preempt_disable(); prev = current; rq = this_rq(); release_kernel_lock(prev); spin_lock_irq(&rq->lock); rq->timestamp_last_tick = sched_clock(); prev->runtime = rq->timestamp_last_tick - prev->timestamp; /* * if entering off of a kernel preemption go straight * to picking the next task. */ switch_count = &prev->nivcsw; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; if (unlikely((prev->state & TASK_INTERRUPTIBLE) && unlikely(signal_pending(prev)))) prev->state = TASK_RUNNING; else deactivate_task(prev, rq); } cpu = smp_processor_id(); if (unlikely(needs_idle_balance(rq))) idle_balance(cpu, rq); rq->current_prio_slot = rq->queues + sched_find_first_bit(rq->bitmap); next = list_entry(rq->current_prio_slot->queue.next, task_t, run_list); if (dependent_idle(rq, next)) { wake_sleeping_dependent(cpu, rq); goto switch_tasks; } if (dependent_sleeper(cpu, rq, next)) { rq->current_prio_slot = rq->queues + IDLE_PRIO; next = rq->idle; } switch_tasks: prefetch(next); clear_tsk_need_resched(prev); RCU_qsctr(task_cpu(prev))++; /* * Update estimate of average CPU time used per cycle */ delta = (rq->timestamp_last_tick - prev->sched_timestamp); prev->avg_cpu_per_cycle += delta; prev->avg_cpu_per_sub_cycle += delta; prev->total_cpu += delta; prev->timestamp = prev->sched_timestamp = rq->timestamp_last_tick; if (unlikely(next->flags & PF_YIELDED)) { next->flags &= ~PF_YIELDED; dequeue_task(next); rq->current_prio_slot = rq->queues + effective_prio(next); enqueue_task_head(next, rq, rq->current_prio_slot->prio); } if (likely(prev != next)) { rq->preempted = 0; rq->cache_ticks = 0; /* * Update estimate of average delay on run queue per cycle */ delta = (rq->timestamp_last_tick - next->sched_timestamp); next->avg_delay_per_cycle += delta; next->avg_delay_per_sub_cycle += delta; next->total_delay += delta; next->timestamp = next->sched_timestamp = rq->timestamp_last_tick; rq->total_delay += delta; rq->nr_switches++; rq->curr = next; ++*switch_count; prepare_arch_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); finish_task_switch(prev); } else spin_unlock_irq(&rq->lock); reacquire_kernel_lock(current); preempt_enable_no_resched(); if (test_thread_flag(TIF_NEED_RESCHED)) goto need_resched; } EXPORT_SYMBOL(schedule); #ifdef CONFIG_PREEMPT /* * this is is the entry point to schedule() from in-kernel preemption * off of preempt_enable. Kernel preemptions off return from interrupt * occur there and call schedule directly. */ asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); /* * If there is a non-zero preempt_count or interrupts are disabled, * we do not want to preempt the current task. Just return.. */ if (unlikely(ti->preempt_count || irqs_disabled())) return; need_resched: ti->preempt_count = PREEMPT_ACTIVE; schedule(); ti->preempt_count = 0; /* we could miss a preemption opportunity between schedule and now */ barrier(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; } EXPORT_SYMBOL(preempt_schedule); #endif /* CONFIG_PREEMPT */ int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { task_t *p = curr->task; return try_to_wake_up(p, mode, sync); } EXPORT_SYMBOL(default_wake_function); /* * The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just * wake everything up. If it's an exclusive wakeup (nr_exclusive == small +ve * number) then we wake all the non-exclusive tasks and one exclusive task. * * There are circumstances in which we can try to wake a task which has already * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { struct list_head *tmp, *next; list_for_each_safe(tmp, next, &q->task_list) { wait_queue_t *curr; unsigned flags; curr = list_entry(tmp, wait_queue_t, task_list); flags = curr->flags; if (curr->func(curr, mode, sync, key) && (flags & WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) break; } } /** * __wake_up - wake up threads blocked on a waitqueue. * @q: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up */ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) { unsigned long flags; spin_lock_irqsave(&q->lock, flags); __wake_up_common(q, mode, nr_exclusive, 0, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(__wake_up); /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) { __wake_up_common(q, mode, 1, 0, NULL); } /** * __wake_up - sync- wake up threads blocked on a waitqueue. * @q: the waitqueue * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * * The sync wakeup differs that the waker knows that it will schedule * away soon, so while the target thread will be woken up, it will not * be migrated to another CPU - ie. the two threads are 'synchronized' * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. */ void fastcall __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive) { unsigned long flags; int sync = 1; if (unlikely(!q)) return; if (unlikely(!nr_exclusive)) sync = 0; spin_lock_irqsave(&q->lock, flags); __wake_up_common(q, mode, nr_exclusive, sync, NULL); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ void fastcall complete(struct completion *x) { unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); x->done++; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete); void fastcall complete_all(struct completion *x) { unsigned long flags; spin_lock_irqsave(&x->wait.lock, flags); x->done += UINT_MAX/2; __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0, 0, NULL); spin_unlock_irqrestore(&x->wait.lock, flags); } EXPORT_SYMBOL(complete_all); void fastcall __sched wait_for_completion(struct completion *x) { might_sleep(); spin_lock_irq(&x->wait.lock); if (!x->done) { DECLARE_WAITQUEUE(wait, current); wait.flags |= WQ_FLAG_EXCLUSIVE; __add_wait_queue_tail(&x->wait, &wait); do { __set_current_state(TASK_UNINTERRUPTIBLE); spin_unlock_irq(&x->wait.lock); schedule(); spin_lock_irq(&x->wait.lock); } while (!x->done); __remove_wait_queue(&x->wait, &wait); } x->done--; spin_unlock_irq(&x->wait.lock); } EXPORT_SYMBOL(wait_for_completion); #define SLEEP_ON_VAR \ unsigned long flags; \ wait_queue_t wait; \ init_waitqueue_entry(&wait, current); #define SLEEP_ON_HEAD \ spin_lock_irqsave(&q->lock,flags); \ __add_wait_queue(q, &wait); \ spin_unlock(&q->lock); #define SLEEP_ON_TAIL \ spin_lock_irq(&q->lock); \ __remove_wait_queue(q, &wait); \ spin_unlock_irqrestore(&q->lock, flags); void fastcall __sched interruptible_sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR current->state = TASK_INTERRUPTIBLE; SLEEP_ON_HEAD schedule(); SLEEP_ON_TAIL } EXPORT_SYMBOL(interruptible_sleep_on); long fastcall __sched interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR current->state = TASK_INTERRUPTIBLE; SLEEP_ON_HEAD timeout = schedule_timeout(timeout); SLEEP_ON_TAIL return timeout; } EXPORT_SYMBOL(interruptible_sleep_on_timeout); void fastcall __sched sleep_on(wait_queue_head_t *q) { SLEEP_ON_VAR current->state = TASK_UNINTERRUPTIBLE; SLEEP_ON_HEAD schedule(); SLEEP_ON_TAIL } EXPORT_SYMBOL(sleep_on); long fastcall __sched sleep_on_timeout(wait_queue_head_t *q, long timeout) { SLEEP_ON_VAR current->state = TASK_UNINTERRUPTIBLE; SLEEP_ON_HEAD timeout = schedule_timeout(timeout); SLEEP_ON_TAIL return timeout; } EXPORT_SYMBOL(sleep_on_timeout); void set_user_nice(task_t *p, long nice) { unsigned long flags; runqueue_t *rq; int queued, delta; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; /* * We have to be careful, if called from sys_setpriority(), * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &flags); /* * The RT priorities are set via setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected * it wont have any effect on scheduling until the task is * not SCHED_NORMAL: */ if ((queued = (!rt_task(p) && task_queued(p)))) dequeue_task(p); delta = PRIO_TO_NICE(p->static_prio) - nice; p->static_prio = NICE_TO_PRIO(nice); p->eb_shares = nice_to_shares(nice); if (queued) { int new_prio = effective_prio(p); enqueue_task(p, rq, new_prio); if (task_running(rq, p)) rq->current_prio_slot = rq->queues + new_prio; /* * If the task increased its setting or is running and lowered * its setting, then reschedule its CPU: */ if ((delta > 0) || ((delta < 0) && task_running(rq, p))) resched_task(rq->curr); } task_rq_unlock(rq, &flags); } EXPORT_SYMBOL(set_user_nice); #ifdef __ARCH_WANT_SYS_NICE /* * sys_nice - change the priority of the current process. * @increment: priority increment * * sys_setpriority is a more generic, but much slower function that * does similar things. */ asmlinkage long sys_nice(int increment) { int retval; long nice; /* * Setpriority might change our priority at the same moment. * We don't have to worry. Conceptually one call occurs first * and we have a single winner. */ if (increment < 0) { if (!capable(CAP_SYS_NICE)) return -EPERM; if (increment < -40) increment = -40; } if (increment > 40) increment = 40; nice = PRIO_TO_NICE(current->static_prio) + increment; if (nice < -20) nice = -20; if (nice > 19) nice = 19; retval = security_task_setnice(current, nice); if (retval) return retval; set_user_nice(current, nice); return 0; } #endif /** * task_prio - return the priority value of a given task. * @p: the task in question. * * This is the priority value as seen by users in /proc. * RT tasks are offset by -200. Normal tasks are centered * around 0, value goes from -16 to +15. */ int task_prio(task_t *p) { return effective_prio(p) - MAX_RT_PRIO; } /** * task_nice - return the nice value of a given task. * @p: the task in question. */ int task_nice(task_t *p) { return TASK_NICE(p); } EXPORT_SYMBOL(task_nice); /** * idle_cpu - is a given cpu idle currently? * @cpu: the processor in question. */ int idle_cpu(int cpu) { return cpu_curr(cpu) == cpu_rq(cpu)->idle; } EXPORT_SYMBOL_GPL(idle_cpu); /** * find_process_by_pid - find a process with a matching PID value. * @pid: the pid in question. */ static inline task_t *find_process_by_pid(pid_t pid) { return pid ? find_task_by_pid(pid) : current; } /* Actually do priority change: must hold rq lock. */ static void __setscheduler(struct task_struct *p, int policy, int prio) { BUG_ON(task_queued(p)); p->policy = policy; p->rt_priority = prio; } /* * setscheduler - change the scheduling policy and/or RT priority of a thread. */ static int setscheduler(pid_t pid, int policy, struct sched_param __user *param) { struct sched_param lp; int retval = -EINVAL; int queued; unsigned long flags; runqueue_t *rq; task_t *p; if (!param || pid < 0) goto out_nounlock; retval = -EFAULT; if (copy_from_user(&lp, param, sizeof(struct sched_param))) goto out_nounlock; /* * We play safe to avoid deadlocks. */ read_lock_irq(&tasklist_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) goto out_unlock_tasklist; /* * To be able to change p->policy safely, the apropriate * runqueue lock must be held. */ rq = task_rq_lock(p, &flags); if (policy < 0) policy = p->policy; else { retval = -EINVAL; if (policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL) goto out_unlock; } /* * Valid priorities for SCHED_FIFO and SCHED_RR are * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. */ retval = -EINVAL; if (lp.sched_priority < 0 || lp.sched_priority > MAX_USER_RT_PRIO-1) goto out_unlock; if ((policy == SCHED_NORMAL) != (lp.sched_priority == 0)) goto out_unlock; retval = -EPERM; if ((policy == SCHED_FIFO || policy == SCHED_RR) && !capable(CAP_SYS_NICE)) goto out_unlock; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = security_task_setscheduler(p, policy, &lp); if (retval) goto out_unlock; if ((queued = task_queued(p))) deactivate_task(p, task_rq(p)); retval = 0; __setscheduler(p, policy, lp.sched_priority); if (queued) { int prio = effective_prio(p); __activate_task(p, task_rq(p), prio); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on * this runqueue and our priority is higher than the current's */ if (preemption_warranted(prio, p, rq)) resched_task(rq->curr); if (task_running(rq, p)) rq->current_prio_slot = rq->queues + prio; } out_unlock: task_rq_unlock(rq, &flags); out_unlock_tasklist: read_unlock_irq(&tasklist_lock); out_nounlock: return retval; } /** * sys_sched_setscheduler - set/change the scheduler policy and RT priority * @pid: the pid in question. * @policy: new policy * @param: structure containing the new RT priority. */ asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) { return setscheduler(pid, policy, param); } /** * sys_sched_setparam - set/change the RT priority of a thread * @pid: the pid in question. * @param: structure containing the new RT priority. */ asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param __user *param) { return setscheduler(pid, -1, param); } /** * sys_sched_getscheduler - get the policy (scheduling class) of a thread * @pid: the pid in question. */ asmlinkage long sys_sched_getscheduler(pid_t pid) { int retval = -EINVAL; task_t *p; if (pid < 0) goto out_nounlock; retval = -ESRCH; read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (p) { retval = security_task_getscheduler(p); if (!retval) retval = p->policy; } read_unlock(&tasklist_lock); out_nounlock: return retval; } /** * sys_sched_getscheduler - get the RT priority of a thread * @pid: the pid in question. * @param: structure containing the RT priority. */ asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param __user *param) { struct sched_param lp; int retval = -EINVAL; task_t *p; if (!param || pid < 0) goto out_nounlock; read_lock(&tasklist_lock); p = find_process_by_pid(pid); retval = -ESRCH; if (!p) goto out_unlock; retval = security_task_getscheduler(p); if (retval) goto out_unlock; lp.sched_priority = p->rt_priority; read_unlock(&tasklist_lock); /* * This one might sleep, we cannot do it with a spinlock held ... */ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; out_nounlock: return retval; out_unlock: read_unlock(&tasklist_lock); return retval; } /** * sys_sched_setaffinity - set the cpu affinity of a process * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to the new cpu mask */ asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { cpumask_t new_mask; int retval; task_t *p; if (len < sizeof(new_mask)) return -EINVAL; if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) return -EFAULT; lock_cpu_hotplug(); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); unlock_cpu_hotplug(); return -ESRCH; } /* * It is not safe to call set_cpus_allowed with the * tasklist_lock held. We will bump the task_struct's * usage count and then drop tasklist_lock. */ get_task_struct(p); read_unlock(&tasklist_lock); retval = -EPERM; if ((current->euid != p->euid) && (current->euid != p->uid) && !capable(CAP_SYS_NICE)) goto out_unlock; retval = set_cpus_allowed(p, new_mask); out_unlock: put_task_struct(p); unlock_cpu_hotplug(); return retval; } /** * sys_sched_getaffinity - get the cpu affinity of a process * @pid: pid of the process * @len: length in bytes of the bitmask pointed to by user_mask_ptr * @user_mask_ptr: user-space pointer to hold the current cpu mask */ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, unsigned long __user *user_mask_ptr) { unsigned int real_len; cpumask_t mask; int retval; task_t *p; real_len = sizeof(mask); if (len < real_len) return -EINVAL; lock_cpu_hotplug(); read_lock(&tasklist_lock); retval = -ESRCH; p = find_process_by_pid(pid); if (!p) goto out_unlock; retval = 0; cpus_and(mask, p->cpus_allowed, cpu_possible_map); out_unlock: read_unlock(&tasklist_lock); unlock_cpu_hotplug(); if (retval) return retval; if (copy_to_user(user_mask_ptr, &mask, real_len)) return -EFAULT; return real_len; } void get_task_sched_stats(const struct task_struct *tsk, struct task_sched_stats *stats) { int on_runq = 0; int on_cpu = 0; unsigned long long timestamp; runqueue_t *rq = this_rq_lock(); stats->timestamp = rq->timestamp_last_tick; stats->avg_sleep_per_cycle = tsk->avg_sleep_per_cycle; stats->avg_delay_per_cycle = tsk->avg_delay_per_cycle; stats->avg_cpu_per_cycle = tsk->avg_cpu_per_cycle; stats->cycle_count = tsk->cycle_count; stats->total_sleep = tsk->total_sleep; stats->total_cpu = tsk->total_cpu; stats->total_delay = tsk->total_delay; timestamp = tsk->sched_timestamp; if ((on_runq = task_queued(tsk))) on_cpu = rq->idle == tsk; rq_unlock(rq); /* * Update values to the previous tick (only) */ if (stats->timestamp > timestamp) { unsigned long long delta = stats->timestamp - timestamp; if (on_cpu) { stats->avg_cpu_per_cycle += delta; stats->total_cpu += delta; } else if (on_runq) { stats->avg_delay_per_cycle += delta; stats->total_delay += delta; } else { stats->avg_sleep_per_cycle += delta; stats->total_sleep += delta; } } /* * Convert internal "real number" representation of average times * to integer values in nanoseconds */ stats->avg_sleep_per_cycle = SCHED_AVG_RND(stats->avg_sleep_per_cycle); stats->avg_cpu_per_cycle = SCHED_AVG_RND(stats->avg_cpu_per_cycle); stats->avg_delay_per_cycle = SCHED_AVG_RND(stats->avg_delay_per_cycle); } EXPORT_SYMBOL(get_task_sched_stats); /* * Get scheduling statistics for the nominated CPU */ void get_cpu_sched_stats(unsigned int cpu, struct cpu_sched_stats *stats) { int idle; unsigned long long idle_timestamp; runqueue_t *rq = cpu_rq(cpu); /* * No need to crash the whole machine if they've asked for stats for * a non existent CPU, just send back zero. */ if (rq == NULL) { stats->timestamp = 0; stats->total_idle = 0; stats->total_busy = 0; stats->total_delay = 0; stats->nr_switches = 0; return; } local_irq_disable(); spin_lock(&rq->lock); idle = rq->curr == rq->idle; stats->timestamp = rq->timestamp_last_tick; idle_timestamp = rq->idle->sched_timestamp; stats->total_idle = rq->idle->total_cpu; stats->total_busy = rq->idle->total_delay; stats->total_delay = rq->total_delay; stats->nr_switches = rq->nr_switches; rq_unlock(rq); /* * Update idle/busy time to the current tick */ if (idle) stats->total_idle += (stats->timestamp - idle_timestamp); else stats->total_busy += (stats->timestamp - idle_timestamp); } EXPORT_SYMBOL(get_cpu_sched_stats); /** * sys_sched_yield - yield the current processor to other threads. * * CPU then this function will return. */ asmlinkage long sys_sched_yield(void) { runqueue_t *rq = this_rq_lock(); if (sched_mode == SCHED_MODE_STAIRCASE) goto yield_staircase; /* * (special rule: RT tasks will just roundrobin in the active * array.) */ if (likely(!rt_task(current))) { /* If there's other tasks on this CPU make sure that as many of * them as possible/judicious get some CPU before this task */ dequeue_task(current); current->flags |= PF_YIELDED; rq->current_prio_slot = rq->queues + (IDLE_PRIO - 1); enqueue_task(current, rq, rq->current_prio_slot->prio); } else { list_del_init(¤t->run_list); list_add_tail(¤t->run_list, &rq->current_prio_slot->queue); } goto out; yield_staircase: dequeue_task(current); current->slice = slice(current); current->time_slice = RR_INTERVAL(); if (!rt_task(current)) { current->flags |= PF_YIELDED; rq->current_prio_slot = rq->queues + MAX_PRIO - 1; } current->burst = 0; enqueue_task(current, rq, rq->current_prio_slot->prio); out: /* * Since we are going to call schedule() anyway, there's * no need to preempt or enable interrupts: */ _raw_spin_unlock(&rq->lock); preempt_enable_no_resched(); schedule(); return 0; } void __sched __cond_resched(void) { set_current_state(TASK_RUNNING); schedule(); } EXPORT_SYMBOL(__cond_resched); /** * yield - yield the current processor to other threads. * * this is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ void __sched yield(void) { set_current_state(TASK_RUNNING); sys_sched_yield(); } EXPORT_SYMBOL(yield); /* * This task is about to go to sleep on IO. Increment rq->nr_iowait so * that process accounting knows that this is a task in IO wait state. * * But don't do that if it is a deliberate, throttling IO wait (this task * has set its backing_dev_info: the queue against which it should throttle) */ void __sched io_schedule(void) { struct runqueue *rq = this_rq(); atomic_inc(&rq->nr_iowait); schedule(); atomic_dec(&rq->nr_iowait); } EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { struct runqueue *rq = this_rq(); long ret; atomic_inc(&rq->nr_iowait); ret = schedule_timeout(timeout); atomic_dec(&rq->nr_iowait); return ret; } /** * sys_sched_get_priority_max - return maximum RT priority. * @policy: scheduling class. * * this syscall returns the maximum rt_priority that can be used * by a given scheduling class. */ asmlinkage long sys_sched_get_priority_max(int policy) { int ret = -EINVAL; switch (policy) { case SCHED_FIFO: case SCHED_RR: ret = MAX_USER_RT_PRIO-1; break; case SCHED_NORMAL: ret = 0; break; } return ret; } /** * sys_sched_get_priority_min - return minimum RT priority. * @policy: scheduling class. * * this syscall returns the minimum rt_priority that can be used * by a given scheduling class. */ asmlinkage long sys_sched_get_priority_min(int policy) { int ret = -EINVAL; switch (policy) { case SCHED_FIFO: case SCHED_RR: ret = 1; break; case SCHED_NORMAL: ret = 0; } return ret; } /** * sys_sched_rr_get_interval - return the default timeslice of a process. * @pid: pid of the process. * @interval: userspace pointer to the timeslice value. * * this syscall writes the default timeslice value of a given process * into the user-space timespec buffer. A value of '0' means infinity. */ asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec __user *interval) { int retval = -EINVAL; struct timespec t; task_t *p; if (pid < 0) goto out_nounlock; retval = -ESRCH; read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) goto out_unlock; retval = security_task_getscheduler(p); if (retval) goto out_unlock; jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : task_timeslice(p), &t); read_unlock(&tasklist_lock); retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; out_nounlock: return retval; out_unlock: read_unlock(&tasklist_lock); return retval; } static inline struct task_struct *eldest_child(struct task_struct *p) { if (list_empty(&p->children)) return NULL; return list_entry(p->children.next,struct task_struct,sibling); } static inline struct task_struct *older_sibling(struct task_struct *p) { if (p->sibling.prev==&p->parent->children) return NULL; return list_entry(p->sibling.prev,struct task_struct,sibling); } static inline struct task_struct *younger_sibling(struct task_struct *p) { if (p->sibling.next==&p->parent->children) return NULL; return list_entry(p->sibling.next,struct task_struct,sibling); } static void show_task(task_t * p) { task_t *relative; unsigned state; unsigned long free = 0; static const char *stat_nam[] = { "R", "S", "D", "T", "Z", "W" }; printk("%-13.13s ", p->comm); state = p->state ? __ffs(p->state) + 1 : 0; if (state < ARRAY_SIZE(stat_nam)) printk(stat_nam[state]); else printk("?"); #if (BITS_PER_LONG == 32) if (state == TASK_RUNNING) printk(" running "); else printk(" %08lX ", thread_saved_pc(p)); #else if (state == TASK_RUNNING) printk(" running task "); else printk(" %016lx ", thread_saved_pc(p)); #endif #ifdef CONFIG_DEBUG_STACK_USAGE { unsigned long * n = (unsigned long *) (p->thread_info+1); while (!*n) n++; free = (unsigned long) n - (unsigned long)(p->thread_info+1); } #endif printk("%5lu %5d %6d ", free, p->pid, p->parent->pid); if ((relative = eldest_child(p))) printk("%5d ", relative->pid); else printk(" "); if ((relative = younger_sibling(p))) printk("%7d", relative->pid); else printk(" "); if ((relative = older_sibling(p))) printk(" %5d", relative->pid); else printk(" "); if (!p->mm) printk(" (L-TLB)\n"); else printk(" (NOTLB)\n"); if (state != TASK_RUNNING) show_stack(p, NULL); } void show_state(void) { task_t *g, *p; #if (BITS_PER_LONG == 32) printk("\n" " sibling\n"); printk(" task PC pid father child younger older\n"); #else printk("\n" " sibling\n"); printk(" task PC pid father child younger older\n"); #endif read_lock(&tasklist_lock); do_each_thread(g, p) { /* * reset the NMI-timeout, listing all files on a slow * console might take alot of time: */ touch_nmi_watchdog(); show_task(p); } while_each_thread(g, p); read_unlock(&tasklist_lock); } void __devinit init_idle(task_t *idle, int cpu) { runqueue_t *idle_rq = cpu_rq(cpu), *rq = cpu_rq(task_cpu(idle)); unsigned long flags; local_irq_save(flags); double_rq_lock(idle_rq, rq); idle_rq->curr = idle_rq->idle = idle; deactivate_task(idle, rq); /* * Initialize scheduling statistics counters as they may provide * valuable about the CPU e.g. avg_cpu_time_per_cycle for the idle * task will be an estimate of the average time the CPU is idle */ initialize_stats(idle); initialize_bonuses(idle); idle->sched_timestamp = rq->timestamp_last_tick; idle->state = TASK_RUNNING; idle->burst = 0; set_task_cpu(idle, cpu); /* * Putting the idle process onto a run queue simplifies the selection of * the next task to run in schedule(). */ list_add_tail(&idle->run_list, &idle_rq->queues[IDLE_PRIO].queue); /* * The idle task is the current task on idle_rq */ idle_rq->current_prio_slot = idle_rq->queues + IDLE_PRIO; double_rq_unlock(idle_rq, rq); set_tsk_need_resched(idle); local_irq_restore(flags); /* Set the preempt count _outside_ the spinlocks! */ #ifdef CONFIG_PREEMPT idle->thread_info->preempt_count = (idle->lock_depth >= 0); #else idle->thread_info->preempt_count = 0; #endif } /* * In a system that switches off the HZ timer nohz_cpu_mask * indicates which cpus entered this state. This is used * in the rcu update to wait only for active cpus. For system * which do not switch off the HZ timer nohz_cpu_mask should * always be CPU_MASK_NONE. */ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; #ifdef CONFIG_SMP /* * This is how migration works: * * 1) we queue a migration_req_t structure in the source CPU's * runqueue and wake up that CPU's migration thread. * 2) we down() the locked semaphore => thread blocks. * 3) migration thread wakes up (implicitly it forces the migrated * thread off the CPU) * 4) it gets the migration request and checks whether the migrated * task is still in the wrong runqueue. * 5) if it's in the wrong runqueue then the migration thread removes * it and puts it into the right queue. * 6) migration thread up()s the semaphore. * 7) we wake up and the migration is done. */ /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on * is removed from the allowed bitmask. * * NOTE: the caller must have a valid reference to the task, the * task must not exit() & deallocate itself prematurely. The * call is not atomic; no spinlocks may be held. */ int set_cpus_allowed(task_t *p, cpumask_t new_mask) { unsigned long flags; int ret = 0; migration_req_t req; runqueue_t *rq; rq = task_rq_lock(p, &flags); if (any_online_cpu(new_mask) == NR_CPUS) { ret = -EINVAL; goto out; } p->cpus_allowed = new_mask; /* Can the task run on the task's current CPU? If so, we're done */ if (cpu_isset(task_cpu(p), new_mask)) goto out; if (migrate_task(p, any_online_cpu(new_mask), &req)) { /* Need help from migration thread: drop lock and wait. */ task_rq_unlock(rq, &flags); wake_up_process(rq->migration_thread); wait_for_completion(&req.done); return 0; } out: task_rq_unlock(rq, &flags); return ret; } EXPORT_SYMBOL_GPL(set_cpus_allowed); /* * Move (not current) task off this cpu, onto dest cpu. We're doing * this because either it can't run here any more (set_cpus_allowed() * away from this CPU, or CPU going down), or because we're * attempting to rebalance this task on exec (sched_balance_exec). * * So we race with normal scheduler movements, but that's OK, as long * as the task is no longer on this CPU. */ static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) { runqueue_t *rq_dest, *rq_src; if (unlikely(cpu_is_offline(dest_cpu))) return; rq_src = cpu_rq(src_cpu); rq_dest = cpu_rq(dest_cpu); double_rq_lock(rq_src, rq_dest); /* Already moved. */ if (task_cpu(p) != src_cpu) goto out; /* Affinity changed (again). */ if (!cpu_isset(dest_cpu, p->cpus_allowed)) goto out; if (task_queued(p)) { unsigned long long delta; /* * Sync timestamp with rq_dest's before activating. * The same thing could be achieved by doing this step * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. */ p->timestamp = p->timestamp - rq_src->timestamp_last_tick + rq_dest->timestamp_last_tick; deactivate_task(p, rq_src); /* * Do set_task_cpu() until AFTER we dequeue the task, since * dequeue_task() relies on task_cpu() always being accurate. */ set_task_cpu(p, dest_cpu); delta = (rq_dest->timestamp_last_tick - p->sched_timestamp); p->avg_delay_per_cycle += delta; p->avg_delay_per_sub_cycle += delta; p->total_delay += delta; if (preemption_warranted(activate_task(p, rq_dest, 0), p, rq_dest)) resched_task(rq_dest->curr); } else { unsigned long long delta; set_task_cpu(p, dest_cpu); delta = (rq_dest->timestamp_last_tick - p->sched_timestamp); p->avg_sleep_per_cycle += delta; p->total_sleep += delta; } p->sched_timestamp = rq_dest->timestamp_last_tick; out: double_rq_unlock(rq_src, rq_dest); } /* * migration_thread - this is a highprio system thread that performs * thread migration by bumping thread off CPU then 'pushing' onto * another runqueue. */ static int migration_thread(void * data) { runqueue_t *rq; int cpu = (long)data; rq = cpu_rq(cpu); BUG_ON(rq->migration_thread != current); set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { struct list_head *head; migration_req_t *req; if (current->flags & PF_FREEZE) refrigerator(PF_FREEZE); spin_lock_irq(&rq->lock); if (cpu_is_offline(cpu)) { spin_unlock_irq(&rq->lock); goto wait_to_die; } if (rq->active_balance) { active_load_balance(rq, cpu); rq->active_balance = 0; } head = &rq->migration_queue; if (list_empty(head)) { spin_unlock_irq(&rq->lock); schedule(); set_current_state(TASK_INTERRUPTIBLE); continue; } req = list_entry(head->next, migration_req_t, list); list_del_init(head->next); if (req->type == REQ_MOVE_TASK) { spin_unlock(&rq->lock); __migrate_task(req->task, smp_processor_id(), req->dest_cpu); local_irq_enable(); } else if (req->type == REQ_SET_DOMAIN) { rq->sd = req->sd; spin_unlock_irq(&rq->lock); } else { spin_unlock_irq(&rq->lock); WARN_ON(1); } complete(&req->done); } __set_current_state(TASK_RUNNING); return 0; wait_to_die: /* Wait for kthread_stop */ set_current_state(TASK_INTERRUPTIBLE); while (!kthread_should_stop()) { schedule(); set_current_state(TASK_INTERRUPTIBLE); } __set_current_state(TASK_RUNNING); return 0; } #ifdef CONFIG_HOTPLUG_CPU /* migrate_all_tasks - function to migrate all tasks from the dead cpu. */ static void migrate_all_tasks(int src_cpu) { struct task_struct *tsk, *t; int dest_cpu; unsigned int node; write_lock_irq(&tasklist_lock); /* watch out for per node tasks, let's stay on this node */ node = cpu_to_node(src_cpu); do_each_thread(t, tsk) { cpumask_t mask; if (tsk == current) continue; if (task_cpu(tsk) != src_cpu) continue; /* Figure out where this task should go (attempting to * keep it on-node), and check if it can be migrated * as-is. NOTE that kernel threads bound to more than * one online cpu will be migrated. */ mask = node_to_cpumask(node); cpus_and(mask, mask, tsk->cpus_allowed); dest_cpu = any_online_cpu(mask); if (dest_cpu == NR_CPUS) dest_cpu = any_online_cpu(tsk->cpus_allowed); if (dest_cpu == NR_CPUS) { cpus_clear(tsk->cpus_allowed); cpus_complement(tsk->cpus_allowed); dest_cpu = any_online_cpu(tsk->cpus_allowed); /* Don't tell them about moving exiting tasks or kernel threads (both mm NULL), since they never leave kernel. */ if (tsk->mm && printk_ratelimit()) printk(KERN_INFO "process %d (%s) no " "longer affine to cpu%d\n", tsk->pid, tsk->comm, src_cpu); } __migrate_task(tsk, src_cpu, dest_cpu); } while_each_thread(t, tsk); write_unlock_irq(&tasklist_lock); } /* Schedules idle task to be the next runnable task on current CPU. * It does so by boosting its priority to highest possible and adding it to * the _front_ of runqueue. Used by CPU offline code. */ void sched_idle_next(void) { int cpu = smp_processor_id(); runqueue_t *rq = this_rq(); struct task_struct *p = rq->idle; unsigned long flags; /* cpu has to be offline */ BUG_ON(cpu_online(cpu)); /* Strictly not necessary since rest of the CPUs are stopped by now * and interrupts disabled on current cpu. */ spin_lock_irqsave(&rq->lock, flags); /* Add idle task to _front_ of it's priority queue */ dequeue_task(p); __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); enqueue_task_head(p, rq, 0); rq->nr_running++; spin_unlock_irqrestore(&rq->lock, flags); } #endif /* CONFIG_HOTPLUG_CPU */ /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. */ static int migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) { int cpu = (long)hcpu; struct task_struct *p; struct runqueue *rq; unsigned long flags; switch (action) { case CPU_UP_PREPARE: p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. */ rq = task_rq_lock(p, &flags); __setscheduler(p, SCHED_FIFO, MAX_RT_PRIO-1); task_rq_unlock(rq, &flags); cpu_rq(cpu)->migration_thread = p; break; case CPU_ONLINE: /* Strictly unneccessary, as first user will wake it. */ wake_up_process(cpu_rq(cpu)->migration_thread); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: /* Unbind it from offline cpu so it can run. Fall thru. */ kthread_bind(cpu_rq(cpu)->migration_thread,smp_processor_id()); kthread_stop(cpu_rq(cpu)->migration_thread); cpu_rq(cpu)->migration_thread = NULL; break; case CPU_DEAD: migrate_all_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); rq->migration_thread = NULL; /* Idle task back to normal in IDLE_PRIO slot */ rq = task_rq_lock(rq->idle, &flags); deactivate_task(rq->idle, rq); rq->idle->static_prio = IDLE_PRIO; __setscheduler(rq->idle, SCHED_NORMAL, 0); enqueue_task(rq->idle, rq, IDLE_PRIO); task_rq_unlock(rq, &flags); BUG_ON(rq->nr_running != 0); /* No need to migrate the tasks: it was best-effort if * they didn't do lock_cpu_hotplug(). Just wake up * the requestors. */ spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { migration_req_t *req; req = list_entry(rq->migration_queue.next, migration_req_t, list); BUG_ON(req->type != REQ_MOVE_TASK); list_del_init(&req->list); complete(&req->done); } spin_unlock_irq(&rq->lock); break; #endif } return NOTIFY_OK; } /* Register at highest priority so that task migration (migrate_all_tasks) * happens before everything else. */ static struct notifier_block __devinitdata migration_notifier = { .notifier_call = migration_call, .priority = 10 }; int __init migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); /* Start one for boot CPU. */ migration_call(&migration_notifier, CPU_UP_PREPARE, cpu); migration_call(&migration_notifier, CPU_ONLINE, cpu); register_cpu_notifier(&migration_notifier); return 0; } #endif /* * The 'big kernel lock' * * This spinlock is taken and released recursively by lock_kernel() * and unlock_kernel(). It is transparently dropped and reaquired * over schedule(). It is used to protect legacy code that hasn't * been migrated to a proper locking design yet. * * Don't use in new code. * * Note: spinlock debugging needs this even on !CONFIG_SMP. */ spinlock_t kernel_flag __cacheline_aligned_in_smp = SPIN_LOCK_UNLOCKED; EXPORT_SYMBOL(kernel_flag); #ifdef CONFIG_SMP /* Attach the domain 'sd' to 'cpu' as its base domain */ void cpu_attach_domain(struct sched_domain *sd, int cpu) { migration_req_t req; unsigned long flags; runqueue_t *rq = cpu_rq(cpu); int local = 1; lock_cpu_hotplug(); spin_lock_irqsave(&rq->lock, flags); if (cpu == smp_processor_id() || !cpu_online(cpu)) { rq->sd = sd; } else { init_completion(&req.done); req.type = REQ_SET_DOMAIN; req.sd = sd; list_add(&req.list, &rq->migration_queue); local = 0; } spin_unlock_irqrestore(&rq->lock, flags); if (!local) { wake_up_process(rq->migration_thread); wait_for_completion(&req.done); } unlock_cpu_hotplug(); } #ifdef ARCH_HAS_SCHED_DOMAIN extern void __init arch_init_sched_domains(void); #else static struct sched_group sched_group_cpus[NR_CPUS]; static DEFINE_PER_CPU(struct sched_domain, cpu_domains); #ifdef CONFIG_NUMA static struct sched_group sched_group_nodes[MAX_NUMNODES]; static DEFINE_PER_CPU(struct sched_domain, node_domains); static void __init arch_init_sched_domains(void) { int i; struct sched_group *first_node = NULL, *last_node = NULL; /* Set up domains */ for_each_cpu(i) { int node = cpu_to_node(i); cpumask_t nodemask = node_to_cpumask(node); struct sched_domain *node_sd = &per_cpu(node_domains, i); struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); *node_sd = SD_NODE_INIT; node_sd->span = cpu_possible_map; node_sd->groups = &sched_group_nodes[cpu_to_node(i)]; *cpu_sd = SD_CPU_INIT; cpus_and(cpu_sd->span, nodemask, cpu_possible_map); cpu_sd->groups = &sched_group_cpus[i]; cpu_sd->parent = node_sd; } /* Set up groups */ for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t tmp = node_to_cpumask(i); cpumask_t nodemask; struct sched_group *first_cpu = NULL, *last_cpu = NULL; struct sched_group *node = &sched_group_nodes[i]; int j; cpus_and(nodemask, tmp, cpu_possible_map); if (cpus_empty(nodemask)) continue; node->cpumask = nodemask; node->cpu_power = SCHED_LOAD_SCALE * cpus_weight(node->cpumask); for_each_cpu_mask(j, node->cpumask) { struct sched_group *cpu = &sched_group_cpus[j]; cpus_clear(cpu->cpumask); cpu_set(j, cpu->cpumask); cpu->cpu_power = SCHED_LOAD_SCALE; if (!first_cpu) first_cpu = cpu; if (last_cpu) last_cpu->next = cpu; last_cpu = cpu; } last_cpu->next = first_cpu; if (!first_node) first_node = node; if (last_node) last_node->next = node; last_node = node; } last_node->next = first_node; mb(); for_each_cpu(i) { struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); cpu_attach_domain(cpu_sd, i); } } #else /* !CONFIG_NUMA */ static void __init arch_init_sched_domains(void) { int i; struct sched_group *first_cpu = NULL, *last_cpu = NULL; /* Set up domains */ for_each_cpu(i) { struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); *cpu_sd = SD_CPU_INIT; cpu_sd->span = cpu_possible_map; cpu_sd->groups = &sched_group_cpus[i]; } /* Set up CPU groups */ for_each_cpu_mask(i, cpu_possible_map) { struct sched_group *cpu = &sched_group_cpus[i]; cpus_clear(cpu->cpumask); cpu_set(i, cpu->cpumask); cpu->cpu_power = SCHED_LOAD_SCALE; if (!first_cpu) first_cpu = cpu; if (last_cpu) last_cpu->next = cpu; last_cpu = cpu; } last_cpu->next = first_cpu; mb(); /* domains were modified outside the lock */ for_each_cpu(i) { struct sched_domain *cpu_sd = &per_cpu(cpu_domains, i); cpu_attach_domain(cpu_sd, i); } } #endif /* CONFIG_NUMA */ #endif /* ARCH_HAS_SCHED_DOMAIN */ #define SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG void sched_domain_debug(void) { int i; for_each_cpu(i) { runqueue_t *rq = cpu_rq(i); struct sched_domain *sd; int level = 0; sd = rq->sd; printk(KERN_DEBUG "CPU%d: %s\n", i, (cpu_online(i) ? " online" : "offline")); do { int j; char str[NR_CPUS]; struct sched_group *group = sd->groups; cpumask_t groupmask, tmp; cpumask_scnprintf(str, NR_CPUS, sd->span); cpus_clear(groupmask); printk(KERN_DEBUG); for (j = 0; j < level + 1; j++) printk(" "); printk("domain %d: span %s\n", level, str); if (!cpu_isset(i, sd->span)) printk(KERN_DEBUG "ERROR domain->span does not contain CPU%d\n", i); if (!cpu_isset(i, group->cpumask)) printk(KERN_DEBUG "ERROR domain->groups does not contain CPU%d\n", i); if (!group->cpu_power) printk(KERN_DEBUG "ERROR domain->cpu_power not set\n"); printk(KERN_DEBUG); for (j = 0; j < level + 2; j++) printk(" "); printk("groups:"); do { if (!group) { printk(" ERROR: NULL"); break; } if (!cpus_weight(group->cpumask)) printk(" ERROR empty group:"); cpus_and(tmp, groupmask, group->cpumask); if (cpus_weight(tmp) > 0) printk(" ERROR repeated CPUs:"); cpus_or(groupmask, groupmask, group->cpumask); cpumask_scnprintf(str, NR_CPUS, group->cpumask); printk(" %s", str); group = group->next; } while (group != sd->groups); printk("\n"); if (!cpus_equal(sd->span, groupmask)) printk(KERN_DEBUG "ERROR groups don't span domain->span\n"); level++; sd = sd->parent; if (sd) { cpus_and(tmp, groupmask, sd->span); if (!cpus_equal(tmp, groupmask)) printk(KERN_DEBUG "ERROR parent span is not a superset of domain->span\n"); } } while (sd); } } #else #define sched_domain_debug() {} #endif void __init sched_init_smp(void) { arch_init_sched_domains(); sched_domain_debug(); } #else void __init sched_init_smp(void) { } #endif /* CONFIG_SMP */ int in_sched_functions(unsigned long addr) { /* Linker adds these: start and end of __sched functions */ extern char __sched_text_start[], __sched_text_end[]; return addr >= (unsigned long)__sched_text_start && addr < (unsigned long)__sched_text_end; } void __init sched_init(void) { runqueue_t *rq; int i, k; #ifdef CONFIG_SMP /* Set up an initial dummy domain for early boot */ static struct sched_domain sched_domain_init; static struct sched_group sched_group_init; cpumask_t cpu_mask_all = CPU_MASK_ALL; memset(&sched_domain_init, 0, sizeof(struct sched_domain)); sched_domain_init.span = cpu_mask_all; sched_domain_init.groups = &sched_group_init; sched_domain_init.last_balance = jiffies; sched_domain_init.balance_interval = INT_MAX; /* Don't balance */ memset(&sched_group_init, 0, sizeof(struct sched_group)); sched_group_init.cpumask = cpu_mask_all; sched_group_init.next = &sched_group_init; sched_group_init.cpu_power = SCHED_LOAD_SCALE; #endif for (i = 0; i < NR_CPUS; i++) { rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->cache_ticks = 0; rq->preempted = 0; #ifdef CONFIG_SMP rq->sd = &sched_domain_init; rq->cpu_load = 0; rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); #endif atomic_set(&rq->nr_iowait, 0); for (k = 0; k <= IDLE_PRIO; k++) { rq->queues[k].prio = k; INIT_LIST_HEAD(&rq->queues[k].queue); } bitmap_zero(rq->bitmap, NUM_PRIO_SLOTS); // delimiter for bitsearch __set_bit(IDLE_PRIO, rq->bitmap); rq->current_prio_slot = rq->queues + (IDLE_PRIO - 20); rq->timestamp_last_tick = sched_clock(); rq->next_prom_due = (jiffies + get_prom_interval(rq)); rq->total_delay = 0; rq->eb_yardstick = 0; rq->eb_ticks_to_decay += time_slice_ticks; } /* * We have to do a little magic to get the first * thread right in SMP mode. */ rq = this_rq(); rq->curr = current; rq->idle = current; set_task_cpu(current, smp_processor_id()); wake_up_forked_process(current); /* * The boot idle thread does lazy MMU switching as well: */ atomic_inc(&init_mm.mm_count); enter_lazy_tlb(&init_mm, current); } #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP void __might_sleep(char *file, int line) { #if defined(in_atomic) static unsigned long prev_jiffy; /* ratelimiting */ if ((in_atomic() || irqs_disabled()) && system_state == SYSTEM_RUNNING) { if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) return; prev_jiffy = jiffies; printk(KERN_ERR "Debug: sleeping function called from invalid" " context at %s:%d\n", file, line); printk("in_atomic():%d, irqs_disabled():%d\n", in_atomic(), irqs_disabled()); dump_stack(); } #endif } EXPORT_SYMBOL(__might_sleep); #endif #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) /* * This could be a long-held lock. If another CPU holds it for a long time, * and that CPU is not asked to reschedule then *this* CPU will spin on the * lock for a long time, even if *this* CPU is asked to reschedule. * * So what we do here, in the slow (contended) path is to spin on the lock by * hand while permitting preemption. * * Called inside preempt_disable(). */ void __sched __preempt_spin_lock(spinlock_t *lock) { if (preempt_count() > 1) { _raw_spin_lock(lock); return; } do { preempt_enable(); while (spin_is_locked(lock)) cpu_relax(); preempt_disable(); } while (!_raw_spin_trylock(lock)); } EXPORT_SYMBOL(__preempt_spin_lock); void __sched __preempt_write_lock(rwlock_t *lock) { if (preempt_count() > 1) { _raw_write_lock(lock); return; } do { preempt_enable(); while (rwlock_is_locked(lock)) cpu_relax(); preempt_disable(); } while (!_raw_write_trylock(lock)); } EXPORT_SYMBOL(__preempt_write_lock); #endif /* defined(CONFIG_SMP) && defined(CONFIG_PREEMPT) */ #if defined(CONFIG_SYSCTL) /* * CPU scheduler control via /proc/sys/cpusched/xxx */ enum { CPU_SCHED_END_OF_LIST=0, CPU_SCHED_TIME_SLICE=1, CPU_SCHED_BASE_PROMOTION_INTERVAL, CPU_SCHED_MAX_IA_BONUS, CPU_SCHED_MAX_TPT_BONUS, CPU_SCHED_IA_THRESHOLD, CPU_SCHED_CPU_HOG_THRESHOLD, CPU_SCHED_LOG_AT_EXIT, CPU_SCHED_INTERACTIVE, CPU_SCHED_COMPUTE, CPU_SCHED_MODE, CPU_SCHED_INITIAL_IA_BONUS, CPU_SCHED_HOG_SUB_CYCLE_THRESHOLD }; static const unsigned int zero = 0; static const unsigned int one = 1; #define min_milli_value zero static const unsigned int max_milli_value = 1000; #define min_max_ia_bonus zero static const unsigned int max_max_ia_bonus = MAX_MAX_IA_BONUS; #define min_max_tpt_bonus zero static const unsigned int max_max_tpt_bonus = MAX_MAX_TPT_BONUS; static unsigned int time_slice_msecs = DEFAULT_TIME_SLICE_MSECS; #define min_time_slice_msecs one static const unsigned int max_time_slice_msecs = MAX_TIME_SLICE_MSECS; static unsigned int base_prom_interval_msecs = BASE_PROM_INTERVAL_MSECS; #define min_base_prom_interval_msecs one static const unsigned int max_base_prom_interval_msecs = INT_MAX; #define max_hog_sub_cycle_threshold max_base_prom_interval_msecs static int proc_time_slice_msecs(ctl_table *ctp, int write, struct file *fp, void __user *buffer, size_t *lenp) { int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); if ((res == 0) && write) time_slice_ticks = MSECS_TO_JIFFIES_MIN_1(time_slice_msecs); return res; } static int proc_base_prom_interval_msecs(ctl_table *ctp, int write, struct file *fp, void __user *buffer, size_t *lenp) { int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); if ((res == 0) && write) base_prom_interval_ticks = MSECS_TO_JIFFIES_MIN_1(base_prom_interval_msecs); return res; } static int proc_cpu_hog_threshold(ctl_table *ctp, int write, struct file *fp, void __user *buffer, size_t *lenp) { int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); if ((res == 0) && write) cpu_hog_threshold = calc_proportion(cpu_hog_threshold_ppt, 1000); return res; } static int proc_ia_threshold(ctl_table *ctp, int write, struct file *fp, void __user *buffer, size_t *lenp) { int res = proc_dointvec_minmax(ctp, write, fp, buffer, lenp); if ((res == 0) && write) ia_threshold = calc_proportion(ia_threshold_ppt, 1000); return res; } #define SCHED_MODE_BUFFER_LEN 16 static char current_sched_mode[SCHED_MODE_BUFFER_LEN] = ""; static int proc_sched_mode(ctl_table *ctp, int write, struct file *fp, void __user *buffer, size_t *lenp) { int res; strcpy(current_sched_mode, sched_mode_names[sched_mode]); res = proc_dostring(ctp, write, fp, buffer, lenp); if ((res == 0) && write) { int i; for (i = 0; sched_mode_names[i] != NULL; i++) if (strcmp(current_sched_mode, sched_mode_names[i]) == 0) break; if (sched_mode_names[i] == NULL) res = -EINVAL; else /* set the scheduling mode */ sched_mode = i; } return res; } ctl_table cpu_sched_table[] = { { .ctl_name = CPU_SCHED_TIME_SLICE, .procname = "time_slice", .data = &time_slice_msecs, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_time_slice_msecs, .extra1 = (void *)&min_time_slice_msecs, .extra2 = (void *)&max_time_slice_msecs }, { .ctl_name = CPU_SCHED_BASE_PROMOTION_INTERVAL, .procname = "base_promotion_interval", .data = &base_prom_interval_msecs, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_base_prom_interval_msecs, .extra1 = (void *)&min_base_prom_interval_msecs, .extra2 = (void *)&max_base_prom_interval_msecs }, { .ctl_name = CPU_SCHED_MAX_IA_BONUS, .procname = "max_ia_bonus", .data = &max_ia_bonus, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .extra1 = (void *)&min_max_ia_bonus, .extra2 = (void *)&max_max_ia_bonus }, { .ctl_name = CPU_SCHED_INITIAL_IA_BONUS, .procname = "initial_ia_bonus", .data = &initial_ia_bonus, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .extra1 = (void *)&min_max_ia_bonus, .extra2 = (void *)&max_max_ia_bonus }, { .ctl_name = CPU_SCHED_MAX_TPT_BONUS, .procname = "max_tpt_bonus", .data = &max_tpt_bonus, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .extra1 = (void *)&min_max_tpt_bonus, .extra2 = (void *)&max_max_tpt_bonus }, { .ctl_name = CPU_SCHED_HOG_SUB_CYCLE_THRESHOLD, .procname = "hog_sub_cycle_threshold", .data = &hog_sub_cycle_threshold, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .extra1 = (void *)&zero, .extra2 = (void *)&max_hog_sub_cycle_threshold }, { .ctl_name = CPU_SCHED_IA_THRESHOLD, .procname = "ia_threshold", .data = &ia_threshold_ppt, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_ia_threshold, .extra1 = (void *)&min_milli_value, .extra2 = (void *)&max_milli_value }, { .ctl_name = CPU_SCHED_CPU_HOG_THRESHOLD, .procname = "cpu_hog_threshold", .data = &cpu_hog_threshold_ppt, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_cpu_hog_threshold, .extra1 = (void *)&min_milli_value, .extra2 = (void *)&max_milli_value }, { .ctl_name = CPU_SCHED_LOG_AT_EXIT, .procname = "log_at_exit", .data = &log_at_exit, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .extra1 = (void *)&zero, .extra2 = (void *)&one }, { .ctl_name = CPU_SCHED_INTERACTIVE, .procname = "interactive", .data = &sched_interactive, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .extra1 = (void *)&zero, .extra2 = (void *)&one }, { .ctl_name = CPU_SCHED_COMPUTE, .procname = "compute", .data = &sched_compute, .maxlen = sizeof (unsigned int), .mode = 0644, .proc_handler = &proc_dointvec_minmax, .extra1 = (void *)&zero, .extra2 = (void *)&one }, { .ctl_name = CPU_SCHED_MODE, .procname = "mode", .data = ¤t_sched_mode, .maxlen = SCHED_MODE_BUFFER_LEN, .mode = 0644, .proc_handler = &proc_sched_mode, }, { .ctl_name = CPU_SCHED_END_OF_LIST } }; #endif