Import M205FDDU3CSL4 kernel source
Signed-off-by: SamarV-121 <samarvispute121@gmail.com>
diff --git a/kernel/.gitignore b/kernel/.gitignore
deleted file mode 100644
index b3097bd..0000000
--- a/kernel/.gitignore
+++ /dev/null
@@ -1,7 +0,0 @@
-#
-# Generated files
-#
-config_data.h
-config_data.gz
-timeconst.h
-hz.bc
diff --git a/kernel/Kconfig.freezer b/kernel/Kconfig.freezer
old mode 100644
new mode 100755
index a3bb4cb..99d7cb1
--- a/kernel/Kconfig.freezer
+++ b/kernel/Kconfig.freezer
@@ -1,2 +1,7 @@
config FREEZER
def_bool PM_SLEEP || CGROUP_FREEZER
+config OLAF_SUPPORT
+ bool "thaw frozen process when recieve sig"
+ default n
+ help
+ Thaw a frozen process when it receives a signal. Enable this if the Samsung OLAF solution is present.
\ No newline at end of file
diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz
old mode 100644
new mode 100755
diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks
old mode 100644
new mode 100755
diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt
old mode 100644
new mode 100755
diff --git a/kernel/Makefile b/kernel/Makefile
old mode 100644
new mode 100755
index 53abf00..7a18edb
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -19,6 +19,17 @@
CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE)
endif
+# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip()
+# in coverage traces.
+KCOV_INSTRUMENT_softirq.o := n
+# These are called from save_stack_trace() on slub debug path,
+# and produce insane amounts of uninteresting coverage.
+KCOV_INSTRUMENT_module.o := n
+KCOV_INSTRUMENT_extable.o := n
+# Don't self-instrument.
+KCOV_INSTRUMENT_kcov.o := n
+KASAN_SANITIZE_kcov.o := n
+
# cond_syscall is currently not LTO compatible
CFLAGS_sys_ni.o = $(DISABLE_LTO)
@@ -29,6 +40,7 @@
obj-y += irq/
obj-y += rcu/
obj-y += livepatch/
+obj-$(CONFIG_RELOCATABLE_KERNEL) += kaslr.o
obj-$(CONFIG_CHECKPOINT_RESTORE) += kcmp.o
obj-$(CONFIG_FREEZER) += freezer.o
@@ -69,6 +81,7 @@
obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o
obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
obj-$(CONFIG_GCOV_KERNEL) += gcov/
+obj-$(CONFIG_KCOV) += kcov.o
obj-$(CONFIG_KPROBES) += kprobes.o
obj-$(CONFIG_KGDB) += debug/
obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
diff --git a/kernel/acct.c b/kernel/acct.c
old mode 100644
new mode 100755
diff --git a/kernel/async.c b/kernel/async.c
old mode 100644
new mode 100755
diff --git a/kernel/audit.c b/kernel/audit.c
old mode 100644
new mode 100755
index bdf0cf4..ec97a2a
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -69,6 +69,12 @@
#include "audit.h"
+// [ SEC_SELINUX_PORTING_EXYNOS
+#ifdef CONFIG_SEC_AVC_LOG
+#include <linux/sec_debug.h>
+#endif
+// ] SEC_SELINUX_PORTING_EXYNOS
+
/* No auditing will take place until audit_initialized == AUDIT_INITIALIZED.
* (Initialization happens after skb_init is called.) */
#define AUDIT_DISABLED -1
@@ -79,13 +85,15 @@
#define AUDIT_OFF 0
#define AUDIT_ON 1
#define AUDIT_LOCKED 2
-u32 audit_enabled = AUDIT_OFF;
-u32 audit_ever_enabled = !!AUDIT_OFF;
+// [ SEC_SELINUX_PORTING_COMMON
+u32 audit_enabled = AUDIT_ON;
+u32 audit_ever_enabled = !!AUDIT_ON;
+// ] SEC_SELINUX_PORTING_COMMON
EXPORT_SYMBOL_GPL(audit_enabled);
/* Default state when kernel boots without any parameters. */
-static u32 audit_default = AUDIT_OFF;
+static u32 audit_default = AUDIT_ON;
/* If auditing cannot proceed, audit_failure selects what happens. */
static u32 audit_failure = AUDIT_FAIL_PRINTK;
@@ -394,10 +402,16 @@
char *data = nlmsg_data(nlh);
if (nlh->nlmsg_type != AUDIT_EOE) {
+// [ SEC_SELINUX_PORTING_EXYNOS
+#ifdef CONFIG_SEC_AVC_LOG
+ sec_debug_avc_log("type=%d %s\n", nlh->nlmsg_type, data);
+#else
if (printk_ratelimit())
pr_notice("type=%d %s\n", nlh->nlmsg_type, data);
else
audit_log_lost("printk limit exceeded");
+#endif
+// ] SEC_SELINUX_PORTING_EXYNOS
}
audit_hold_skb(skb);
@@ -412,6 +426,10 @@
restart:
/* take a reference in case we can't send it and we want to hold it */
skb_get(skb);
+
+ /* null check to prevent kernel panic */
+ if (!skb)
+ return;
err = netlink_unicast(audit_sock, skb, audit_nlk_portid, 0);
if (err < 0) {
pr_err("netlink_unicast sending to audit_pid=%d returned error: %d\n",
@@ -436,9 +454,21 @@
}
/* we might get lucky and get this in the next auditd */
audit_hold_skb(skb);
- } else
+ } else {
+// [ SEC_SELINUX_PORTING_EXYNOS
+#ifdef CONFIG_SEC_AVC_LOG
+ struct nlmsghdr *nlh = nlmsg_hdr(skb);
+ char *data = NLMSG_DATA(nlh);
+
+ if (nlh->nlmsg_type != AUDIT_EOE && nlh->nlmsg_type != AUDIT_NETFILTER_CFG) {
+ sec_debug_avc_log("%s\n", data);
+ }
+#endif
+// ] SEC_SELINUX_PORTING_EXYNOS
/* drop the extra reference if sent ok */
consume_skb(skb);
+ }
+// ] SEC_SELINUX_PORTING_EXYNOS
}
/*
@@ -492,18 +522,20 @@
{
struct sk_buff *skb;
- if (!audit_default || !audit_pid)
+// [ SEC_SELINUX_PORTING_COMMON
+ if (!audit_default || !audit_pid || !audit_sock)
return;
-
+// ] SEC_SELINUX_PORTING_COMMON
skb = skb_dequeue(&audit_skb_hold_queue);
if (likely(!skb))
return;
- while (skb && audit_pid) {
+// [ SEC_SELINUX_PORTING_COMMON
+ while (skb && audit_pid && audit_sock) {
kauditd_send_skb(skb);
skb = skb_dequeue(&audit_skb_hold_queue);
}
-
+// ] SEC_SELINUX_PORTING_COMMON
/*
* if auditd just disappeared but we
* dequeued an skb we need to drop ref
@@ -525,8 +557,10 @@
if (skb) {
if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)
wake_up(&audit_backlog_wait);
- if (audit_pid)
+// [ SEC_SELINUX_PORTING_COMMON
+ if (audit_pid && audit_sock)
kauditd_send_skb(skb);
+// ] SEC_SELINUX_PORTING_COMMON
else
audit_printk_skb(skb);
continue;
@@ -871,6 +905,12 @@
return err;
}
if (s.mask & AUDIT_STATUS_PID) {
+ /* NOTE: we are using task_tgid_vnr() below because
+ * the s.pid value is relative to the namespace
+ * of the caller; at present this doesn't matter
+ * much since you can really only run auditd
+ * from the initial pid namespace, but something
+ * to keep in mind if this changes */
int new_pid = s.pid;
if ((!new_pid) && (task_tgid_vnr(current) != audit_pid))
@@ -1890,7 +1930,7 @@
" euid=%u suid=%u fsuid=%u"
" egid=%u sgid=%u fsgid=%u tty=%s ses=%u",
task_ppid_nr(tsk),
- task_pid_nr(tsk),
+ task_tgid_nr(tsk),
from_kuid(&init_user_ns, audit_get_loginuid(tsk)),
from_kuid(&init_user_ns, cred->uid),
from_kgid(&init_user_ns, cred->gid),
diff --git a/kernel/audit.h b/kernel/audit.h
old mode 100644
new mode 100755
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c
old mode 100644
new mode 100755
diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
old mode 100644
new mode 100755
diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c
old mode 100644
new mode 100755
diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
old mode 100644
new mode 100755
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
old mode 100644
new mode 100755
index 0fe8b33..4f164e9
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -458,7 +458,7 @@
switch (f->type) {
case AUDIT_PID:
- pid = task_pid_nr(tsk);
+ pid = task_tgid_nr(tsk);
result = audit_comparator(pid, f->op, f->val);
break;
case AUDIT_PPID:
@@ -1329,6 +1329,9 @@
/* tsk == current */
context->personality = tsk->personality;
+// [ SEC_SELINUX_PORTING_COMMON
+ if (context->major != __NR_setsockopt && context->major != 294) {
+// ] SEC_SELINUX_PORTING_COMMON
ab = audit_log_start(context, GFP_KERNEL, AUDIT_SYSCALL);
if (!ab)
return; /* audit_panic has been called */
@@ -1437,7 +1440,9 @@
}
audit_log_proctitle(tsk, context);
-
+// [ SEC_SELINUX_PORTING_COMMON
+ } // End of context->major != __NR_setsockopt
+// ] SEC_SELINUX_PORTING_COMMON
/* Send end of event record to help user space know we are finished */
ab = audit_log_start(context, GFP_KERNEL, AUDIT_EOE);
if (ab)
@@ -1992,7 +1997,7 @@
loginuid = from_kuid(&init_user_ns, kloginuid),
tty = audit_get_tty(current);
- audit_log_format(ab, "pid=%d uid=%u", task_pid_nr(current), uid);
+ audit_log_format(ab, "pid=%d uid=%u", task_tgid_nr(current), uid);
audit_log_task_context(ab);
audit_log_format(ab, " old-auid=%u auid=%u tty=%s old-ses=%u ses=%u res=%d",
oldloginuid, loginuid, tty ? tty_name(tty) : "(none)",
@@ -2219,7 +2224,7 @@
{
struct audit_context *context = current->audit_context;
- context->target_pid = task_pid_nr(t);
+ context->target_pid = task_tgid_nr(t);
context->target_auid = audit_get_loginuid(t);
context->target_uid = task_uid(t);
context->target_sessionid = audit_get_sessionid(t);
@@ -2244,7 +2249,7 @@
if (audit_pid && t->tgid == audit_pid) {
if (sig == SIGTERM || sig == SIGHUP || sig == SIGUSR1 || sig == SIGUSR2) {
- audit_sig_pid = task_pid_nr(tsk);
+ audit_sig_pid = task_tgid_nr(tsk);
if (uid_valid(tsk->loginuid))
audit_sig_uid = tsk->loginuid;
else
@@ -2344,7 +2349,7 @@
void __audit_log_capset(const struct cred *new, const struct cred *old)
{
struct audit_context *context = current->audit_context;
- context->capset.pid = task_pid_nr(current);
+ context->capset.pid = task_tgid_nr(current);
context->capset.cap.effective = new->cap_effective;
context->capset.cap.inheritable = new->cap_effective;
context->capset.cap.permitted = new->cap_permitted;
@@ -2376,7 +2381,7 @@
from_kgid(&init_user_ns, gid),
sessionid);
audit_log_task_context(ab);
- audit_log_format(ab, " pid=%d comm=", task_pid_nr(current));
+ audit_log_format(ab, " pid=%d comm=", task_tgid_nr(current));
audit_log_untrustedstring(ab, get_task_comm(comm, current));
audit_log_d_path_exe(ab, current->mm);
}
diff --git a/kernel/backtracetest.c b/kernel/backtracetest.c
old mode 100644
new mode 100755
diff --git a/kernel/bounds.c b/kernel/bounds.c
old mode 100644
new mode 100755
diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile
old mode 100644
new mode 100755
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
old mode 100644
new mode 100755
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
old mode 100644
new mode 100755
diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c
old mode 100644
new mode 100755
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
old mode 100644
new mode 100755
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
old mode 100644
new mode 100755
diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c
old mode 100644
new mode 100755
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
old mode 100644
new mode 100755
diff --git a/kernel/capability.c b/kernel/capability.c
old mode 100644
new mode 100755
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
old mode 100644
new mode 100755
index 5299618..9f193a2
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -784,6 +784,8 @@
static void put_css_set(struct css_set *cset)
{
+ unsigned long flags;
+
/*
* Ensure that the refcount doesn't hit zero while any readers
* can see it. Similar to atomic_dec_and_lock(), but for an
@@ -792,9 +794,9 @@
if (atomic_add_unless(&cset->refcount, -1, 1))
return;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irqsave(&css_set_lock, flags);
put_css_set_locked(cset);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irqrestore(&css_set_lock, flags);
}
/*
@@ -1017,11 +1019,11 @@
/* First see if we already have a cgroup group that matches
* the desired set */
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
cset = find_existing_css_set(old_cset, cgrp, template);
if (cset)
get_css_set(cset);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
if (cset)
return cset;
@@ -1049,7 +1051,7 @@
* find_existing_css_set() */
memcpy(cset->subsys, template, sizeof(cset->subsys));
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
/* Add reference counts and links from the new css_set. */
list_for_each_entry(link, &old_cset->cgrp_links, cgrp_link) {
struct cgroup *c = link->cgrp;
@@ -1075,7 +1077,7 @@
css_get(css);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return cset;
}
@@ -1139,7 +1141,7 @@
* Release all the links from cset_links to this hierarchy's
* root cgroup
*/
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry_safe(link, tmp_link, &cgrp->cset_links, cset_link) {
list_del(&link->cset_link);
@@ -1147,7 +1149,7 @@
kfree(link);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
if (!list_empty(&root->root_list)) {
list_del(&root->root_list);
@@ -1551,11 +1553,11 @@
ss->root = dst_root;
css->cgroup = dcgrp;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist)
list_move_tail(&cset->e_cset_node[ss->id],
&dcgrp->e_csets[ss->id]);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
src_root->subsys_mask &= ~(1 << ssid);
scgrp->subtree_control &= ~(1 << ssid);
@@ -1832,7 +1834,7 @@
{
struct task_struct *p, *g;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
if (use_task_css_set_links)
goto out_unlock;
@@ -1857,8 +1859,12 @@
* entry won't be deleted though the process has exited.
* Do it while holding siglock so that we don't end up
* racing against cgroup_exit().
+ *
+ * Interrupts were already disabled while acquiring
+ * the css_set_lock, so we do not need to disable it
+ * again when acquiring the sighand->siglock here.
*/
- spin_lock_irq(&p->sighand->siglock);
+ spin_lock(&p->sighand->siglock);
if (!(p->flags & PF_EXITING)) {
struct css_set *cset = task_css_set(p);
@@ -1867,11 +1873,11 @@
list_add_tail(&p->cg_list, &cset->tasks);
get_css_set(cset);
}
- spin_unlock_irq(&p->sighand->siglock);
+ spin_unlock(&p->sighand->siglock);
} while_each_thread(g, p);
read_unlock(&tasklist_lock);
out_unlock:
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
static void init_cgroup_housekeeping(struct cgroup *cgrp)
@@ -1976,13 +1982,13 @@
* Link the root cgroup in this hierarchy into all the css_set
* objects.
*/
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
hash_for_each(css_set_table, i, cset, hlist) {
link_css_set(&tmp_links, cset, root_cgrp);
if (css_set_populated(cset))
cgroup_update_populated(root_cgrp, true);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
BUG_ON(!list_empty(&root_cgrp->self.children));
BUG_ON(atomic_read(&root->nr_cgrps) != 1);
@@ -2215,7 +2221,7 @@
char *path = NULL;
mutex_lock(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
root = idr_get_next(&cgroup_hierarchy_idr, &hierarchy_id);
@@ -2228,7 +2234,7 @@
path = buf;
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
return path;
}
@@ -2403,7 +2409,7 @@
* the new cgroup. There are no failure cases after here, so this
* is the commit point.
*/
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(cset, &tset->src_csets, mg_node) {
list_for_each_entry_safe(task, tmp_task, &cset->mg_tasks, cg_list) {
struct css_set *from_cset = task_css_set(task);
@@ -2414,7 +2420,7 @@
put_css_set_locked(from_cset);
}
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
/*
* Migration is committed, all target tasks are now on dst_csets.
@@ -2443,13 +2449,13 @@
}
}
out_release_tset:
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_splice_init(&tset->dst_csets, &tset->src_csets);
list_for_each_entry_safe(cset, tmp_cset, &tset->src_csets, mg_node) {
list_splice_tail_init(&cset->mg_tasks, &cset->tasks);
list_del_init(&cset->mg_node);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return ret;
}
@@ -2466,14 +2472,14 @@
lockdep_assert_held(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) {
cset->mg_src_cgrp = NULL;
cset->mg_dst_cset = NULL;
list_del_init(&cset->mg_preload_node);
put_css_set_locked(cset);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
/**
@@ -2623,7 +2629,7 @@
* already PF_EXITING could be freed from underneath us unless we
* take an rcu_read_lock.
*/
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
rcu_read_lock();
task = leader;
do {
@@ -2632,7 +2638,7 @@
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return cgroup_taskset_migrate(&tset, cgrp);
}
@@ -2653,7 +2659,7 @@
int ret;
/* look up all src csets */
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
rcu_read_lock();
task = leader;
do {
@@ -2663,7 +2669,7 @@
break;
} while_each_thread(leader, task);
rcu_read_unlock();
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
/* prepare dst csets and commit */
ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets);
@@ -2688,7 +2694,8 @@
*/
if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
!uid_eq(cred->euid, tcred->uid) &&
- !uid_eq(cred->euid, tcred->suid))
+ !uid_eq(cred->euid, tcred->suid) &&
+ !ns_capable(tcred->user_ns, CAP_SYS_NICE))
ret = -EACCES;
if (!ret && cgroup_on_dfl(dst_cgrp)) {
@@ -2696,9 +2703,9 @@
struct cgroup *cgrp;
struct inode *inode;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
while (!cgroup_is_descendant(dst_cgrp, cgrp))
cgrp = cgroup_parent(cgrp);
@@ -2800,9 +2807,9 @@
if (root == &cgrp_dfl_root)
continue;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
from_cgrp = task_cgroup_from_root(from, root);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
retval = cgroup_attach_task(from_cgrp, tsk, false);
if (retval)
@@ -2927,7 +2934,7 @@
percpu_down_write(&cgroup_threadgroup_rwsem);
/* look up all csses currently attached to @cgrp's subtree */
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) {
struct cgrp_cset_link *link;
@@ -2939,14 +2946,14 @@
cgroup_migrate_add_src(link->cset, cgrp,
&preloaded_csets);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
/* NULL dst indicates self on default hierarchy */
ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets);
if (ret)
goto out_finish;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(src_cset, &preloaded_csets, mg_preload_node) {
struct task_struct *task, *ntask;
@@ -2958,7 +2965,7 @@
list_for_each_entry_safe(task, ntask, &src_cset->tasks, cg_list)
cgroup_taskset_add(task, &tset);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
ret = cgroup_taskset_migrate(&tset, cgrp);
out_finish:
@@ -3641,10 +3648,10 @@
int count = 0;
struct cgrp_cset_link *link;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
count += atomic_read(&link->cset->refcount);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return count;
}
@@ -3982,7 +3989,7 @@
memset(it, 0, sizeof(*it));
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
it->ss = css->ss;
@@ -3995,7 +4002,7 @@
css_task_iter_advance_css_set(it);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
/**
@@ -4013,7 +4020,7 @@
it->cur_task = NULL;
}
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
if (it->task_pos) {
it->cur_task = list_entry(it->task_pos, struct task_struct,
@@ -4022,7 +4029,7 @@
css_task_iter_advance(it);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return it->cur_task;
}
@@ -4036,10 +4043,10 @@
void css_task_iter_end(struct css_task_iter *it)
{
if (it->cur_cset) {
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_del(&it->iters_node);
put_css_set_locked(it->cur_cset);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
if (it->cur_task)
@@ -4068,10 +4075,10 @@
mutex_lock(&cgroup_mutex);
/* all tasks in @from are being moved, all csets are source */
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &from->cset_links, cset_link)
cgroup_migrate_add_src(link->cset, to, &preloaded_csets);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
ret = cgroup_migrate_prepare_dst(to, &preloaded_csets);
if (ret)
@@ -4353,6 +4360,9 @@
while ((tsk = css_task_iter_next(&it))) {
if (unlikely(n == length))
break;
+ /* avoid use-after-free on a dead task */
+ if (unlikely(tsk->state == TASK_DEAD))
+ continue;
/* get tgid or pid for procs or tasks file respectively */
if (type == CGROUP_FILE_PROCS)
pid = task_tgid_vnr(tsk);
@@ -5180,10 +5190,10 @@
*/
cgrp->self.flags &= ~CSS_ONLINE;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &cgrp->cset_links, cset_link)
link->cset->dead = true;
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
/* initiate massacre of all css's */
for_each_css(css, ssid, cgrp)
@@ -5333,6 +5343,12 @@
BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files));
BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files));
+ /*
+ * The latency of the synchronize_sched() is too high for cgroups,
+ * avoid it at the cost of forcing all readers into the slow path.
+ */
+ rcu_sync_enter_start(&cgroup_threadgroup_rwsem.rss);
+
mutex_lock(&cgroup_mutex);
/* Add init_css_set to the hash table */
@@ -5436,7 +5452,7 @@
goto out;
mutex_lock(&cgroup_mutex);
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
for_each_root(root) {
struct cgroup_subsys *ss;
@@ -5488,7 +5504,7 @@
retval = 0;
out_unlock:
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
mutex_unlock(&cgroup_mutex);
kfree(buf);
out:
@@ -5649,13 +5665,13 @@
if (use_task_css_set_links) {
struct css_set *cset;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
cset = task_css_set(current);
if (list_empty(&child->cg_list)) {
get_css_set(cset);
css_set_move_task(child, NULL, cset, false);
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
}
/*
@@ -5699,9 +5715,9 @@
cset = task_css_set(tsk);
if (!list_empty(&tsk->cg_list)) {
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
css_set_move_task(tsk, cset, NULL, false);
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
} else {
get_css_set(cset);
}
@@ -5914,7 +5930,7 @@
if (!name_buf)
return -ENOMEM;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
rcu_read_lock();
cset = rcu_dereference(current->cgroups);
list_for_each_entry(link, &cset->cgrp_links, cgrp_link) {
@@ -5925,7 +5941,7 @@
c->root->hierarchy_id, name_buf);
}
rcu_read_unlock();
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
kfree(name_buf);
return 0;
}
@@ -5936,13 +5952,13 @@
struct cgroup_subsys_state *css = seq_css(seq);
struct cgrp_cset_link *link;
- spin_lock_bh(&css_set_lock);
+ spin_lock_irq(&css_set_lock);
list_for_each_entry(link, &css->cgroup->cset_links, cset_link) {
struct css_set *cset = link->cset;
struct task_struct *task;
int count = 0;
- seq_printf(seq, "css_set %p\n", cset);
+ seq_printf(seq, "css_set %pK\n", cset);
list_for_each_entry(task, &cset->tasks, cg_list) {
if (count++ > MAX_TASKS_SHOWN_PER_CSS)
@@ -5959,7 +5975,7 @@
overflow:
seq_puts(seq, " ...\n");
}
- spin_unlock_bh(&css_set_lock);
+ spin_unlock_irq(&css_set_lock);
return 0;
}
diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c
old mode 100644
new mode 100755
index 2d3df82..1314c09
--- a/kernel/cgroup_freezer.c
+++ b/kernel/cgroup_freezer.c
@@ -450,6 +450,40 @@
return (bool)(freezer->state & CGROUP_FREEZING_PARENT);
}
+#ifdef CONFIG_SAMSUNG_FREECESS
+/**
+ * Check whether the task is allowed to be added to the freezer group;
+ * only the admin may add a task to the freezer group.
+ */
+static int freezer_can_attach(struct cgroup_taskset *tset)
+{
+ const struct cred *cred = current_cred(), *tcred;
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+
+ cgroup_taskset_for_each(task, css, tset) {
+ tcred = __task_cred(task);
+
+ // Only the system UID and processes with CAP_SYS_ADMIN have permission.
+ if ((current != task) && !(cred->euid.val == 1000 || capable(CAP_SYS_ADMIN))) {
+ pr_err("Permission problem\n");
+ return -EACCES;
+ }
+ }
+
+ return 0;
+}
+
+/**
+ * Cancel the attach action when it fails. This is normally used to undo the attach,
+ * but the freezer attach only sends a signal and will always succeed,
+ * so there is nothing to undo here.
+ */
+static void freezer_cancel_attach(struct cgroup_taskset *tset)
+{
+}
+#endif
+
static struct cftype files[] = {
{
.name = "state",
@@ -478,4 +512,8 @@
.attach = freezer_attach,
.fork = freezer_fork,
.legacy_cftypes = files,
+#ifdef CONFIG_SAMSUNG_FREECESS
+ .can_attach = freezer_can_attach,
+ .cancel_attach = freezer_cancel_attach,
+#endif
};
diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c
old mode 100644
new mode 100755
diff --git a/kernel/compat.c b/kernel/compat.c
old mode 100644
new mode 100755
diff --git a/kernel/configs.c b/kernel/configs.c
old mode 100644
new mode 100755
diff --git a/kernel/configs/android-base.config b/kernel/configs/android-base.config
new file mode 100755
index 0000000..d708290
--- /dev/null
+++ b/kernel/configs/android-base.config
@@ -0,0 +1,160 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_DEVKMEM is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_FHANDLE is not set
+# CONFIG_INET_LRO is not set
+# CONFIG_NFSD is not set
+# CONFIG_NFS_FS is not set
+# CONFIG_OABI_COMPAT is not set
+# CONFIG_SYSVIPC is not set
+# CONFIG_USELIB is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_ASHMEM=y
+CONFIG_AUDIT=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_BPF=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_DEFAULT_SECURITY_SELINUX=y
+CONFIG_EMBEDDED=y
+CONFIG_FB=y
+CONFIG_HARDENED_USERCOPY=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_INET_ESP=y
+CONFIG_INET_XFRM_MODE_TUNNEL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IPV6=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_NAT=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_TARGET_MASQUERADE=y
+CONFIG_IP_NF_TARGET_NETMAP=y
+CONFIG_IP_NF_TARGET_REDIRECT=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_MODULES=y
+CONFIG_MODULE_UNLOAD=y
+CONFIG_MODVERSIONS=y
+CONFIG_NET=y
+CONFIG_NETDEVICES=y
+CONFIG_NETFILTER=y
+CONFIG_NETFILTER_TPROXY=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_KEY=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_NAT=y
+CONFIG_NO_HZ=y
+CONFIG_PACKET=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_PREEMPT=y
+CONFIG_QUOTA=y
+CONFIG_RANDOMIZE_BASE=y
+CONFIG_RTC_CLASS=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SECCOMP=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_STAGING=y
+CONFIG_SWP_EMULATION=y
+CONFIG_SYNC=y
+CONFIG_TUN=y
+CONFIG_UNIX=y
+CONFIG_USB_GADGET=y
+CONFIG_USB_CONFIGFS=y
+CONFIG_USB_CONFIGFS_F_FS=y
+CONFIG_USB_CONFIGFS_F_MIDI=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_XFRM_USER=y
diff --git a/kernel/configs/android-recommended.config b/kernel/configs/android-recommended.config
new file mode 100755
index 0000000..297756b
--- /dev/null
+++ b/kernel/configs/android-recommended.config
@@ -0,0 +1,125 @@
+# KEEP ALPHABETICALLY SORTED
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+# CONFIG_INPUT_MOUSE is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_NF_CONNTRACK_SIP is not set
+# CONFIG_PM_WAKELOCKS_GC is not set
+# CONFIG_VT is not set
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_COMPACTION=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_FUSE_FS=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HIDRAW=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_GREENASIA=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_GPIO=y
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_ION=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_KSM=y
+CONFIG_LOGIG940_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGITECH_FF=y
+CONFIG_MD=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_MSDOS_FS=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_PANTHERLORD_FF=y
+CONFIG_PERF_EVENTS=y
+CONFIG_PM_DEBUG=y
+CONFIG_PM_RUNTIME=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+CONFIG_POWER_SUPPLY=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_SCHEDSTATS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_SND=y
+CONFIG_SOUND=y
+CONFIG_SUSPEND_TIME=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_TASK_XACCT=y
+CONFIG_TIMER_STATS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_UHID=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_USBNET=y
+CONFIG_VFAT_FS=y
diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config
old mode 100644
new mode 100755
diff --git a/kernel/configs/xen.config b/kernel/configs/xen.config
old mode 100644
new mode 100755
diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c
old mode 100644
new mode 100755
diff --git a/kernel/cpu.c b/kernel/cpu.c
old mode 100644
new mode 100755
index 40d20bf..bddf281
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -22,8 +22,11 @@
#include <linux/lockdep.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/cpuidle.h>
#include <trace/events/power.h>
+#include <trace/events/sched.h>
+
#include "smpboot.h"
#ifdef CONFIG_SMP
@@ -50,6 +53,7 @@
EXPORT_SYMBOL(cpu_notifier_register_done);
static RAW_NOTIFIER_HEAD(cpu_chain);
+static RAW_NOTIFIER_HEAD(cpus_chain);
/* If set, cpu_up and cpu_down will return -EBUSY and do nothing.
* Should always be manipulated under cpu_add_remove_lock
@@ -183,10 +187,17 @@
}
EXPORT_SYMBOL_GPL(cpu_hotplug_disable);
+static void __cpu_hotplug_enable(void)
+{
+ if (WARN_ONCE(!cpu_hotplug_disabled, "Unbalanced cpu hotplug enable\n"))
+ return;
+ cpu_hotplug_disabled--;
+}
+
void cpu_hotplug_enable(void)
{
cpu_maps_update_begin();
- WARN_ON(--cpu_hotplug_disabled < 0);
+ __cpu_hotplug_enable();
cpu_maps_update_done();
}
EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
@@ -226,6 +237,37 @@
EXPORT_SYMBOL(register_cpu_notifier);
EXPORT_SYMBOL(__register_cpu_notifier);
+int register_cpus_notifier(struct notifier_block *nb)
+{
+ int ret;
+ cpu_maps_update_begin();
+ ret = raw_notifier_chain_register(&cpus_chain, nb);
+ cpu_maps_update_done();
+ return ret;
+}
+
+static int __cpus_notify(unsigned long val, void *v, int nr_to_call,
+ int *nr_calls)
+{
+ int ret;
+
+ ret = __raw_notifier_call_chain(&cpus_chain, val, v, nr_to_call,
+ nr_calls);
+
+ return notifier_to_errno(ret);
+}
+
+static int cpus_notify(unsigned long val, void *v)
+{
+ return __cpus_notify(val, v, -1, NULL);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+static void cpu_notify_nofail(unsigned long val, void *v)
+{
+ BUG_ON(cpu_notify(val, v));
+}
+
void unregister_cpu_notifier(struct notifier_block *nb)
{
cpu_maps_update_begin();
@@ -240,11 +282,19 @@
}
EXPORT_SYMBOL(__unregister_cpu_notifier);
-#ifdef CONFIG_HOTPLUG_CPU
-static void cpu_notify_nofail(unsigned long val, void *v)
+static void cpus_notify_nofail(unsigned long val, void *v)
{
- BUG_ON(cpu_notify(val, v));
+ BUG_ON(cpus_notify(val, v));
}
+EXPORT_SYMBOL(register_cpus_notifier);
+
+void unregister_cpus_notifier(struct notifier_block *nb)
+{
+ cpu_maps_update_begin();
+ raw_notifier_chain_unregister(&cpus_chain, nb);
+ cpu_maps_update_done();
+}
+EXPORT_SYMBOL(unregister_cpus_notifier);
/**
* clear_tasks_mm_cpumask - Safely clear tasks' mm_cpumask for a CPU
@@ -320,18 +370,22 @@
static int take_cpu_down(void *_param)
{
struct take_cpu_down_param *param = _param;
+ void *hcpu = param->hcpu;
int err;
+ if ((long)hcpu == NR_CPUS)
+ hcpu = (void *)(long)smp_processor_id();
+
/* Ensure this CPU doesn't handle any more interrupts. */
err = __cpu_disable();
if (err < 0)
return err;
- cpu_notify(CPU_DYING | param->mod, param->hcpu);
+ cpu_notify(CPU_DYING | param->mod, hcpu);
/* Give up timekeeping duties */
tick_handover_do_timer();
/* Park the stopper thread */
- stop_machine_park((long)param->hcpu);
+ stop_machine_park((long)hcpu);
return 0;
}
@@ -354,29 +408,19 @@
cpu_hotplug_begin();
+ cpuidle_disable_device(per_cpu(cpuidle_devices, cpu));
+
err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls);
if (err) {
nr_calls--;
__cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL);
+ cpuidle_enable_device(per_cpu(cpuidle_devices, cpu));
pr_warn("%s: attempt to take down CPU %u failed\n",
__func__, cpu);
goto out_release;
}
- /*
- * By now we've cleared cpu_active_mask, wait for all preempt-disabled
- * and RCU users of this state to go away such that all new such users
- * will observe it.
- *
- * For CONFIG_PREEMPT we have preemptible RCU and its sync_rcu() might
- * not imply sync_sched(), so wait for both.
- *
- * Do sync before park smpboot threads to take care the rcu boost case.
- */
- if (IS_ENABLED(CONFIG_PREEMPT))
- synchronize_rcu_mult(call_rcu, call_rcu_sched);
- else
- synchronize_rcu();
+ cpu_notify_nofail(CPU_DOWN_LATE_PREPARE | mod, 0);
smpboot_park_threads(cpu);
@@ -386,9 +430,6 @@
*/
irq_lock_sparse();
- /*
- * So now all preempt/rcu users must observe !cpu_active().
- */
err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu));
if (err) {
/* CPU didn't die: tell everyone. Can't complain. */
@@ -417,6 +458,11 @@
/* This actually kills the CPU. */
__cpu_die(cpu);
+#ifdef CONFIG_HMP_SCHED
+ if (cpumask_test_cpu(cpu, &hmp_fast_cpu_mask))
+ cpus_notify_nofail(CPUS_DOWN_COMPLETE, (void *)cpu_online_mask);
+#endif
+
/* CPU is completely dead: tell everyone. Too late to complain. */
tick_cleanup_dead_cpu(cpu);
cpu_notify_nofail(CPU_DEAD | mod, hcpu);
@@ -425,6 +471,7 @@
out_release:
cpu_hotplug_done();
+ trace_sched_cpu_hotplug(cpu, err, 0);
if (!err)
cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu);
return err;
@@ -448,6 +495,149 @@
return err;
}
EXPORT_SYMBOL(cpu_down);
+
+int cpus_down(const struct cpumask *cpus)
+{
+ cpumask_t dest_cpus;
+ cpumask_t prepared_cpus;
+ int err = 0, cpu;
+ int nr_calls[8] = {0};
+ struct take_cpu_down_param tcd_param = {
+ .mod = 0,
+ .hcpu = (void *)NR_CPUS,
+ };
+
+ cpu_maps_update_begin();
+ cpu_hotplug_begin();
+
+ cpumask_and(&dest_cpus, cpus, cpu_online_mask);
+
+ if (cpu_hotplug_disabled || !cpumask_weight(&dest_cpus)
+ || num_online_cpus() <= cpumask_weight(&dest_cpus)) {
+ err = -EBUSY;
+ goto out;
+ }
+
+ cpumask_clear(&prepared_cpus);
+
+ for_each_cpu(cpu, &dest_cpus) {
+ void *hcpu = (void *)(long)cpu;
+ cpumask_set_cpu(cpu, &prepared_cpus);
+
+ cpuidle_disable_device(per_cpu(cpuidle_devices, cpu));
+
+ err = __cpu_notify(CPU_DOWN_PREPARE, hcpu, -1, &nr_calls[cpu]);
+ if (err) {
+ nr_calls[cpu]--;
+ goto err_down_prepare;
+ }
+ }
+
+ cpu_notify_nofail(CPU_DOWN_LATE_PREPARE, 0);
+
+ for_each_cpu(cpu, &dest_cpus) {
+ smpboot_park_threads(cpu);
+ }
+
+ /*
+ * Prevent irq alloc/free while the dying cpu reorganizes the
+ * interrupt affinities.
+ */
+ irq_lock_sparse();
+
+ err = stop_machine(take_cpu_down, &tcd_param, &dest_cpus);
+ if (err)
+ goto err_stop_machine;
+
+ for_each_cpu(cpu, &dest_cpus) {
+ BUG_ON(cpu_online(cpu));
+ /*
+ * The migration_call() CPU_DYING callback will have removed all
+ * runnable tasks from the cpu, there's only the idle task left now
+ * that the migration thread is done doing the stop_machine thing.
+ *
+ * Wait for the stop thread to go away.
+ */
+ while (!per_cpu(cpu_dead_idle, cpu))
+ cpu_relax();
+ smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
+ per_cpu(cpu_dead_idle, cpu) = false;
+ }
+
+ /* Interrupts are moved away from the dying cpu, reenable alloc/free */
+ irq_unlock_sparse();
+
+ for_each_cpu(cpu, &dest_cpus) {
+ hotplug_cpu__broadcast_tick_pull(cpu);
+ /* This actually kills the CPU. */
+ __cpu_die(cpu);
+ }
+
+ cpus_notify_nofail(CPUS_DOWN_COMPLETE, (void *)cpu_online_mask);
+
+ /* CPU is completely dead: tell everyone. Too late to complain. */
+ for_each_cpu(cpu, &dest_cpus) {
+ void *hcpu = (void *)(long)cpu;
+
+ tick_cleanup_dead_cpu(cpu);
+ cpu_notify_nofail(CPU_DEAD, hcpu);
+
+ check_for_tasks(cpu);
+ }
+
+ cpu_hotplug_done();
+
+ for_each_cpu(cpu, &dest_cpus) {
+ void *hcpu = (void *)(long)cpu;
+
+ trace_sched_cpu_hotplug(cpu, err, 0);
+ cpu_notify_nofail(CPU_POST_DEAD, hcpu);
+ }
+
+ cpu_maps_update_done();
+
+ return 0;
+
+err_stop_machine:
+ for_each_cpu(cpu, &dest_cpus) {
+ void *hcpu = (void *)(long)cpu;
+ smpboot_unpark_threads(cpu);
+ cpu_notify_nofail(CPU_DOWN_FAILED, hcpu);
+ }
+ goto out;
+
+err_down_prepare:
+ for_each_cpu(cpu, &prepared_cpus) {
+ void *hcpu = (void *)(long)cpu;
+ cpuidle_enable_device(per_cpu(cpuidle_devices, cpu));
+ __cpu_notify(CPU_DOWN_FAILED, hcpu, nr_calls[cpu], NULL);
+ printk("%s: attempt to take down CPU %u failed\n",
+ __func__, cpu);
+ }
+
+out:
+ cpu_hotplug_done();
+ cpu_maps_update_done();
+
+ return err;
+}
+
+int cpus_up(const struct cpumask *cpus)
+{
+ int err = 0;
+ unsigned int cpu = 0;
+ cpumask_t dest_cpus;
+
+ cpumask_andnot(&dest_cpus, cpus, cpu_online_mask);
+ for_each_cpu(cpu, &dest_cpus) {
+ err = cpu_up(cpu);
+ if (err)
+ goto out;
+ }
+out:
+ return err;
+}
+EXPORT_SYMBOL(cpus_up);
#endif /*CONFIG_HOTPLUG_CPU*/
/*
@@ -524,12 +714,14 @@
/* Now call notifier in preparation. */
cpu_notify(CPU_ONLINE | mod, hcpu);
+ cpuidle_enable_device(per_cpu(cpuidle_devices, cpu));
out_notify:
if (ret != 0)
__cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL);
out:
cpu_hotplug_done();
+ trace_sched_cpu_hotplug(cpu, ret, 1);
return ret;
}
@@ -537,6 +729,9 @@
int cpu_up(unsigned int cpu)
{
int err = 0;
+#ifdef CONFIG_SCHED_HMP
+ cpumask_t dest_cpus;
+#endif
if (!cpu_possible(cpu)) {
pr_err("can't online cpu %d because it is not configured as may-hotadd at boot time\n",
@@ -547,6 +742,11 @@
return -EINVAL;
}
+ if (!cpumask_test_cpu(cpu, &early_cpu_mask)) {
+ dump_stack();
+ return -EINVAL;
+ }
+
err = try_online_node(cpu_to_node(cpu));
if (err)
return err;
@@ -558,6 +758,16 @@
goto out;
}
+#ifdef CONFIG_SCHED_HMP
+ if (cpumask_test_cpu(cpu, &hmp_fast_cpu_mask)) {
+ cpumask_or(&dest_cpus, cpumask_of(cpu), cpu_online_mask);
+
+ err = cpus_notify(CPUS_UP_PREPARE, (void *)&dest_cpus);
+ if (err)
+ goto out;
+ }
+#endif
+
err = _cpu_up(cpu, 0);
out:
@@ -623,10 +833,11 @@
void enable_nonboot_cpus(void)
{
int cpu, error;
+ struct device *cpu_device;
/* Allow everyone to use the CPU hotplug again */
cpu_maps_update_begin();
- WARN_ON(--cpu_hotplug_disabled < 0);
+ __cpu_hotplug_enable();
if (cpumask_empty(frozen_cpus))
goto out;
@@ -640,6 +851,12 @@
trace_suspend_resume(TPS("CPU_ON"), cpu, false);
if (!error) {
pr_info("CPU%d is up\n", cpu);
+ cpu_device = get_cpu_device(cpu);
+ if (!cpu_device)
+ pr_err("%s: failed to get cpu%d device\n",
+ __func__, cpu);
+ else
+ kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE);
continue;
}
pr_warn("Error taking CPU%d up: %d\n", cpu, error);
@@ -827,3 +1044,23 @@
{
cpumask_copy(to_cpumask(cpu_online_bits), src);
}
+
+static ATOMIC_NOTIFIER_HEAD(idle_notifier);
+
+void idle_notifier_register(struct notifier_block *n)
+{
+ atomic_notifier_chain_register(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_register);
+
+void idle_notifier_unregister(struct notifier_block *n)
+{
+ atomic_notifier_chain_unregister(&idle_notifier, n);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_unregister);
+
+void idle_notifier_call_chain(unsigned long val)
+{
+ atomic_notifier_call_chain(&idle_notifier, val, NULL);
+}
+EXPORT_SYMBOL_GPL(idle_notifier_call_chain);
diff --git a/kernel/cpu_pm.c b/kernel/cpu_pm.c
old mode 100644
new mode 100755
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
old mode 100644
new mode 100755
index dd3ae6e..e1ae443
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -99,6 +99,7 @@
/* user-configured CPUs and Memory Nodes allow to tasks */
cpumask_var_t cpus_allowed;
+ cpumask_var_t cpus_requested;
nodemask_t mems_allowed;
/* effective CPUs and Memory Nodes allow to tasks */
@@ -171,6 +172,7 @@
CS_SCHED_LOAD_BALANCE,
CS_SPREAD_PAGE,
CS_SPREAD_SLAB,
+ CS_SELECTIVE_BOOST,
} cpuset_flagbits_t;
/* convenient tests for these bits */
@@ -219,6 +221,11 @@
(1 << CS_MEM_EXCLUSIVE)),
};
+static inline int is_selective_boost_enabled(const struct cpuset *cs)
+{
+ return test_bit(CS_SELECTIVE_BOOST, &cs->flags);
+}
+
/**
* cpuset_for_each_child - traverse online children of a cpuset
* @child_cs: loop cursor pointing to the current child
@@ -322,6 +329,20 @@
.mount = cpuset_mount,
};
+int cpuset_task_is_boosted(struct task_struct *p)
+{
+ struct cpuset *cpuset_for_task;
+ int ret = 0;
+
+ rcu_read_lock();
+ cpuset_for_task = task_cs(p);
+ ret = is_selective_boost_enabled(cpuset_for_task);
+ rcu_read_unlock();
+
+ return ret;
+}
+EXPORT_SYMBOL(cpuset_task_is_boosted);
+
/*
* Return in pmask the portion of a cpusets's cpus_allowed that
* are online. If none are online, walk up the cpuset hierarchy
@@ -398,7 +419,7 @@
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
{
- return cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+ return cpumask_subset(p->cpus_requested, q->cpus_requested) &&
nodes_subset(p->mems_allowed, q->mems_allowed) &&
is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
is_mem_exclusive(p) <= is_mem_exclusive(q);
@@ -498,7 +519,7 @@
cpuset_for_each_child(c, css, par) {
if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
c != cur &&
- cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+ cpumask_intersects(trial->cpus_requested, c->cpus_requested))
goto out;
if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
c != cur &&
@@ -957,17 +978,18 @@
if (!*buf) {
cpumask_clear(trialcs->cpus_allowed);
} else {
- retval = cpulist_parse(buf, trialcs->cpus_allowed);
+ retval = cpulist_parse(buf, trialcs->cpus_requested);
if (retval < 0)
return retval;
- if (!cpumask_subset(trialcs->cpus_allowed,
- top_cpuset.cpus_allowed))
+ if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
return -EINVAL;
+
+ cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
}
/* Nothing to do if the cpus didn't change */
- if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+ if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
return 0;
retval = validate_change(cs, trialcs);
@@ -976,6 +998,7 @@
spin_lock_irq(&callback_lock);
cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
spin_unlock_irq(&callback_lock);
/* use trialcs->cpus_allowed as a temp variable */
@@ -1502,6 +1525,28 @@
return ret;
}
+static int cpuset_allow_attach(struct cgroup_taskset *tset)
+{
+ const struct cred *cred = current_cred(), *tcred;
+ struct task_struct *task;
+ struct cgroup_subsys_state *css;
+ struct cpuset *cs;
+
+ cgroup_taskset_first(tset, &css);
+ cs = css_cs(css);
+
+ cgroup_taskset_for_each(task, css, tset) {
+ tcred = __task_cred(task);
+
+ if ((current != task) && !capable(CAP_SYS_ADMIN) &&
+ !uid_eq(cred->euid, tcred->uid) &&
+ !uid_eq(cred->euid, tcred->suid))
+ return -EACCES;
+ }
+
+ return 0;
+}
+
static void cpuset_cancel_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
@@ -1609,6 +1654,7 @@
FILE_MEMORY_PRESSURE,
FILE_SPREAD_PAGE,
FILE_SPREAD_SLAB,
+ FILE_SELECTIVE_BOOST,
} cpuset_filetype_t;
static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
@@ -1649,6 +1695,9 @@
case FILE_SPREAD_SLAB:
retval = update_flag(CS_SPREAD_SLAB, cs, val);
break;
+ case FILE_SELECTIVE_BOOST:
+ retval = update_flag(CS_SELECTIVE_BOOST, cs, val);
+ break;
default:
retval = -EINVAL;
break;
@@ -1766,7 +1815,7 @@
switch (type) {
case FILE_CPULIST:
- seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
+ seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested));
break;
case FILE_MEMLIST:
seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
@@ -1808,6 +1857,8 @@
return is_spread_page(cs);
case FILE_SPREAD_SLAB:
return is_spread_slab(cs);
+ case FILE_SELECTIVE_BOOST:
+ return is_selective_boost_enabled(cs);
default:
BUG();
}
@@ -1935,6 +1986,13 @@
.private = FILE_MEMORY_PRESSURE_ENABLED,
},
+ {
+ .name = "selective_boost",
+ .read_u64 = cpuset_read_u64,
+ .write_u64 = cpuset_write_u64,
+ .private = FILE_SELECTIVE_BOOST,
+ },
+
{ } /* terminate */
};
@@ -1956,11 +2014,14 @@
return ERR_PTR(-ENOMEM);
if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
goto free_cs;
+ if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+ goto free_allowed;
if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
- goto free_cpus;
+ goto free_requested;
set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
cpumask_clear(cs->cpus_allowed);
+ cpumask_clear(cs->cpus_requested);
nodes_clear(cs->mems_allowed);
cpumask_clear(cs->effective_cpus);
nodes_clear(cs->effective_mems);
@@ -1969,7 +2030,9 @@
return &cs->css;
-free_cpus:
+free_requested:
+ free_cpumask_var(cs->cpus_requested);
+free_allowed:
free_cpumask_var(cs->cpus_allowed);
free_cs:
kfree(cs);
@@ -2032,6 +2095,7 @@
cs->mems_allowed = parent->mems_allowed;
cs->effective_mems = parent->mems_allowed;
cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+ cpumask_copy(cs->cpus_requested, parent->cpus_requested);
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
@@ -2066,6 +2130,7 @@
free_cpumask_var(cs->effective_cpus);
free_cpumask_var(cs->cpus_allowed);
+ free_cpumask_var(cs->cpus_requested);
kfree(cs);
}
@@ -2107,6 +2172,7 @@
.css_offline = cpuset_css_offline,
.css_free = cpuset_css_free,
.can_attach = cpuset_can_attach,
+ .allow_attach = cpuset_allow_attach,
.cancel_attach = cpuset_cancel_attach,
.attach = cpuset_attach,
.post_attach = cpuset_post_attach,
@@ -2130,8 +2196,11 @@
BUG();
if (!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL))
BUG();
+ if (!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL))
+ BUG();
cpumask_setall(top_cpuset.cpus_allowed);
+ cpumask_setall(top_cpuset.cpus_requested);
nodes_setall(top_cpuset.mems_allowed);
cpumask_setall(top_cpuset.effective_cpus);
nodes_setall(top_cpuset.effective_mems);
@@ -2265,7 +2334,7 @@
goto retry;
}
- cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+ cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
diff --git a/kernel/crash_dump.c b/kernel/crash_dump.c
old mode 100644
new mode 100755
diff --git a/kernel/cred.c b/kernel/cred.c
old mode 100644
new mode 100755
index ff8606f..4e6eadc
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -19,6 +19,10 @@
#include <linux/binfmts.h>
#include <linux/cn_proc.h>
+#ifdef CONFIG_RKP_KDP
+#include <linux/slub_def.h>
+#endif
+
#if 0
#define kdebug(FMT, ...) \
printk("[%-5.5s%5u] " FMT "\n", \
@@ -36,11 +40,92 @@
/* init to 2 - one for init_task, one to ensure it is never freed */
struct group_info init_groups = { .usage = ATOMIC_INIT(2) };
+#ifdef CONFIG_RKP_KDP
+int rkp_cred_enable __kdp_ro = 0;
+
+static struct kmem_cache *cred_jar_ro;
+struct kmem_cache *tsec_jar;
+struct kmem_cache *usecnt_jar;
+atomic_t init_cred_use_cnt = ATOMIC_INIT(4);
+
+unsigned int rkp_get_usecount(struct cred *cred)
+{
+ if (rkp_ro_page((unsigned long )cred))
+ return (unsigned int)rocred_uc_read(cred);
+ else
+ return atomic_read(&cred->usage);
+}
+
+struct cred *get_new_cred(struct cred *cred)
+{
+ if (rkp_ro_page((unsigned long)cred))
+ rocred_uc_inc(cred);
+ else
+ atomic_inc(&cred->usage);
+ return cred;
+}
+
+void put_cred(const struct cred *_cred)
+{
+ struct cred *cred = (struct cred *) _cred;
+
+ validate_creds(cred);
+
+ if (rkp_ro_page((unsigned long)cred)) {
+ if (rocred_uc_dec_and_test(cred)) {
+ __put_cred(cred);
+ }
+ } else {
+ if (atomic_dec_and_test(&(cred)->usage))
+ __put_cred(cred);
+ }
+}
+
+int rkp_from_tsec_jar(unsigned long addr)
+{
+ static void *objp;
+ static struct kmem_cache *s;
+ static struct page *page;
+
+ objp = (void *)addr;
+
+ if(!objp)
+ return 0;
+
+ page = virt_to_head_page(objp);
+ s = page->slab_cache;
+ if(s && s->name) {
+ if(!strcmp(s->name,"tsec_jar")) {
+ return 1;
+ }
+ }
+ return 0;
+}
+int chk_invalid_kern_ptr(u64 tsec)
+{
+ return (((u64)tsec >> 36) != (u64)0xFFFFFFC);
+}
+void rkp_free_security(unsigned long tsec)
+{
+ if(!tsec ||
+ chk_invalid_kern_ptr(tsec))
+ return;
+
+ if(rkp_ro_page(tsec) &&
+ rkp_from_tsec_jar(tsec)){
+ kmem_cache_free(tsec_jar,(void *)tsec);
+ }
+ else {
+ kfree((void *)tsec);
+ }
+}
+#endif /* CONFIG_RKP_KDP */
/*
* The initial credentials for the initial task
*/
-struct cred init_cred = {
+/* CONFIG_RKP_KDP */
+struct cred init_cred __kdp_ro = {
.usage = ATOMIC_INIT(4),
#ifdef CONFIG_DEBUG_CREDENTIALS
.subscribers = ATOMIC_INIT(2),
@@ -62,8 +147,47 @@
.user = INIT_USER,
.user_ns = &init_user_ns,
.group_info = &init_groups,
+#ifdef CONFIG_RKP_KDP
+ .use_cnt = &init_cred_use_cnt,
+ .bp_task = &init_task,
+ .bp_pgd = (void *) 0,
+ .type = 0,
+#endif /*CONFIG_RKP_KDP*/
};
+#ifdef CONFIG_RKP_KDP
+void rkp_get_init_cred(void)
+{
+ if (rkp_ro_page((unsigned long)&init_cred))
+ rocred_uc_inc((&init_cred));
+ else
+ atomic_inc(&init_cred.usage);
+}
+EXPORT_SYMBOL(rkp_get_init_cred);
+
+/* We use another function to free protected creds. */
+static void put_ro_cred_rcu(struct rcu_head *rcu)
+{
+ struct cred *cred = container_of(rcu, struct ro_rcu_head, rcu)->bp_cred;
+ if (rocred_uc_read(cred) != 0)
+ panic("RO_CRED: put_ro_cred_rcu() sees %p with usage %d\n",
+ cred, rocred_uc_read(cred));
+
+ security_cred_free(cred);
+ key_put(cred->session_keyring);
+ key_put(cred->process_keyring);
+ key_put(cred->thread_keyring);
+ key_put(cred->request_key_auth);
+ if (cred->group_info)
+ put_group_info(cred->group_info);
+ free_uid(cred->user);
+ put_user_ns(cred->user_ns);
+ if(cred->use_cnt)
+ kmem_cache_free(usecnt_jar,(void *)cred->use_cnt);
+ kmem_cache_free(cred_jar_ro, cred);
+}
+#endif /*CONFIG_RKP_KDP*/
+
static inline void set_cred_subscribers(struct cred *cred, int n)
{
#ifdef CONFIG_DEBUG_CREDENTIALS
@@ -137,6 +261,11 @@
atomic_read(&cred->usage),
read_cred_subscribers(cred));
+#ifdef CONFIG_RKP_KDP
+ if (rkp_ro_page((unsigned long)cred))
+ BUG_ON((rocred_uc_read(cred)) != 0);
+ else
+#endif /*CONFIG_RKP_KDP*/
BUG_ON(atomic_read(&cred->usage) != 0);
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(cred) != 0);
@@ -146,6 +275,11 @@
BUG_ON(cred == current->cred);
BUG_ON(cred == current->real_cred);
+#ifdef CONFIG_RKP_KDP
+ if (rkp_ro_page((unsigned long)cred)) {
+ call_rcu(&(get_rocred_rcu(cred)->rcu), put_ro_cred_rcu);
+ } else
+#endif /*CONFIG_RKP_KDP*/
call_rcu(&cred->rcu, put_cred_rcu);
}
EXPORT_SYMBOL(__put_cred);
@@ -187,13 +321,28 @@
const struct cred *get_task_cred(struct task_struct *task)
{
const struct cred *cred;
+#ifdef CONFIG_RKP_KDP
+ int inc_test;
+#endif /*CONFIG_RKP_KDP*/
rcu_read_lock();
+#ifdef CONFIG_RKP_KDP
+ do {
+ cred = __task_cred((task));
+ BUG_ON(!cred);
+ if (rkp_ro_page((unsigned long)cred)) {
+ inc_test = rocred_uc_inc_not_zero(cred);
+ }
+ else
+ inc_test = atomic_inc_not_zero(&((struct cred *)cred)->usage);
+ } while (!inc_test);
+#else
do {
cred = __task_cred((task));
BUG_ON(!cred);
} while (!atomic_inc_not_zero(&((struct cred *)cred)->usage));
+#endif /*CONFIG_RKP_KDP*/
rcu_read_unlock();
return cred;
@@ -285,6 +434,68 @@
}
EXPORT_SYMBOL(prepare_creds);
+#ifdef CONFIG_RKP_KDP
+static struct cred *prepare_ro_creds(struct cred *old, int kdp_cmd, u64 p)
+{
+ u64 pgd =(u64)(current->mm?current->mm->pgd:swapper_pg_dir);
+ struct cred *new_ro;
+ void *use_cnt_ptr = NULL;
+ void *rcu_ptr = NULL;
+ void *tsec = NULL;
+ cred_param_t cred_param;
+ new_ro = kmem_cache_alloc(cred_jar_ro, GFP_KERNEL);
+ if (!new_ro)
+ panic("[%d] : kmem_cache_alloc() failed", kdp_cmd);
+
+ use_cnt_ptr = kmem_cache_alloc(usecnt_jar,GFP_KERNEL);
+ if (!use_cnt_ptr)
+ panic("[%d] : Unable to allocate usage pointer\n", kdp_cmd);
+
+ rcu_ptr = get_usecnt_rcu(use_cnt_ptr);
+ ((struct ro_rcu_head*)rcu_ptr)->bp_cred = (void *)new_ro;
+
+ tsec = kmem_cache_alloc(tsec_jar, GFP_KERNEL);
+ if (!tsec)
+ panic("[%d] : Unable to allocate security pointer\n", kdp_cmd);
+
+ rkp_cred_fill_params(old,new_ro,use_cnt_ptr,tsec,kdp_cmd,p);
+ uh_call(UH_APP_RKP, RKP_KDP_X46, (u64)&cred_param, 0, 0, 0);
+ if (kdp_cmd == RKP_CMD_COPY_CREDS) {
+ if ((new_ro->bp_task != (void *)p)
+ || new_ro->security != tsec
+ || new_ro->use_cnt != use_cnt_ptr) {
+ panic("[%d]: RKP Call failed task=#%p:%p#, sec=#%p:%p#, usecnt=#%p:%p#", kdp_cmd, new_ro->bp_task,(void *)p,new_ro->security,tsec,new_ro->use_cnt,use_cnt_ptr);
+ }
+ }
+ else {
+ if ((new_ro->bp_task != current)||
+ (current->mm
+ && new_ro->bp_pgd != (void *)pgd) ||
+ (new_ro->security != tsec) ||
+ (new_ro->use_cnt != use_cnt_ptr)) {
+ panic("[%d]: RKP Call failed task=#%p:%p#, sec=#%p:%p#, usecnt=#%p:%p#, pgd=#%p:%p#", kdp_cmd, new_ro->bp_task,current,new_ro->security,tsec,new_ro->use_cnt,use_cnt_ptr,new_ro->bp_pgd,(void *)pgd);
+ }
+ }
+
+ rocred_uc_set(new_ro, 2);
+
+ set_cred_subscribers(new_ro, 0);
+ get_group_info(new_ro->group_info);
+ get_uid(new_ro->user);
+ get_user_ns(new_ro->user_ns);
+
+#ifdef CONFIG_KEYS
+ key_get(new_ro->session_keyring);
+ key_get(new_ro->process_keyring);
+ key_get(new_ro->thread_keyring);
+ key_get(new_ro->request_key_auth);
+#endif
+
+ validate_creds(new_ro);
+ return new_ro;
+}
+#endif /* CONFIG_RKP_KDP */
+
/*
* Prepare credentials for current to perform an execve()
* - The caller must hold ->cred_guard_mutex
@@ -324,6 +535,18 @@
struct cred *new;
int ret;
+#ifdef CONFIG_RKP_KDP
+ /*
+ * Disable cred sharing within the same thread group. This is
+ * needed because we only added one back pointer in cred.
+ *
+ * This should NOT change kernel logic in any way: when a thread
+ * needs to change its credentials, it simply creates a new one,
+ * while all other threads in the same thread group still
+ * reference the old one, whose reference counter decreases by 2.
+ */
+ if(!rkp_cred_enable){
+#endif /* CONFIG_RKP_KDP */
if (
#ifdef CONFIG_KEYS
!p->cred->thread_keyring &&
@@ -339,6 +562,9 @@
atomic_inc(&p->cred->user->processes);
return 0;
}
+#ifdef CONFIG_RKP_KDP
+ }
+#endif /* CONFIG_RKP_KDP */
new = prepare_creds();
if (!new)
@@ -370,9 +596,24 @@
#endif
atomic_inc(&new->user->processes);
+#ifdef CONFIG_RKP_KDP
+ if(rkp_cred_enable){
+ struct cred *new_ro;
+ new_ro = prepare_ro_creds(new, RKP_CMD_COPY_CREDS, (u64)p);
+ p->cred = p->real_cred = new_ro;
+ put_cred(new);
+ }
+ else {
+ p->cred = p->real_cred = get_cred(new);
+ alter_cred_subscribers(new, 2);
+ validate_creds(new);
+ }
+#else
p->cred = p->real_cred = get_cred(new);
alter_cred_subscribers(new, 2);
validate_creds(new);
+#endif /* CONFIG_RKP_KDP */
+
return 0;
error_put:
@@ -434,6 +675,11 @@
validate_creds(old);
validate_creds(new);
#endif
+#ifdef CONFIG_RKP_KDP
+ if (rkp_ro_page((unsigned long)new))
+ BUG_ON((rocred_uc_read(new)) < 1);
+ else
+#endif
BUG_ON(atomic_read(&new->usage) < 1);
get_cred(new); /* we will require a ref for the subj creds too */
@@ -463,8 +709,23 @@
alter_cred_subscribers(new, 2);
if (new->user != old->user)
atomic_inc(&new->user->processes);
+#ifdef CONFIG_RKP_KDP
+ if(rkp_cred_enable) {
+ struct cred *new_ro;
+
+ new_ro = prepare_ro_creds(new, RKP_CMD_CMMIT_CREDS, 0);
+
+ rcu_assign_pointer(task->real_cred, new_ro);
+ rcu_assign_pointer(task->cred, new_ro);
+ }
+ else {
+ rcu_assign_pointer(task->real_cred, new);
+ rcu_assign_pointer(task->cred, new);
+ }
+#else
rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new);
+#endif /* CONFIG_RKP_KDP */
if (new->user != old->user)
atomic_dec(&old->user->processes);
alter_cred_subscribers(old, -2);
@@ -482,6 +743,13 @@
!gid_eq(new->fsgid, old->fsgid))
proc_id_connector(task, PROC_EVENT_GID);
+#ifdef CONFIG_RKP_KDP
+ if (rkp_cred_enable){
+ put_cred(new);
+ put_cred(new);
+ }
+#endif /* CONFIG_RKP_KDP */
+
/* release the old obj and subj refs both */
put_cred(old);
put_cred(old);
@@ -505,6 +773,11 @@
#ifdef CONFIG_DEBUG_CREDENTIALS
BUG_ON(read_cred_subscribers(new) != 0);
#endif
+#ifdef CONFIG_RKP_KDP
+ if (rkp_ro_page((unsigned long)new))
+ BUG_ON((rocred_uc_read(new)) < 1);
+ else
+#endif /* CONFIG_RKP_KDP */
BUG_ON(atomic_read(&new->usage) < 1);
put_cred(new);
}
@@ -517,9 +790,16 @@
* Install a set of temporary override subjective credentials on the current
* process, returning the old set for later reversion.
*/
+#ifdef CONFIG_RKP_KDP
+const struct cred *rkp_override_creds(struct cred **cnew)
+#else
const struct cred *override_creds(const struct cred *new)
+#endif /* CONFIG_RKP_KDP */
{
const struct cred *old = current->cred;
+#ifdef CONFIG_RKP_KDP
+ struct cred *new = *cnew;
+#endif /* CONFIG_RKP_KDP */
kdebug("override_creds(%p{%d,%d})", new,
atomic_read(&new->usage),
@@ -527,9 +807,26 @@
validate_creds(old);
validate_creds(new);
+#ifdef CONFIG_RKP_KDP
+ if(rkp_cred_enable) {
+ volatile unsigned int rkp_use_count = rkp_get_usecount(new);
+ struct cred *new_ro;
+
+ new_ro = prepare_ro_creds(new, RKP_CMD_OVRD_CREDS, rkp_use_count);
+ rcu_assign_pointer(current->cred, new_ro);
+ *cnew = new_ro;
+ put_cred(new);
+ }
+ else {
+ get_cred(new);
+ alter_cred_subscribers(new, 1);
+ rcu_assign_pointer(current->cred, new);
+ }
+#else
get_cred(new);
alter_cred_subscribers(new, 1);
rcu_assign_pointer(current->cred, new);
+#endif /* CONFIG_RKP_KDP */
alter_cred_subscribers(old, -1);
kdebug("override_creds() = %p{%d,%d}", old,
@@ -537,7 +834,11 @@
read_cred_subscribers(old));
return old;
}
+#ifdef CONFIG_RKP_KDP
+EXPORT_SYMBOL(rkp_override_creds);
+#else
EXPORT_SYMBOL(override_creds);
+#endif /* CONFIG_RKP_KDP */
/**
* revert_creds - Revert a temporary subjective credentials override
@@ -563,6 +864,22 @@
}
EXPORT_SYMBOL(revert_creds);
+#ifdef CONFIG_RKP_KDP
+void cred_ctor(void *data)
+{
+ /* Dummy constructor to make sure we have separate slab caches. */
+}
+void sec_ctor(void *data)
+{
+ /* Dummy constructor to make sure we have separate slab caches. */
+ //printk("\n initializing sec_ctor = %p \n",data);
+}
+void usecnt_ctor(void *data)
+{
+ /* Dummy constructor to make sure we have separate slab caches. */
+}
+#endif /* CONFIG_RKP_KDP */
+
/*
* initialise the credentials stuff
*/
@@ -571,6 +888,28 @@
/* allocate a slab in which we can store credentials */
cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred),
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+#ifdef CONFIG_RKP_KDP
+ if(rkp_cred_enable) {
+ cred_jar_ro = kmem_cache_create("cred_jar_ro", sizeof(struct cred),
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, cred_ctor);
+ if(!cred_jar_ro) {
+ panic("Unable to create RO Cred cache\n");
+ }
+
+ tsec_jar = kmem_cache_create("tsec_jar", rkp_get_task_sec_size(),
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, sec_ctor);
+ if(!tsec_jar) {
+ panic("Unable to create RO security cache\n");
+ }
+
+ usecnt_jar = kmem_cache_create("usecnt_jar", sizeof(atomic_t) + sizeof(struct ro_rcu_head),
+ 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, usecnt_ctor);
+ if(!usecnt_jar) {
+ panic("Unable to create use count jar\n");
+ }
+ uh_call(UH_APP_RKP, RKP_KDP_X42, (u64)cred_jar_ro->size, (u64)tsec_jar->size, 0, 0);
+ }
+#endif /* CONFIG_RKP_KDP */
}
/**
@@ -733,6 +1072,14 @@
cred == tsk->cred ? "[eff]" : "");
printk(KERN_ERR "CRED: ->magic=%x, put_addr=%p\n",
cred->magic, cred->put_addr);
+#ifdef CONFIG_RKP_KDP
+ if (rkp_ro_page((unsigned long)cred)) {
+ printk(KERN_ERR "CRED: ->usage(FROM ARRAY)=%d, subscr=%d\n",
+ rkp_get_usecount(cred),
+ read_cred_subscribers(cred));
+ }
+ else
+#endif /* CONFIG_RKP_KDP */
printk(KERN_ERR "CRED: ->usage=%d, subscr=%d\n",
atomic_read(&cred->usage),
read_cred_subscribers(cred));
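
Illustrative sketch (not part of the patch): the RKP/KDP hunks above keep credentials in read-only pages, so the reference count cannot live inside the cred itself; it is held in a separately allocated, writable use-count object that points back at the read-only cred (the rocred_uc_* helpers and the prepare_ro_creds() wiring). A minimal user-space analogue of that split-refcount layout, using hypothetical names such as ro_cred and ro_refcount, could look like this:

/*
 * Illustration only: a read-only object whose refcount lives in a
 * separate, writable side allocation. Names are hypothetical.
 */
#include <stdio.h>
#include <stdlib.h>

struct ro_refcount {            /* lives in normal, writable memory      */
    int count;
    void *backpointer;          /* points back at the read-only object   */
};

struct ro_cred {                /* imagine this page is write-protected  */
    int uid;
    struct ro_refcount *use_cnt;
};

static struct ro_cred *ro_cred_alloc(int uid)
{
    struct ro_cred *cred = malloc(sizeof(*cred));
    struct ro_refcount *uc = malloc(sizeof(*uc));

    if (!cred || !uc)
        exit(1);
    uc->count = 1;
    uc->backpointer = cred;
    cred->uid = uid;
    cred->use_cnt = uc;         /* after this point, pretend cred is RO  */
    return cred;
}

static void ro_cred_get(struct ro_cred *cred)
{
    cred->use_cnt->count++;     /* only the side object is ever written  */
}

static void ro_cred_put(struct ro_cred *cred)
{
    if (--cred->use_cnt->count == 0) {
        free(cred->use_cnt);
        free(cred);             /* the real code defers this via RCU     */
    }
}

int main(void)
{
    struct ro_cred *cred = ro_cred_alloc(1000);

    ro_cred_get(cred);
    printf("uid=%d refs=%d\n", cred->uid, cred->use_cnt->count);
    ro_cred_put(cred);
    ro_cred_put(cred);
    return 0;
}

The real code additionally defers the final free through call_rcu(), as the __put_cred() hunk above shows with put_ro_cred_rcu().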
diff --git a/kernel/debug/Makefile b/kernel/debug/Makefile
old mode 100644
new mode 100755
diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c
old mode 100644
new mode 100755
diff --git a/kernel/debug/debug_core.h b/kernel/debug/debug_core.h
old mode 100644
new mode 100755
diff --git a/kernel/debug/gdbstub.c b/kernel/debug/gdbstub.c
old mode 100644
new mode 100755
diff --git a/kernel/debug/kdb/.gitignore b/kernel/debug/kdb/.gitignore
deleted file mode 100644
index 396d12e..0000000
--- a/kernel/debug/kdb/.gitignore
+++ /dev/null
@@ -1 +0,0 @@
-gen-kdb_cmds.c
diff --git a/kernel/debug/kdb/Makefile b/kernel/debug/kdb/Makefile
old mode 100644
new mode 100755
diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c
old mode 100644
new mode 100755
index e1dbf4a..90ff129
--- a/kernel/debug/kdb/kdb_bp.c
+++ b/kernel/debug/kdb/kdb_bp.c
@@ -153,13 +153,11 @@
} else {
kdb_printf("%s: failed to set breakpoint at 0x%lx\n",
__func__, bp->bp_addr);
-#ifdef CONFIG_DEBUG_RODATA
if (!bp->bp_type) {
kdb_printf("Software breakpoints are unavailable.\n"
- " Change the kernel CONFIG_DEBUG_RODATA=n\n"
+ " Boot the kernel with rodata=off\n"
" OR use hw breaks: help bph\n");
}
-#endif
return 1;
}
return 0;
diff --git a/kernel/debug/kdb/kdb_bt.c b/kernel/debug/kdb/kdb_bt.c
old mode 100644
new mode 100755
diff --git a/kernel/debug/kdb/kdb_cmds b/kernel/debug/kdb/kdb_cmds
old mode 100644
new mode 100755
diff --git a/kernel/debug/kdb/kdb_debugger.c b/kernel/debug/kdb/kdb_debugger.c
old mode 100644
new mode 100755
diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c
old mode 100644
new mode 100755
index cc892a9..d3c5b15
--- a/kernel/debug/kdb/kdb_io.c
+++ b/kernel/debug/kdb/kdb_io.c
@@ -216,7 +216,7 @@
int i;
int diag, dtab_count;
int key, buf_size, ret;
-
+ static int last_crlf;
diag = kdbgetintenv("DTABCOUNT", &dtab_count);
if (diag)
@@ -237,6 +237,9 @@
return buffer;
if (key != 9)
tab = 0;
+ if (key != 10 && key != 13)
+ last_crlf = 0;
+
switch (key) {
case 8: /* backspace */
if (cp > buffer) {
@@ -254,7 +257,12 @@
*cp = tmp;
}
break;
- case 13: /* enter */
+ case 10: /* new line */
+ case 13: /* carriage return */
+ /* handle \n after \r */
+ if (last_crlf && last_crlf != key)
+ break;
+ last_crlf = key;
*lastchar++ = '\n';
*lastchar++ = '\0';
if (!KDB_STATE(KGDB_TRANS)) {
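
Illustrative sketch (not part of the patch): the kdb_io.c hunk above treats both '\n' and '\r' as Enter but remembers the last one seen in last_crlf, so the second byte of a "\r\n" (or "\n\r") pair is swallowed instead of producing an empty command. The standalone fragment below, with hypothetical names, replays that state machine over a byte stream:

/* Standalone illustration of the last_crlf handling added above. */
#include <stdio.h>

static int last_crlf;   /* 0, '\n' or '\r' -- mirrors the kdb static */

static int is_enter(int key)
{
    if (key != '\n' && key != '\r')
        last_crlf = 0;

    if (key == '\n' || key == '\r') {
        /* swallow the second half of a CR/LF or LF/CR pair */
        if (last_crlf && last_crlf != key)
            return 0;
        last_crlf = key;
        return 1;
    }
    return 0;
}

int main(void)
{
    const char input[] = "ls\r\nhelp\n\nbt\rgo\r\n";
    const char *p;
    int enters = 0;

    for (p = input; *p; p++)
        if (is_enter(*p))
            enters++;

    printf("enter pressed %d times\n", enters);  /* expect 5 */
    return 0;
}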
diff --git a/kernel/debug/kdb/kdb_keyboard.c b/kernel/debug/kdb/kdb_keyboard.c
old mode 100644
new mode 100755
diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c
old mode 100644
new mode 100755
diff --git a/kernel/debug/kdb/kdb_private.h b/kernel/debug/kdb/kdb_private.h
old mode 100644
new mode 100755
diff --git a/kernel/debug/kdb/kdb_support.c b/kernel/debug/kdb/kdb_support.c
old mode 100644
new mode 100755
diff --git a/kernel/delayacct.c b/kernel/delayacct.c
old mode 100644
new mode 100755
index ef90b04..ff96d59
--- a/kernel/delayacct.c
+++ b/kernel/delayacct.c
@@ -144,6 +144,19 @@
return ret;
}
+#ifdef CONFIG_PAGE_BOOST
+__u64 __delayacct_blkio_nsecs(struct task_struct *tsk)
+{
+ __u64 ret;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tsk->delays->lock, flags);
+ ret = tsk->delays->blkio_delay + tsk->delays->swapin_delay;
+ spin_unlock_irqrestore(&tsk->delays->lock, flags);
+ return ret;
+}
+#endif
+
void __delayacct_freepages_start(void)
{
current->delays->freepages_start = ktime_get_ns();
diff --git a/kernel/dma.c b/kernel/dma.c
old mode 100644
new mode 100755
diff --git a/kernel/elfcore.c b/kernel/elfcore.c
old mode 100644
new mode 100755
diff --git a/kernel/events/Makefile b/kernel/events/Makefile
old mode 100644
new mode 100755
diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c
old mode 100644
new mode 100755
diff --git a/kernel/events/core.c b/kernel/events/core.c
old mode 100644
new mode 100755
index e53dfb5..0625ebe
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -175,8 +175,13 @@
* 0 - disallow raw tracepoint access for unpriv
* 1 - disallow cpu events for unpriv
* 2 - disallow kernel profiling for unpriv
+ * 3 - disallow all unpriv perf event use
*/
+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
+int sysctl_perf_event_paranoid __read_mostly = 3;
+#else
int sysctl_perf_event_paranoid __read_mostly = 1;
+#endif
/* Minimum for 512 kiB + 1 user control page */
int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -1492,10 +1497,17 @@
* If this was a group event with sibling events then
* upgrade the siblings to singleton events by adding them
* to whatever list we are on.
+ * If this isn't on a list, make sure we still remove the sibling's
+ * group_entry from this sibling_list; otherwise, when that sibling
+ * is later deallocated, it will try to remove itself from this
+ * sibling_list, which may well have been deallocated already,
+ * resulting in a use-after-free.
*/
list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
if (list)
list_move_tail(&sibling->group_entry, list);
+ else
+ list_del_init(&sibling->group_entry);
sibling->group_leader = sibling;
/* Inherit group flags from the previous leader */
@@ -8331,6 +8343,9 @@
if (flags & ~PERF_FLAG_ALL)
return -EINVAL;
+ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
+ return -EACCES;
+
err = perf_copy_attr(attr_uptr, &attr);
if (err)
return err;
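
Illustrative example (not part of the patch): with CONFIG_SECURITY_PERF_EVENTS_RESTRICT the default paranoid level becomes 3, and the new check in the perf_event_open() path rejects unprivileged callers with -EACCES before the attribute is even copied from user space. A standard perf_event_open() probe such as the sketch below (modelled on the usual instruction-counting example) shows the effect: on such a kernel it fails with EACCES unless the caller has CAP_SYS_ADMIN or perf_event_paranoid is lowered via /proc/sys/kernel/perf_event_paranoid.

/* Minimal perf_event_open() probe; fails with EACCES when
 * kernel.perf_event_paranoid >= 3 and the caller lacks CAP_SYS_ADMIN. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static long perf_event_open(struct perf_event_attr *attr, pid_t pid,
                            int cpu, int group_fd, unsigned long flags)
{
    return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags);
}

int main(void)
{
    struct perf_event_attr attr;
    long long count;
    int fd;

    memset(&attr, 0, sizeof(attr));
    attr.type = PERF_TYPE_HARDWARE;
    attr.size = sizeof(attr);
    attr.config = PERF_COUNT_HW_INSTRUCTIONS;
    attr.disabled = 1;
    attr.exclude_kernel = 1;
    attr.exclude_hv = 1;

    fd = perf_event_open(&attr, 0 /* self */, -1 /* any cpu */, -1, 0);
    if (fd < 0) {
        perror("perf_event_open");      /* EACCES when restricted */
        return 1;
    }

    ioctl(fd, PERF_EVENT_IOC_RESET, 0);
    ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    printf("measuring some work...\n");
    ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

    if (read(fd, &count, sizeof(count)) == sizeof(count))
        printf("instructions: %lld\n", count);
    close(fd);
    return 0;
}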
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
old mode 100644
new mode 100755
diff --git a/kernel/events/internal.h b/kernel/events/internal.h
old mode 100644
new mode 100755
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
old mode 100644
new mode 100755
diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c
old mode 100644
new mode 100755
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
old mode 100644
new mode 100755
diff --git a/kernel/exit.c b/kernel/exit.c
old mode 100644
new mode 100755
index 03f6722..42a4fa9
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -53,12 +53,19 @@
#include <linux/oom.h>
#include <linux/writeback.h>
#include <linux/shm.h>
+#include <linux/kcov.h>
+
+#include "sched/tune.h"
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
+#ifdef CONFIG_SECURITY_DEFEX
+#include <linux/defex.h>
+#endif
+
static void exit_mm(struct task_struct *tsk);
static void __unhash_process(struct task_struct *p, bool group_dead)
@@ -664,7 +671,12 @@
int group_dead;
TASKS_RCU(int tasks_rcu_i);
+#ifdef CONFIG_SECURITY_DEFEX
+ task_defex_zero_creds(current);
+#endif
+
profile_task_exit(tsk);
+ kcov_task_exit(tsk);
WARN_ON(blk_needs_flush_plug(tsk));
@@ -707,6 +719,9 @@
}
exit_signals(tsk); /* sets PF_EXITING */
+
+ schedtune_exit_task(tsk);
+
/*
* tsk->flags are checked in the futex code to protect against
* an exiting task cleaning up the robust pi futexes.
@@ -754,7 +769,7 @@
disassociate_ctty(1);
exit_task_namespaces(tsk);
exit_task_work(tsk);
- exit_thread();
+ exit_thread(tsk);
/*
* Flush inherited counters to the parent - before the parent
diff --git a/kernel/extable.c b/kernel/extable.c
old mode 100644
new mode 100755
diff --git a/kernel/fork.c b/kernel/fork.c
old mode 100644
new mode 100755
index e4b8191..c0bf85d
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -76,6 +76,9 @@
#include <linux/aio.h>
#include <linux/compiler.h>
#include <linux/sysctl.h>
+#include <linux/kcov.h>
+#include <linux/cpufreq_times.h>
+#include <linux/task_integrity.h>
#include <asm/pgtable.h>
#include <asm/pgalloc.h>
@@ -89,6 +92,10 @@
#define CREATE_TRACE_POINTS
#include <trace/events/task.h>
+#ifdef CONFIG_SECURITY_DEFEX
+#include <linux/defex.h>
+#endif
+
/*
* Minimum number of threads to boot the kernel
*/
@@ -148,18 +155,18 @@
}
#endif
-void __weak arch_release_thread_info(struct thread_info *ti)
+void __weak arch_release_thread_stack(unsigned long *stack)
{
}
-#ifndef CONFIG_ARCH_THREAD_INFO_ALLOCATOR
+#ifndef CONFIG_ARCH_THREAD_STACK_ALLOCATOR
/*
* Allocate pages if THREAD_SIZE is >= PAGE_SIZE, otherwise use a
* kmemcache based allocator.
*/
# if THREAD_SIZE >= PAGE_SIZE
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP,
@@ -168,30 +175,32 @@
return page ? page_address(page) : NULL;
}
-static inline void free_thread_info(struct thread_info *ti)
+static inline void free_thread_stack(unsigned long *stack)
{
- kaiser_unmap_thread_stack(ti);
- free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER);
+ struct page *page = virt_to_page(stack);
+
+ kaiser_unmap_thread_stack(stack);
+ __free_kmem_pages(page, THREAD_SIZE_ORDER);
}
# else
-static struct kmem_cache *thread_info_cache;
+static struct kmem_cache *thread_stack_cache;
-static struct thread_info *alloc_thread_info_node(struct task_struct *tsk,
+static unsigned long *alloc_thread_stack_node(struct task_struct *tsk,
int node)
{
- return kmem_cache_alloc_node(thread_info_cache, THREADINFO_GFP, node);
+ return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node);
}
-static void free_thread_info(struct thread_info *ti)
+static void free_thread_stack(unsigned long *stack)
{
- kmem_cache_free(thread_info_cache, ti);
+ kmem_cache_free(thread_stack_cache, stack);
}
-void thread_info_cache_init(void)
+void thread_stack_cache_init(void)
{
- thread_info_cache = kmem_cache_create("thread_info", THREAD_SIZE,
+ thread_stack_cache = kmem_cache_create("thread_stack", THREAD_SIZE,
THREAD_SIZE, 0, NULL);
- BUG_ON(thread_info_cache == NULL);
+ BUG_ON(thread_stack_cache == NULL);
}
# endif
#endif
@@ -214,18 +223,19 @@
/* SLAB cache for mm_struct structures (tsk->mm) */
static struct kmem_cache *mm_cachep;
-static void account_kernel_stack(struct thread_info *ti, int account)
+static void account_kernel_stack(unsigned long *stack, int account)
{
- struct zone *zone = page_zone(virt_to_page(ti));
+ struct zone *zone = page_zone(virt_to_page(stack));
mod_zone_page_state(zone, NR_KERNEL_STACK, account);
}
void free_task(struct task_struct *tsk)
{
+ cpufreq_task_times_exit(tsk);
account_kernel_stack(tsk->stack, -1);
- arch_release_thread_info(tsk->stack);
- free_thread_info(tsk->stack);
+ arch_release_thread_stack(tsk->stack);
+ free_thread_stack(tsk->stack);
rt_mutex_debug_task_free(tsk);
ftrace_graph_exit_task(tsk);
put_seccomp_filter(tsk);
@@ -336,7 +346,7 @@
static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
{
struct task_struct *tsk;
- struct thread_info *ti;
+ unsigned long *stack;
int err;
if (node == NUMA_NO_NODE)
@@ -345,19 +355,19 @@
if (!tsk)
return NULL;
- ti = alloc_thread_info_node(tsk, node);
- if (!ti)
+ stack = alloc_thread_stack_node(tsk, node);
+ if (!stack)
goto free_tsk;
err = arch_dup_task_struct(tsk, orig);
if (err)
- goto free_ti;
+ goto free_stack;
- tsk->stack = ti;
+ tsk->stack = stack;
err = kaiser_map_thread_stack(tsk->stack);
if (err)
- goto free_ti;
+ goto free_stack;
#ifdef CONFIG_SECCOMP
/*
* We must handle setting up seccomp filters once we're under
@@ -389,12 +399,14 @@
tsk->task_frag.page = NULL;
tsk->wake_q.next = NULL;
- account_kernel_stack(ti, 1);
+ account_kernel_stack(stack, 1);
+
+ kcov_task_init(tsk);
return tsk;
-free_ti:
- free_thread_info(ti);
+free_stack:
+ free_thread_stack(stack);
free_tsk:
free_task_struct(tsk);
return NULL;
@@ -699,6 +711,26 @@
}
EXPORT_SYMBOL_GPL(__mmdrop);
+static inline void __mmput(struct mm_struct *mm)
+{
+ VM_BUG_ON(atomic_read(&mm->mm_users));
+
+ uprobe_clear_state(mm);
+ exit_aio(mm);
+ ksm_exit(mm);
+ khugepaged_exit(mm); /* must run before exit_mmap */
+ exit_mmap(mm);
+ set_mm_exe_file(mm, NULL);
+ if (!list_empty(&mm->mmlist)) {
+ spin_lock(&mmlist_lock);
+ list_del(&mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ if (mm->binfmt)
+ module_put(mm->binfmt->module);
+ mmdrop(mm);
+}
+
/*
* Decrement the use count and release all resources for an mm.
*/
@@ -706,25 +738,25 @@
{
might_sleep();
- if (atomic_dec_and_test(&mm->mm_users)) {
- uprobe_clear_state(mm);
- exit_aio(mm);
- ksm_exit(mm);
- khugepaged_exit(mm); /* must run before exit_mmap */
- exit_mmap(mm);
- set_mm_exe_file(mm, NULL);
- if (!list_empty(&mm->mmlist)) {
- spin_lock(&mmlist_lock);
- list_del(&mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- if (mm->binfmt)
- module_put(mm->binfmt->module);
- mmdrop(mm);
- }
+ if (atomic_dec_and_test(&mm->mm_users))
+ __mmput(mm);
}
EXPORT_SYMBOL_GPL(mmput);
+static void mmput_async_fn(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work);
+ __mmput(mm);
+}
+
+void mmput_async(struct mm_struct *mm)
+{
+ if (atomic_dec_and_test(&mm->mm_users)) {
+ INIT_WORK(&mm->async_put_work, mmput_async_fn);
+ schedule_work(&mm->async_put_work);
+ }
+}
+
/**
* set_mm_exe_file - change a reference to the mm's executable file
*
@@ -1265,6 +1297,69 @@
task->pids[type].pid = pid;
}
+#ifdef CONFIG_FIVE
+static int dup_task_integrity(unsigned long clone_flags,
+ struct task_struct *tsk)
+{
+ int ret = 0;
+
+ if (clone_flags & CLONE_VM) {
+ task_integrity_get(current->integrity);
+ tsk->integrity = current->integrity;
+ } else {
+ tsk->integrity = task_integrity_alloc();
+
+ if (!tsk->integrity)
+ ret = -ENOMEM;
+ }
+
+ return ret;
+}
+
+static inline void task_integrity_cleanup(struct task_struct *tsk)
+{
+ task_integrity_put(tsk->integrity);
+}
+
+static inline int task_integrity_apply(unsigned long clone_flags,
+ struct task_struct *tsk)
+{
+ int ret = 0;
+
+ if (!(clone_flags & CLONE_VM))
+ ret = five_fork(current, tsk);
+
+ return ret;
+}
+#else
+static inline int dup_task_integrity(unsigned long clone_flags,
+ struct task_struct *tsk)
+{
+ return 0;
+}
+
+static inline void task_integrity_cleanup(struct task_struct *tsk)
+{
+}
+
+static inline int task_integrity_apply(unsigned long clone_flags,
+ struct task_struct *tsk)
+{
+ return 0;
+}
+
+#endif
+
+#ifdef CONFIG_RKP_KDP
+void rkp_assign_pgd(struct task_struct *p)
+{
+ u64 pgd;
+ pgd = (u64)(p->mm ? p->mm->pgd :swapper_pg_dir);
+
+ uh_call(UH_APP_RKP, RKP_KDP_X43, (u64)p->cred, (u64)pgd, 0, 0);
+}
+#endif /*CONFIG_RKP_KDP*/
+
/*
* This creates a new process as a copy of the old one,
* but does not actually start it yet.
@@ -1337,6 +1432,8 @@
if (!p)
goto fork_out;
+ cpufreq_task_times_init(p);
+
/*
* This _must_ happen before we call free_task(), i.e. before we jump
* to any of the bad_fork_* labels. This is to avoid freeing
@@ -1393,6 +1490,7 @@
p->utime = p->stime = p->gtime = 0;
p->utimescaled = p->stimescaled = 0;
prev_cputime_init(&p->prev_cputime);
+ p->cpu_power = 0;
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
seqlock_init(&p->vtime_seqlock);
@@ -1570,6 +1668,10 @@
if (retval)
goto bad_fork_free_pid;
+ retval = dup_task_integrity(clone_flags, p);
+ if (retval)
+ goto bad_fork_free_pid;
+
/*
* From this point on we must avoid any synchronous user-space
* communication until we take the tasklist-lock. In particular, we do
@@ -1622,6 +1724,10 @@
goto bad_fork_cancel_cgroup;
}
+ retval = task_integrity_apply(clone_flags, p);
+ if (retval)
+ goto bad_fork_cancel_cgroup;
+
if (likely(p->pid)) {
ptrace_init_task(p, (clone_flags & CLONE_PTRACE) || trace);
@@ -1667,13 +1773,17 @@
trace_task_newtask(p, clone_flags);
uprobe_copy_process(p, clone_flags);
-
+#ifdef CONFIG_RKP_KDP
+ if(rkp_cred_enable)
+ rkp_assign_pgd(p);
+#endif/*CONFIG_RKP_KDP*/
return p;
bad_fork_cancel_cgroup:
spin_unlock(&current->sighand->siglock);
write_unlock_irq(&tasklist_lock);
cgroup_cancel_fork(p, cgrp_ss_priv);
+ task_integrity_cleanup(p);
bad_fork_free_pid:
threadgroup_change_end(current);
if (pid != &init_struct_pid)
@@ -1784,11 +1894,17 @@
struct completion vfork;
struct pid *pid;
+ cpufreq_task_times_alloc(p);
+
trace_sched_process_fork(current, p);
pid = get_task_pid(p, PIDTYPE_PID);
nr = pid_vnr(pid);
+#ifdef CONFIG_SECURITY_DEFEX
+ task_defex_zero_creds(p);
+#endif
+
if (clone_flags & CLONE_PARENT_SETTID)
put_user(nr, parent_tidptr);
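
Illustrative sketch (not part of the patch): the fork.c changes above split the mm teardown into __mmput() and add mmput_async(), which drops the final mm_users reference by queuing __mmput() onto a workqueue instead of running it inline, so callers that must not block can still release their reference. A generic user-space analogue of that deferred-final-put pattern, with hypothetical names, is sketched below (build with -pthread):

/* Sketch of the mmput()/mmput_async() split: the last reference holder
 * either tears the object down inline, or hands the teardown to a worker
 * when it cannot afford to block. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

struct object {
    atomic_int users;
    /* ... expensive-to-release resources ... */
};

static void teardown(struct object *obj)        /* may block */
{
    usleep(1000);                               /* stands in for exit_mmap() etc. */
    printf("object %p released\n", (void *)obj);
    free(obj);
}

static void *teardown_worker(void *arg)
{
    teardown(arg);
    return NULL;
}

static void obj_put(struct object *obj)         /* like mmput() */
{
    if (atomic_fetch_sub(&obj->users, 1) == 1)
        teardown(obj);
}

static void obj_put_async(struct object *obj)   /* like mmput_async() */
{
    if (atomic_fetch_sub(&obj->users, 1) == 1) {
        pthread_t t;

        pthread_create(&t, NULL, teardown_worker, obj);
        pthread_detach(t);                      /* caller returns immediately */
    }
}

int main(void)
{
    struct object *obj = malloc(sizeof(*obj));

    atomic_init(&obj->users, 2);
    obj_put(obj);        /* not the last reference: nothing happens */
    obj_put_async(obj);  /* last reference: teardown runs elsewhere */
    sleep(1);            /* give the worker time to finish          */
    return 0;
}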
diff --git a/kernel/freezer.c b/kernel/freezer.c
old mode 100644
new mode 100755
diff --git a/kernel/futex.c b/kernel/futex.c
old mode 100644
new mode 100755
index 0c92c8d..f54ba5e
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -666,14 +666,13 @@
* this reference was taken by ihold under the page lock
* pinning the inode in place so i_lock was unnecessary. The
* only way for this check to fail is if the inode was
- * truncated in parallel which is almost certainly an
- * application bug. In such a case, just retry.
+ * truncated in parallel so warn for now if this happens.
*
* We are not calling into get_futex_key_refs() in file-backed
* cases, therefore a successful atomic_inc return below will
* guarantee that get_futex_key() will still imply smp_mb(); (B).
*/
- if (!atomic_inc_not_zero(&inode->i_count)) {
+ if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
rcu_read_unlock();
put_page(page_head);
diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c
old mode 100644
new mode 100755
diff --git a/kernel/gcov/Kconfig b/kernel/gcov/Kconfig
old mode 100644
new mode 100755
diff --git a/kernel/gcov/Makefile b/kernel/gcov/Makefile
old mode 100644
new mode 100755
diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c
old mode 100644
new mode 100755
diff --git a/kernel/gcov/fs.c b/kernel/gcov/fs.c
old mode 100644
new mode 100755
diff --git a/kernel/gcov/gcc_3_4.c b/kernel/gcov/gcc_3_4.c
old mode 100644
new mode 100755
diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c
old mode 100644
new mode 100755
diff --git a/kernel/gcov/gcov.h b/kernel/gcov/gcov.h
old mode 100644
new mode 100755
diff --git a/kernel/groups.c b/kernel/groups.c
old mode 100644
new mode 100755
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
old mode 100644
new mode 100755
index cc05b97..bc6987b
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -108,7 +108,7 @@
* Ok, the task did not get scheduled for more than 2 minutes,
* complain:
*/
- pr_err("INFO: task %s:%d blocked for more than %ld seconds.\n",
+ pr_auto(ASL1, "INFO: task %s:%d blocked for more than %ld seconds.\n",
t->comm, t->pid, timeout);
pr_err(" %s %s %.*s\n",
print_tainted(), init_utsname()->release,
diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig
old mode 100644
new mode 100755
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
old mode 100644
new mode 100755
diff --git a/kernel/irq/autoprobe.c b/kernel/irq/autoprobe.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/cpuhotplug.c b/kernel/irq/cpuhotplug.c
old mode 100644
new mode 100755
index 011f8c4..768b71b
--- a/kernel/irq/cpuhotplug.c
+++ b/kernel/irq/cpuhotplug.c
@@ -32,6 +32,8 @@
if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
affinity = cpu_online_mask;
ret = true;
+ } else if (unlikely(d->common->state_use_accessors & IRQD_GIC_MULTI_TARGET)) {
+ return false;
}
c = irq_data_get_irq_chip(d);
diff --git a/kernel/irq/debug.h b/kernel/irq/debug.h
old mode 100644
new mode 100755
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/dummychip.c b/kernel/irq/dummychip.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/generic-chip.c b/kernel/irq/generic-chip.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
old mode 100644
new mode 100755
index 57bff78..f5c249b
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -143,7 +143,9 @@
irqreturn_t res;
trace_irq_handler_entry(irq, action);
+ exynos_ss_irq(irq, (void *)action->handler, (int)irqs_disabled(), ESS_FLAG_IN);
res = action->handler(irq, action->dev_id);
+ exynos_ss_irq(irq, (void *)action->handler, (int)irqs_disabled(), ESS_FLAG_OUT);
trace_irq_handler_exit(irq, action, res);
if (WARN_ONCE(!irqs_disabled(),"irq %u handler %pF enabled interrupts\n",
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
old mode 100644
new mode 100755
diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c
old mode 100644
new mode 100755
index 239e2ae..67f8000
--- a/kernel/irq/irqdesc.c
+++ b/kernel/irq/irqdesc.c
@@ -15,6 +15,10 @@
#include <linux/radix-tree.h>
#include <linux/bitmap.h>
#include <linux/irqdomain.h>
+#include <linux/exynos-ss.h>
+#ifdef CONFIG_SEC_DUMP_SUMMARY
+#include <linux/sec_debug.h>
+#endif
#include "internals.h"
@@ -23,11 +27,18 @@
*/
static struct lock_class_key irq_desc_lock_class;
+#ifdef CONFIG_SCHED_HMP
+extern struct cpumask hmp_slow_cpu_mask;
+#endif
#if defined(CONFIG_SMP)
static void __init init_irq_default_affinity(void)
{
alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT);
+#ifdef CONFIG_SCHED_HMP
+ cpumask_copy(irq_default_affinity, &hmp_slow_cpu_mask);
+#else
cpumask_setall(irq_default_affinity);
+#endif
}
#else
static void __init init_irq_default_affinity(void)
@@ -347,6 +358,16 @@
if (!desc)
return -EINVAL;
+
+#ifdef CONFIG_SEC_DUMP_SUMMARY
+ if (desc->action)
+ sec_debug_irq_sched_log(irq, (void *)desc->action->handler,
+ irqs_disabled());
+ else
+ sec_debug_irq_sched_log(irq, (void *)desc->handle_irq,
+ irqs_disabled());
+#endif
+
generic_handle_irq_desc(desc);
return 0;
}
@@ -366,9 +387,11 @@
bool lookup, struct pt_regs *regs)
{
struct pt_regs *old_regs = set_irq_regs(regs);
+ unsigned long long start_time;
unsigned int irq = hwirq;
int ret = 0;
+ exynos_ss_irq_exit_var(start_time);
irq_enter();
#ifdef CONFIG_IRQ_DOMAIN
@@ -388,6 +411,7 @@
}
irq_exit();
+ exynos_ss_irq_exit(irq, start_time);
set_irq_regs(old_regs);
return ret;
}
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
old mode 100644
new mode 100755
index 83cea91..2941691
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -1328,6 +1328,9 @@
irqd_set(&desc->irq_data, IRQD_NO_BALANCING);
}
+ if (new->flags & IRQF_GIC_MULTI_TARGET)
+ irqd_set(&desc->irq_data, IRQD_GIC_MULTI_TARGET);
+
/* Set default affinity mask once everything is setup */
setup_affinity(desc, mask);
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/resend.c b/kernel/irq/resend.c
old mode 100644
new mode 100755
diff --git a/kernel/irq/settings.h b/kernel/irq/settings.h
old mode 100644
new mode 100755
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
old mode 100644
new mode 100755
diff --git a/kernel/irq_work.c b/kernel/irq_work.c
old mode 100644
new mode 100755
diff --git a/kernel/jump_label.c b/kernel/jump_label.c
old mode 100644
new mode 100755
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
old mode 100644
new mode 100755
index 5c5987f..de57b96
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -24,9 +24,16 @@
#include <linux/ctype.h>
#include <linux/slab.h>
#include <linux/compiler.h>
+#ifdef CONFIG_SEC_DUMP_SUMMARY
+#include <linux/sec_debug.h>
+#endif
#include <asm/sections.h>
+#ifdef CONFIG_SEC_DEBUG
+#include <linux/sec_debug.h>
+#endif
+
#ifdef CONFIG_KALLSYMS_ALL
#define all_var 1
#else
@@ -52,6 +59,30 @@
extern const unsigned long kallsyms_markers[] __weak;
+#ifdef CONFIG_SEC_DUMP_SUMMARY
+void sec_debug_summary_set_kallsyms_info(struct sec_debug_summary *summary_info)
+{
+ summary_info->ksyms.addresses_pa = __pa(kallsyms_addresses);
+ summary_info->ksyms.relative_base = 0x0;
+ summary_info->ksyms.offsets_pa = 0x0;
+
+ summary_info->ksyms.names_pa = __pa(kallsyms_names);
+ summary_info->ksyms.num_syms = kallsyms_num_syms;
+ summary_info->ksyms.token_table_pa = __pa(kallsyms_token_table);
+ summary_info->ksyms.token_index_pa = __pa(kallsyms_token_index);
+ summary_info->ksyms.markers_pa = __pa(kallsyms_markers);
+
+ summary_info->ksyms.sect.sinittext = (uint64_t)_sinittext;
+ summary_info->ksyms.sect.einittext = (uint64_t)_einittext;
+ summary_info->ksyms.sect.stext = (uint64_t)_stext;
+ summary_info->ksyms.sect.etext = (uint64_t)_etext;
+ summary_info->ksyms.sect.end = (uint64_t)_end;
+
+ summary_info->ksyms.kallsyms_all = all_var;
+ summary_info->ksyms.magic = SEC_DEBUG_SUMMARY_MAGIC1;
+}
+#endif
+
static inline int is_kernel_inittext(unsigned long addr)
{
if (addr >= (unsigned long)_sinittext
@@ -62,7 +93,7 @@
static inline int is_kernel_text(unsigned long addr)
{
- if ((addr >= (unsigned long)_stext && addr <= (unsigned long)_etext) ||
+ if ((addr >= (unsigned long)_text && addr <= (unsigned long)_etext) ||
arch_is_kernel_text(addr))
return 1;
return in_gate_area_no_mm(addr);
@@ -70,7 +101,7 @@
static inline int is_kernel(unsigned long addr)
{
- if (addr >= (unsigned long)_stext && addr <= (unsigned long)_end)
+ if (addr >= (unsigned long)_text && addr <= (unsigned long)_end)
return 1;
return in_gate_area_no_mm(addr);
}
diff --git a/kernel/kaslr.c b/kernel/kaslr.c
new file mode 100755
index 0000000..747b688
--- /dev/null
+++ b/kernel/kaslr.c
@@ -0,0 +1,57 @@
+/*
+ * Procedures for exposing kaslr debugging information.
+ *
+ * Jia Ma, Samsung Research America, Sep 2015
+ * Copyright (C) 2015 Jia Ma.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#if defined(CONFIG_DEBUG_FS)
+
+
+static int kaslr_debug_show(struct seq_file *m, void *private)
+{
+ extern u64 *__boot_kernel_offset;
+ u64 *kernel_addr = (u64 *)&__boot_kernel_offset;
+
+ seq_printf(m, "0x%016llx..0x%016llx..\n",
+ kernel_addr[0] + kernel_addr[1],
+ kernel_addr[1] + kernel_addr[0] - kernel_addr[2]);
+
+ return 0;
+}
+
+
+static int kaslr_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, kaslr_debug_show, inode->i_private);
+}
+
+static const struct file_operations kaslr_debug_fops = {
+ .open = kaslr_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init kaslr_init_debugfs(void)
+{
+ struct dentry *root = debugfs_create_dir("kaslr", NULL);
+ if (!root)
+ return -ENXIO;
+ debugfs_create_file("kaslr", S_IRUSR, root, NULL, &kaslr_debug_fops);
+
+ return 0;
+}
+
+
+__initcall(kaslr_init_debugfs);
+
+#endif /*CONFIG_DEBUG_FS*/
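
Illustrative example (not part of the patch): kernel/kaslr.c only registers a read-only debugfs file, so the randomized kernel placement can be read back from user space. Assuming debugfs is mounted at the usual /sys/kernel/debug and the caller is root (the file is created with S_IRUSR), a minimal reader looks like this:

/* Read the kaslr debugfs file added by kernel/kaslr.c (path assumes
 * debugfs is mounted at /sys/kernel/debug; run as root). */
#include <stdio.h>

int main(void)
{
    char line[128];
    FILE *f = fopen("/sys/kernel/debug/kaslr/kaslr", "r");

    if (!f) {
        perror("fopen");
        return 1;
    }
    while (fgets(line, sizeof(line), f))
        fputs(line, stdout);    /* "0x<address>..0x<offset>.." */
    fclose(f);
    return 0;
}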
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
old mode 100644
new mode 100755
diff --git a/kernel/kcov.c b/kernel/kcov.c
new file mode 100755
index 0000000..5813e93
--- /dev/null
+++ b/kernel/kcov.c
@@ -0,0 +1,431 @@
+#define pr_fmt(fmt) "kcov: " fmt
+
+#define DISABLE_BRANCH_PROFILING
+#include <linux/atomic.h>
+#include <linux/compiler.h>
+#include <linux/errno.h>
+#include <linux/export.h>
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/preempt.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/vmalloc.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/kcov.h>
+#include <asm/setup.h>
+
+/* Number of 64-bit words written per one comparison: */
+#define KCOV_WORDS_PER_CMP 4
+
+/*
+ * kcov descriptor (one per opened debugfs file).
+ * State transitions of the descriptor:
+ * - initial state after open()
+ * - then there must be a single ioctl(KCOV_INIT_TRACE) call
+ * - then, mmap() call (several calls are allowed but not useful)
+ * - then, ioctl(KCOV_ENABLE, arg), where arg is
+ * KCOV_TRACE_PC - to trace only the PCs
+ * or
+ * KCOV_TRACE_CMP - to trace only the comparison operands
+ * - then, ioctl(KCOV_DISABLE) to disable the task.
+ * Enabling/disabling ioctls can be repeated (only one task at a time is allowed).
+ */
+struct kcov {
+ /*
+ * Reference counter. We keep one for:
+ * - opened file descriptor
+ * - task with enabled coverage (we can't unwire it from another task)
+ */
+ atomic_t refcount;
+ /* The lock protects mode, size, area and t. */
+ spinlock_t lock;
+ enum kcov_mode mode;
+ /* Size of arena (in long's for KCOV_MODE_TRACE). */
+ unsigned size;
+ /* Coverage buffer shared with user space. */
+ void *area;
+ /* Task for which we collect coverage, or NULL. */
+ struct task_struct *t;
+};
+
+static bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t)
+{
+ enum kcov_mode mode;
+
+ /*
+ * We are interested in code coverage as a function of syscall inputs,
+ * so we ignore code executed in interrupts.
+ */
+ if (!in_task())
+ return false;
+ mode = READ_ONCE(t->kcov_mode);
+ /*
+ * There is some code that runs in interrupts but for which
+ * in_interrupt() returns false (e.g. preempt_schedule_irq()).
+ * READ_ONCE()/barrier() effectively provides load-acquire wrt
+ * interrupts, there are paired barrier()/WRITE_ONCE() in
+ * kcov_ioctl_locked().
+ */
+ barrier();
+ return mode == needed_mode;
+}
+
+static unsigned long canonicalize_ip(unsigned long ip)
+{
+#ifdef CONFIG_RANDOMIZE_BASE
+ ip -= kaslr_offset();
+#endif
+ return ip;
+}
+
+/*
+ * Entry point from instrumented code.
+ * This is called once per basic-block/edge.
+ */
+void notrace __sanitizer_cov_trace_pc(void)
+{
+ struct task_struct *t;
+ unsigned long *area;
+ unsigned long ip = canonicalize_ip(_RET_IP_);
+ unsigned long pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_PC, t))
+ return;
+
+ area = t->kcov_area;
+ /* The first 64-bit word is the number of subsequent PCs. */
+ pos = READ_ONCE(area[0]) + 1;
+ if (likely(pos < t->kcov_size)) {
+ area[pos] = ip;
+ WRITE_ONCE(area[0], pos);
+ }
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_pc);
+
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+static void write_comp_data(u64 type, u64 arg1, u64 arg2, u64 ip)
+{
+ struct task_struct *t;
+ u64 *area;
+ u64 count, start_index, end_pos, max_pos;
+
+ t = current;
+ if (!check_kcov_mode(KCOV_MODE_TRACE_CMP, t))
+ return;
+
+ ip = canonicalize_ip(ip);
+
+ /*
+ * We write all comparison arguments and types as u64.
+ * The buffer was allocated for t->kcov_size unsigned longs.
+ */
+ area = (u64 *)t->kcov_area;
+ max_pos = t->kcov_size * sizeof(unsigned long);
+
+ count = READ_ONCE(area[0]);
+
+ /* Every record is KCOV_WORDS_PER_CMP 64-bit words. */
+ start_index = 1 + count * KCOV_WORDS_PER_CMP;
+ end_pos = (start_index + KCOV_WORDS_PER_CMP) * sizeof(u64);
+ if (likely(end_pos <= max_pos)) {
+ area[start_index] = type;
+ area[start_index + 1] = arg1;
+ area[start_index + 2] = arg2;
+ area[start_index + 3] = ip;
+ WRITE_ONCE(area[0], count + 1);
+ }
+}
+
+void notrace __sanitizer_cov_trace_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp1);
+
+void notrace __sanitizer_cov_trace_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp2);
+
+void notrace __sanitizer_cov_trace_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp4);
+
+void notrace __sanitizer_cov_trace_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3), arg1, arg2, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_cmp8);
+
+void notrace __sanitizer_cov_trace_const_cmp1(u8 arg1, u8 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(0) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp1);
+
+void notrace __sanitizer_cov_trace_const_cmp2(u16 arg1, u16 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(1) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp2);
+
+void notrace __sanitizer_cov_trace_const_cmp4(u32 arg1, u32 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(2) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp4);
+
+void notrace __sanitizer_cov_trace_const_cmp8(u64 arg1, u64 arg2)
+{
+ write_comp_data(KCOV_CMP_SIZE(3) | KCOV_CMP_CONST, arg1, arg2,
+ _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_const_cmp8);
+
+void notrace __sanitizer_cov_trace_switch(u64 val, u64 *cases)
+{
+ u64 i;
+ u64 count = cases[0];
+ u64 size = cases[1];
+ u64 type = KCOV_CMP_CONST;
+
+ switch (size) {
+ case 8:
+ type |= KCOV_CMP_SIZE(0);
+ break;
+ case 16:
+ type |= KCOV_CMP_SIZE(1);
+ break;
+ case 32:
+ type |= KCOV_CMP_SIZE(2);
+ break;
+ case 64:
+ type |= KCOV_CMP_SIZE(3);
+ break;
+ default:
+ return;
+ }
+ for (i = 0; i < count; i++)
+ write_comp_data(type, cases[i + 2], val, _RET_IP_);
+}
+EXPORT_SYMBOL(__sanitizer_cov_trace_switch);
+#endif /* ifdef CONFIG_KCOV_ENABLE_COMPARISONS */
+
+static void kcov_get(struct kcov *kcov)
+{
+ atomic_inc(&kcov->refcount);
+}
+
+static void kcov_put(struct kcov *kcov)
+{
+ if (atomic_dec_and_test(&kcov->refcount)) {
+ vfree(kcov->area);
+ kfree(kcov);
+ }
+}
+
+void kcov_task_init(struct task_struct *t)
+{
+ t->kcov_mode = KCOV_MODE_DISABLED;
+ t->kcov_size = 0;
+ t->kcov_area = NULL;
+ t->kcov = NULL;
+}
+
+void kcov_task_exit(struct task_struct *t)
+{
+ struct kcov *kcov;
+
+ kcov = t->kcov;
+ if (kcov == NULL)
+ return;
+ spin_lock(&kcov->lock);
+ if (WARN_ON(kcov->t != t)) {
+ spin_unlock(&kcov->lock);
+ return;
+ }
+ /* Just to not leave dangling references behind. */
+ kcov_task_init(t);
+ kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
+ spin_unlock(&kcov->lock);
+ kcov_put(kcov);
+}
+
+static int kcov_mmap(struct file *filep, struct vm_area_struct *vma)
+{
+ int res = 0;
+ void *area;
+ struct kcov *kcov = vma->vm_file->private_data;
+ unsigned long size, off;
+ struct page *page;
+
+ area = vmalloc_user(vma->vm_end - vma->vm_start);
+ if (!area)
+ return -ENOMEM;
+
+ spin_lock(&kcov->lock);
+ size = kcov->size * sizeof(unsigned long);
+ if (kcov->mode != KCOV_MODE_INIT || vma->vm_pgoff != 0 ||
+ vma->vm_end - vma->vm_start != size) {
+ res = -EINVAL;
+ goto exit;
+ }
+ if (!kcov->area) {
+ kcov->area = area;
+ vma->vm_flags |= VM_DONTEXPAND;
+ spin_unlock(&kcov->lock);
+ for (off = 0; off < size; off += PAGE_SIZE) {
+ page = vmalloc_to_page(kcov->area + off);
+ if (vm_insert_page(vma, vma->vm_start + off, page))
+ WARN_ONCE(1, "vm_insert_page() failed");
+ }
+ return 0;
+ }
+exit:
+ spin_unlock(&kcov->lock);
+ vfree(area);
+ return res;
+}
+
+static int kcov_open(struct inode *inode, struct file *filep)
+{
+ struct kcov *kcov;
+
+ kcov = kzalloc(sizeof(*kcov), GFP_KERNEL);
+ if (!kcov)
+ return -ENOMEM;
+ kcov->mode = KCOV_MODE_DISABLED;
+ atomic_set(&kcov->refcount, 1);
+ spin_lock_init(&kcov->lock);
+ filep->private_data = kcov;
+ return nonseekable_open(inode, filep);
+}
+
+static int kcov_close(struct inode *inode, struct file *filep)
+{
+ kcov_put(filep->private_data);
+ return 0;
+}
+
+static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd,
+ unsigned long arg)
+{
+ struct task_struct *t;
+ unsigned long size, unused;
+
+ switch (cmd) {
+ case KCOV_INIT_TRACE:
+ /*
+ * Enable kcov in trace mode and setup buffer size.
+ * Must happen before anything else.
+ */
+ if (kcov->mode != KCOV_MODE_DISABLED)
+ return -EBUSY;
+ /*
+ * Size must be at least 2 to hold current position and one PC.
+ * Later we allocate size * sizeof(unsigned long) memory,
+ * that must not overflow.
+ */
+ size = arg;
+ if (size < 2 || size > INT_MAX / sizeof(unsigned long))
+ return -EINVAL;
+ kcov->size = size;
+ kcov->mode = KCOV_MODE_INIT;
+ return 0;
+ case KCOV_ENABLE:
+ /*
+ * Enable coverage for the current task.
+ * At this point the user must have enabled trace mode,
+ * and mmapped the file. Coverage collection is disabled only
+ * at task exit or voluntary by KCOV_DISABLE. After that it can
+ * be enabled for another task.
+ */
+ if (kcov->mode != KCOV_MODE_INIT || !kcov->area)
+ return -EINVAL;
+ if (kcov->t != NULL)
+ return -EBUSY;
+ if (arg == KCOV_TRACE_PC)
+ kcov->mode = KCOV_MODE_TRACE_PC;
+ else if (arg == KCOV_TRACE_CMP)
+#ifdef CONFIG_KCOV_ENABLE_COMPARISONS
+ kcov->mode = KCOV_MODE_TRACE_CMP;
+#else
+ return -ENOTSUPP;
+#endif
+ else
+ return -EINVAL;
+ t = current;
+ /* Cache in task struct for performance. */
+ t->kcov_size = kcov->size;
+ t->kcov_area = kcov->area;
+ /* See comment in check_kcov_mode(). */
+ barrier();
+ WRITE_ONCE(t->kcov_mode, kcov->mode);
+ t->kcov = kcov;
+ kcov->t = t;
+ /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */
+ kcov_get(kcov);
+ return 0;
+ case KCOV_DISABLE:
+ /* Disable coverage for the current task. */
+ unused = arg;
+ if (unused != 0 || current->kcov != kcov)
+ return -EINVAL;
+ t = current;
+ if (WARN_ON(kcov->t != t))
+ return -EINVAL;
+ kcov_task_init(t);
+ kcov->t = NULL;
+ kcov->mode = KCOV_MODE_INIT;
+ kcov_put(kcov);
+ return 0;
+ default:
+ return -ENOTTY;
+ }
+}
+
+static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
+{
+ struct kcov *kcov;
+ int res;
+
+ kcov = filep->private_data;
+ spin_lock(&kcov->lock);
+ res = kcov_ioctl_locked(kcov, cmd, arg);
+ spin_unlock(&kcov->lock);
+ return res;
+}
+
+static const struct file_operations kcov_fops = {
+ .open = kcov_open,
+ .unlocked_ioctl = kcov_ioctl,
+ .compat_ioctl = kcov_ioctl,
+ .mmap = kcov_mmap,
+ .release = kcov_close,
+};
+
+static int __init kcov_init(void)
+{
+ if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) {
+ pr_err("failed to create kcov in debugfs\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+
+device_initcall(kcov_init);
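
Illustrative example (not part of the patch): the comment at the top of kcov.c spells out the expected call sequence -- open(), KCOV_INIT_TRACE, mmap(), KCOV_ENABLE, then KCOV_DISABLE. The user-space harness below follows that sequence to dump the PCs covered by a single syscall; it is patterned after the upstream kcov documentation, and the ioctl encodings are restated locally as assumptions because the matching <linux/kcov.h> uapi header is not part of this hunk.

/* User-space harness for the kcov device above. The ioctl encodings are
 * restated here; they normally come from <linux/kcov.h>. */
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/mman.h>

#define KCOV_INIT_TRACE _IOR('c', 1, unsigned long)
#define KCOV_ENABLE     _IO('c', 100)
#define KCOV_DISABLE    _IO('c', 101)
#define KCOV_TRACE_PC   0
#define COVER_SIZE      (64 << 10)      /* in unsigned longs */

int main(void)
{
    unsigned long *cover, n, i;
    int fd;

    /* One fd collects coverage for one task at a time. */
    fd = open("/sys/kernel/debug/kcov", O_RDWR);
    if (fd == -1) {
        perror("open");
        exit(1);
    }
    /* KCOV_INIT_TRACE: set trace mode and buffer size (in words). */
    if (ioctl(fd, KCOV_INIT_TRACE, COVER_SIZE)) {
        perror("KCOV_INIT_TRACE");
        exit(1);
    }
    /* mmap() the buffer shared between kernel and user space. */
    cover = mmap(NULL, COVER_SIZE * sizeof(unsigned long),
                 PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
    if (cover == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }
    /* KCOV_ENABLE with KCOV_TRACE_PC: start collecting PCs for this task. */
    if (ioctl(fd, KCOV_ENABLE, KCOV_TRACE_PC)) {
        perror("KCOV_ENABLE");
        exit(1);
    }
    __atomic_store_n(&cover[0], 0, __ATOMIC_RELAXED);

    read(-1, NULL, 0);  /* the traced syscall; expected to fail */

    n = __atomic_load_n(&cover[0], __ATOMIC_RELAXED);
    for (i = 0; i < n; i++)
        printf("0x%lx\n", cover[i + 1]);

    if (ioctl(fd, KCOV_DISABLE, 0))
        perror("KCOV_DISABLE");
    munmap(cover, COVER_SIZE * sizeof(unsigned long));
    close(fd);
    return 0;
}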
diff --git a/kernel/kexec.c b/kernel/kexec.c
old mode 100644
new mode 100755
diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c
old mode 100644
new mode 100755
diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
old mode 100644
new mode 100755
diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h
old mode 100644
new mode 100755
diff --git a/kernel/kmod.c b/kernel/kmod.c
old mode 100644
new mode 100755
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
old mode 100644
new mode 100755
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c
old mode 100644
new mode 100755
diff --git a/kernel/kthread.c b/kernel/kthread.c
old mode 100644
new mode 100755
index ac6849e..f529aaf
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -65,7 +65,7 @@
static struct kthread *to_live_kthread(struct task_struct *k)
{
struct completion *vfork = ACCESS_ONCE(k->vfork_done);
- if (likely(vfork))
+ if (likely(vfork) && try_get_task_stack(k))
return __to_kthread(vfork);
return NULL;
}
@@ -433,8 +433,10 @@
{
struct kthread *kthread = to_live_kthread(k);
- if (kthread)
+ if (kthread) {
__kthread_unpark(k, kthread);
+ put_task_stack(k);
+ }
}
EXPORT_SYMBOL_GPL(kthread_unpark);
@@ -463,6 +465,7 @@
wait_for_completion(&kthread->parked);
}
}
+ put_task_stack(k);
ret = 0;
}
return ret;
@@ -498,6 +501,7 @@
__kthread_unpark(k, kthread);
wake_up_process(k);
wait_for_completion(&kthread->exited);
+ put_task_stack(k);
}
ret = k->exit_code;
put_task_struct(k);
@@ -610,6 +614,19 @@
}
EXPORT_SYMBOL_GPL(kthread_worker_fn);
+/*
+ * Returns true when the work could not be queued at the moment.
+ * It happens when it is already pending in a worker list
+ * or when it is being cancelled.
+ */
+static inline bool queuing_blocked(struct kthread_worker *worker,
+ struct kthread_work *work)
+{
+ lockdep_assert_held(&worker->lock);
+
+ return !list_empty(&work->node) || work->canceling;
+}
+
/* insert @work before @pos in @worker */
static void insert_kthread_work(struct kthread_worker *worker,
struct kthread_work *work,
@@ -639,7 +656,7 @@
unsigned long flags;
spin_lock_irqsave(&worker->lock, flags);
- if (list_empty(&work->node)) {
+ if (!queuing_blocked(worker, work)) {
insert_kthread_work(worker, work, &worker->work_list);
ret = true;
}
@@ -700,6 +717,87 @@
}
EXPORT_SYMBOL_GPL(flush_kthread_work);
+/*
+ * This function removes the work from the worker queue. Also it makes sure
+ * that it won't get queued later via the delayed work's timer.
+ *
+ * The work might still be in use when this function finishes. See the
+ * current_work processed by the worker.
+ *
+ * Return: %true if @work was pending and successfully canceled,
+ * %false if @work was not pending
+ */
+static bool __kthread_cancel_work(struct kthread_work *work,
+ unsigned long *flags)
+{
+ /*
+ * Try to remove the work from a worker list. It might either
+ * be from worker->work_list or from worker->delayed_work_list.
+ */
+ if (!list_empty(&work->node)) {
+ list_del_init(&work->node);
+ return true;
+ }
+
+ return false;
+}
+
+static bool __kthread_cancel_work_sync(struct kthread_work *work)
+{
+ struct kthread_worker *worker = work->worker;
+ unsigned long flags;
+ int ret = false;
+
+ if (!worker)
+ goto out;
+
+ spin_lock_irqsave(&worker->lock, flags);
+ /* Work must not be used with >1 worker, see kthread_queue_work(). */
+ WARN_ON_ONCE(work->worker != worker);
+
+ ret = __kthread_cancel_work(work, &flags);
+
+ if (worker->current_work != work)
+ goto out_fast;
+
+ /*
+ * The work is in progress and we need to wait with the lock released.
+ * In the meantime, block any queuing by setting the canceling counter.
+ */
+ work->canceling++;
+ spin_unlock_irqrestore(&worker->lock, flags);
+ flush_kthread_work(work);
+ spin_lock_irqsave(&worker->lock, flags);
+ work->canceling--;
+
+out_fast:
+ spin_unlock_irqrestore(&worker->lock, flags);
+out:
+ return ret;
+}
+
+/**
+ * kthread_cancel_work_sync - cancel a kthread work and wait for it to finish
+ * @work: the kthread work to cancel
+ *
+ * Cancel @work and wait for its execution to finish. This function
+ * can be used even if the work re-queues itself. On return from this
+ * function, @work is guaranteed to be not pending or executing on any CPU.
+ *
+ * kthread_cancel_work_sync(&delayed_work->work) must not be used for
+ * delayed_work's. Use kthread_cancel_delayed_work_sync() instead.
+ *
+ * The caller must ensure that the worker on which @work was last
+ * queued can't be destroyed before this function returns.
+ *
+ * Return: %true if @work was pending, %false otherwise.
+ */
+bool kthread_cancel_work_sync(struct kthread_work *work)
+{
+ return __kthread_cancel_work_sync(work);
+}
+EXPORT_SYMBOL_GPL(kthread_cancel_work_sync);
+
/**
* flush_kthread_worker - flush all current works on a kthread_worker
* @worker: worker to flush
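
Illustrative sketch (not part of the patch): kthread_cancel_work_sync(), added above, is meant to be called during teardown so that a work item can neither still be running nor re-queue itself once its owner goes away. The module-style sketch below shows a typical pairing with the existing worker API; the helper names (init_kthread_worker, init_kthread_work, queue_kthread_work) are assumed to be this tree's pre-4.9 spellings.

/* Illustrative kernel-module sketch; helper names assume this tree's
 * pre-4.9 kthread-worker API. */
#include <linux/err.h>
#include <linux/kthread.h>
#include <linux/module.h>
#include <linux/printk.h>
#include <linux/sched.h>

static struct kthread_worker demo_worker;
static struct kthread_work demo_work;
static struct task_struct *demo_task;

static void demo_work_fn(struct kthread_work *work)
{
    pr_info("demo work ran in %s\n", current->comm);
}

static int __init demo_init(void)
{
    init_kthread_worker(&demo_worker);
    demo_task = kthread_run(kthread_worker_fn, &demo_worker, "demo_worker");
    if (IS_ERR(demo_task))
        return PTR_ERR(demo_task);

    init_kthread_work(&demo_work, demo_work_fn);
    queue_kthread_work(&demo_worker, &demo_work);
    return 0;
}

static void __exit demo_exit(void)
{
    /* Make sure the work is neither pending nor running... */
    kthread_cancel_work_sync(&demo_work);
    /* ...before the worker thread itself is stopped. */
    kthread_stop(demo_task);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");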
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
old mode 100644
new mode 100755
diff --git a/kernel/livepatch/Kconfig b/kernel/livepatch/Kconfig
old mode 100644
new mode 100755
diff --git a/kernel/livepatch/Makefile b/kernel/livepatch/Makefile
old mode 100644
new mode 100755
diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile
old mode 100644
new mode 100755
index 8e96f6c..31322a4
--- a/kernel/locking/Makefile
+++ b/kernel/locking/Makefile
@@ -1,3 +1,6 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o
diff --git a/kernel/locking/lglock.c b/kernel/locking/lglock.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/lockdep_internals.h b/kernel/locking/lockdep_internals.h
old mode 100644
new mode 100755
diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/lockdep_states.h b/kernel/locking/lockdep_states.h
old mode 100644
new mode 100755
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h
old mode 100644
new mode 100755
diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c
old mode 100644
new mode 100755
index 3ef3736..9c951fa
--- a/kernel/locking/mutex-debug.c
+++ b/kernel/locking/mutex-debug.c
@@ -49,21 +49,21 @@
}
void debug_mutex_add_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti)
+ struct task_struct *task)
{
SMP_DEBUG_LOCKS_WARN_ON(!spin_is_locked(&lock->wait_lock));
/* Mark the current thread as blocked on the lock: */
- ti->task->blocked_on = waiter;
+ task->blocked_on = waiter;
}
void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti)
+ struct task_struct *task)
{
DEBUG_LOCKS_WARN_ON(list_empty(&waiter->list));
- DEBUG_LOCKS_WARN_ON(waiter->task != ti->task);
- DEBUG_LOCKS_WARN_ON(ti->task->blocked_on != waiter);
- ti->task->blocked_on = NULL;
+ DEBUG_LOCKS_WARN_ON(waiter->task != task);
+ DEBUG_LOCKS_WARN_ON(task->blocked_on != waiter);
+ task->blocked_on = NULL;
list_del_init(&waiter->list);
waiter->task = NULL;
diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h
old mode 100644
new mode 100755
index 0799fd3..d06ae3b
--- a/kernel/locking/mutex-debug.h
+++ b/kernel/locking/mutex-debug.h
@@ -20,9 +20,9 @@
extern void debug_mutex_free_waiter(struct mutex_waiter *waiter);
extern void debug_mutex_add_waiter(struct mutex *lock,
struct mutex_waiter *waiter,
- struct thread_info *ti);
+ struct task_struct *task);
extern void mutex_remove_waiter(struct mutex *lock, struct mutex_waiter *waiter,
- struct thread_info *ti);
+ struct task_struct *task);
extern void debug_mutex_unlock(struct mutex *lock);
extern void debug_mutex_init(struct mutex *lock, const char *name,
struct lock_class_key *key);
diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c
old mode 100644
new mode 100755
index 79d2d76..a70b90d
--- a/kernel/locking/mutex.c
+++ b/kernel/locking/mutex.c
@@ -537,7 +537,7 @@
goto skip_wait;
debug_mutex_lock_common(lock, &waiter);
- debug_mutex_add_waiter(lock, &waiter, task_thread_info(task));
+ debug_mutex_add_waiter(lock, &waiter, task);
/* add waiting tasks to the end of the waitqueue (FIFO): */
list_add_tail(&waiter.list, &lock->wait_list);
@@ -584,7 +584,7 @@
}
__set_task_state(task, TASK_RUNNING);
- mutex_remove_waiter(lock, &waiter, current_thread_info());
+ mutex_remove_waiter(lock, &waiter, task);
/* set it to 0 if there are no waiters left: */
if (likely(list_empty(&lock->wait_list)))
atomic_set(&lock->count, 0);
@@ -605,7 +605,7 @@
return 0;
err:
- mutex_remove_waiter(lock, &waiter, task_thread_info(task));
+ mutex_remove_waiter(lock, &waiter, task);
spin_unlock_mutex(&lock->wait_lock, flags);
debug_mutex_free_waiter(&waiter);
mutex_release(&lock->dep_map, 1, ip);
diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h
old mode 100644
new mode 100755
index 5cda397..a68bae5
--- a/kernel/locking/mutex.h
+++ b/kernel/locking/mutex.h
@@ -13,7 +13,7 @@
do { spin_lock(lock); (void)(flags); } while (0)
#define spin_unlock_mutex(lock, flags) \
do { spin_unlock(lock); (void)(flags); } while (0)
-#define mutex_remove_waiter(lock, waiter, ti) \
+#define mutex_remove_waiter(lock, waiter, task) \
__list_del((waiter)->list.prev, (waiter)->list.next)
#ifdef CONFIG_MUTEX_SPIN_ON_OWNER
diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c
old mode 100644
new mode 100755
index f231e0b..ce18259
--- a/kernel/locking/percpu-rwsem.c
+++ b/kernel/locking/percpu-rwsem.c
@@ -8,151 +8,186 @@
#include <linux/sched.h>
#include <linux/errno.h>
-int __percpu_init_rwsem(struct percpu_rw_semaphore *brw,
+int __percpu_init_rwsem(struct percpu_rw_semaphore *sem,
const char *name, struct lock_class_key *rwsem_key)
{
- brw->fast_read_ctr = alloc_percpu(int);
- if (unlikely(!brw->fast_read_ctr))
+ sem->read_count = alloc_percpu(int);
+ if (unlikely(!sem->read_count))
return -ENOMEM;
/* ->rw_sem represents the whole percpu_rw_semaphore for lockdep */
- __init_rwsem(&brw->rw_sem, name, rwsem_key);
- rcu_sync_init(&brw->rss, RCU_SCHED_SYNC);
- atomic_set(&brw->slow_read_ctr, 0);
- init_waitqueue_head(&brw->write_waitq);
+ rcu_sync_init(&sem->rss, RCU_SCHED_SYNC);
+ __init_rwsem(&sem->rw_sem, name, rwsem_key);
+ init_waitqueue_head(&sem->writer);
+ sem->readers_block = 0;
return 0;
}
EXPORT_SYMBOL_GPL(__percpu_init_rwsem);
-void percpu_free_rwsem(struct percpu_rw_semaphore *brw)
+void percpu_free_rwsem(struct percpu_rw_semaphore *sem)
{
/*
* XXX: temporary kludge. The error path in alloc_super()
* assumes that percpu_free_rwsem() is safe after kzalloc().
*/
- if (!brw->fast_read_ctr)
+ if (!sem->read_count)
return;
- rcu_sync_dtor(&brw->rss);
- free_percpu(brw->fast_read_ctr);
- brw->fast_read_ctr = NULL; /* catch use after free bugs */
+ rcu_sync_dtor(&sem->rss);
+ free_percpu(sem->read_count);
+ sem->read_count = NULL; /* catch use after free bugs */
}
+EXPORT_SYMBOL_GPL(percpu_free_rwsem);
-/*
- * This is the fast-path for down_read/up_read. If it succeeds we rely
- * on the barriers provided by rcu_sync_enter/exit; see the comments in
- * percpu_down_write() and percpu_up_write().
- *
- * If this helper fails the callers rely on the normal rw_semaphore and
- * atomic_dec_and_test(), so in this case we have the necessary barriers.
- */
-static bool update_fast_ctr(struct percpu_rw_semaphore *brw, unsigned int val)
-{
- bool success;
-
- preempt_disable();
- success = rcu_sync_is_idle(&brw->rss);
- if (likely(success))
- __this_cpu_add(*brw->fast_read_ctr, val);
- preempt_enable();
-
- return success;
-}
-
-/*
- * Like the normal down_read() this is not recursive, the writer can
- * come after the first percpu_down_read() and create the deadlock.
- *
- * Note: returns with lock_is_held(brw->rw_sem) == T for lockdep,
- * percpu_up_read() does rwsem_release(). This pairs with the usage
- * of ->rw_sem in percpu_down/up_write().
- */
-void percpu_down_read(struct percpu_rw_semaphore *brw)
-{
- might_sleep();
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 0, _RET_IP_);
-
- if (likely(update_fast_ctr(brw, +1)))
- return;
-
- /* Avoid rwsem_acquire_read() and rwsem_release() */
- __down_read(&brw->rw_sem);
- atomic_inc(&brw->slow_read_ctr);
- __up_read(&brw->rw_sem);
-}
-EXPORT_SYMBOL_GPL(percpu_down_read);
-
-int percpu_down_read_trylock(struct percpu_rw_semaphore *brw)
-{
- if (unlikely(!update_fast_ctr(brw, +1))) {
- if (!__down_read_trylock(&brw->rw_sem))
- return 0;
- atomic_inc(&brw->slow_read_ctr);
- __up_read(&brw->rw_sem);
- }
-
- rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_);
- return 1;
-}
-
-void percpu_up_read(struct percpu_rw_semaphore *brw)
-{
- rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_);
-
- if (likely(update_fast_ctr(brw, -1)))
- return;
-
- /* false-positive is possible but harmless */
- if (atomic_dec_and_test(&brw->slow_read_ctr))
- wake_up_all(&brw->write_waitq);
-}
-EXPORT_SYMBOL_GPL(percpu_up_read);
-
-static int clear_fast_ctr(struct percpu_rw_semaphore *brw)
-{
- unsigned int sum = 0;
- int cpu;
-
- for_each_possible_cpu(cpu) {
- sum += per_cpu(*brw->fast_read_ctr, cpu);
- per_cpu(*brw->fast_read_ctr, cpu) = 0;
- }
-
- return sum;
-}
-
-void percpu_down_write(struct percpu_rw_semaphore *brw)
+int __percpu_down_read(struct percpu_rw_semaphore *sem, int try)
{
/*
- * Make rcu_sync_is_idle() == F and thus disable the fast-path in
- * percpu_down_read() and percpu_up_read(), and wait for gp pass.
+ * Due to having preemption disabled the decrement happens on
+ * the same CPU as the increment, avoiding the
+ * increment-on-one-CPU-and-decrement-on-another problem.
*
- * The latter synchronises us with the preceding readers which used
- * the fast-past, so we can not miss the result of __this_cpu_add()
- * or anything else inside their criticial sections.
+ * If the reader misses the writer's assignment of readers_block, then
+ * the writer is guaranteed to see the reader's increment.
+ *
+ * Conversely, any readers that increment their sem->read_count after
+ * the writer looks are guaranteed to see the readers_block value,
+ * which in turn means that they are guaranteed to immediately
+ * decrement their sem->read_count, so that it doesn't matter that the
+ * writer missed them.
*/
- rcu_sync_enter(&brw->rss);
- /* exclude other writers, and block the new readers completely */
- down_write(&brw->rw_sem);
+ smp_mb(); /* A matches D */
- /* nobody can use fast_read_ctr, move its sum into slow_read_ctr */
- atomic_add(clear_fast_ctr(brw), &brw->slow_read_ctr);
+ /*
+ * If !readers_block the critical section starts here, matched by the
+ * release in percpu_up_write().
+ */
+ if (likely(!smp_load_acquire(&sem->readers_block)))
+ return 1;
- /* wait for all readers to complete their percpu_up_read() */
- wait_event(brw->write_waitq, !atomic_read(&brw->slow_read_ctr));
+ /*
+ * Per the above comment; we still have preemption disabled and
+ * will thus decrement on the same CPU as we incremented.
+ */
+ __percpu_up_read(sem);
+
+ if (try)
+ return 0;
+
+ /*
+ * We either call schedule() in the wait, or we'll fall through
+ * and reschedule on the preempt_enable() in percpu_down_read().
+ */
+ preempt_enable_no_resched();
+
+ /*
+ * Avoid lockdep for the down/up_read(); we already have them.
+ */
+ __down_read(&sem->rw_sem);
+ this_cpu_inc(*sem->read_count);
+ __up_read(&sem->rw_sem);
+
+ preempt_disable();
+ return 1;
+}
+EXPORT_SYMBOL_GPL(__percpu_down_read);
+
+void __percpu_up_read(struct percpu_rw_semaphore *sem)
+{
+ smp_mb(); /* B matches C */
+ /*
+ * In other words, if they see our decrement (presumably to aggregate
+ * zero, as that is the only time it matters) they will also see our
+ * critical section.
+ */
+ __this_cpu_dec(*sem->read_count);
+
+ /* Prod writer to recheck readers_active */
+ wake_up(&sem->writer);
+}
+EXPORT_SYMBOL_GPL(__percpu_up_read);
+
+#define per_cpu_sum(var) \
+({ \
+ typeof(var) __sum = 0; \
+ int cpu; \
+ compiletime_assert_atomic_type(__sum); \
+ for_each_possible_cpu(cpu) \
+ __sum += per_cpu(var, cpu); \
+ __sum; \
+})
+
+/*
+ * Return true if the modular sum of the sem->read_count per-CPU variable is
+ * zero. If this sum is zero, then it is stable due to the fact that if any
+ * newly arriving readers increment a given counter, they will immediately
+ * decrement that same counter.
+ */
+static bool readers_active_check(struct percpu_rw_semaphore *sem)
+{
+ if (per_cpu_sum(*sem->read_count) != 0)
+ return false;
+
+ /*
+ * If we observed the decrement; ensure we see the entire critical
+ * section.
+ */
+
+ smp_mb(); /* C matches B */
+
+ return true;
+}
+
+void percpu_down_write(struct percpu_rw_semaphore *sem)
+{
+ /* Notify readers to take the slow path. */
+ rcu_sync_enter(&sem->rss);
+
+ down_write(&sem->rw_sem);
+
+ /*
+ * Notify new readers to block; up until now, and thus throughout the
+ * longish rcu_sync_enter() above, new readers could still come in.
+ */
+ WRITE_ONCE(sem->readers_block, 1);
+
+ smp_mb(); /* D matches A */
+
+ /*
+ * If they don't see our writer of readers_block, then we are
+ * guaranteed to see their sem->read_count increment, and therefore
+ * will wait for them.
+ */
+
+ /* Wait for all now active readers to complete. */
+ wait_event(sem->writer, readers_active_check(sem));
}
EXPORT_SYMBOL_GPL(percpu_down_write);
-void percpu_up_write(struct percpu_rw_semaphore *brw)
+void percpu_up_write(struct percpu_rw_semaphore *sem)
{
- /* release the lock, but the readers can't use the fast-path */
- up_write(&brw->rw_sem);
/*
- * Enable the fast-path in percpu_down_read() and percpu_up_read()
- * but only after another gp pass; this adds the necessary barrier
- * to ensure the reader can't miss the changes done by us.
+ * Signal the writer is done, no fast path yet.
+ *
+ * One reason that we cannot just immediately flip to readers_fast is
+ * that new readers might fail to see the results of this writer's
+ * critical section.
+ *
+ * Therefore we force it through the slow path which guarantees an
+ * acquire and thereby guarantees the critical section's consistency.
*/
- rcu_sync_exit(&brw->rss);
+ smp_store_release(&sem->readers_block, 0);
+
+ /*
+ * Release the write lock, this will allow readers back in the game.
+ */
+ up_write(&sem->rw_sem);
+
+ /*
+ * Once this completes (at least one RCU-sched grace period hence) the
+ * reader fast path will be available again. Safe to use outside the
+ * exclusive write lock because it's counting.
+ */
+ rcu_sync_exit(&sem->rss);
}
EXPORT_SYMBOL_GPL(percpu_up_write);
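
The rewritten percpu-rwsem keeps the caller-facing API unchanged: readers take a preempt-disabled per-CPU fast path while rcu_sync reports no writer, and the writer forces everyone onto ->rw_sem by setting readers_block and waiting for per_cpu_sum(*sem->read_count) to drain. A minimal usage sketch under those semantics (example_sem, example_mode and the init function are illustrative; the matching include/linux/percpu-rwsem.h changes from this series are assumed):

#include <linux/init.h>
#include <linux/percpu-rwsem.h>

/* Illustrative state guarded by the semaphore. */
static struct percpu_rw_semaphore example_sem;
static int example_mode;

static int __init example_init(void)
{
        /* percpu_init_rwsem() wraps __percpu_init_rwsem() with a static lock class key. */
        return percpu_init_rwsem(&example_sem);
}

static int example_read_mode(void)
{
        int mode;

        percpu_down_read(&example_sem);         /* per-CPU increment while no writer is pending */
        mode = example_mode;
        percpu_up_read(&example_sem);
        return mode;
}

static void example_set_mode(int mode)
{
        percpu_down_write(&example_sem);        /* sets readers_block, waits for read_count to drain */
        example_mode = mode;
        percpu_up_write(&example_sem);          /* releases readers_block, then rcu_sync_exit() */
}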
diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
old mode 100644
new mode 100755
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
old mode 100644
new mode 100755
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
old mode 100644
new mode 100755
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
old mode 100644
new mode 100755
diff --git a/kernel/locking/rwsem-spinlock.c b/kernel/locking/rwsem-spinlock.c
old mode 100644
new mode 100755
index 3a50485..1591f6b
--- a/kernel/locking/rwsem-spinlock.c
+++ b/kernel/locking/rwsem-spinlock.c
@@ -191,11 +191,12 @@
/*
* get a write lock on the semaphore
*/
-void __sched __down_write_nested(struct rw_semaphore *sem, int subclass)
+int __sched __down_write_common(struct rw_semaphore *sem, int state)
{
struct rwsem_waiter waiter;
struct task_struct *tsk;
unsigned long flags;
+ int ret = 0;
raw_spin_lock_irqsave(&sem->wait_lock, flags);
@@ -215,21 +216,33 @@
*/
if (sem->count == 0)
break;
- set_task_state(tsk, TASK_UNINTERRUPTIBLE);
+ if (signal_pending_state(state, current)) {
+ ret = -EINTR;
+ goto out;
+ }
+ set_task_state(tsk, state);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
schedule();
raw_spin_lock_irqsave(&sem->wait_lock, flags);
}
/* got the lock */
sem->count = -1;
+out:
list_del(&waiter.list);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+
+ return ret;
}
void __sched __down_write(struct rw_semaphore *sem)
{
- __down_write_nested(sem, 0);
+ __down_write_common(sem, TASK_UNINTERRUPTIBLE);
+}
+
+int __sched __down_write_killable(struct rw_semaphore *sem)
+{
+ return __down_write_common(sem, TASK_KILLABLE);
}
/*
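
__down_write_common() folds the uninterruptible and killable write-lock slow paths into one loop: signal_pending_state() is re-checked before every sleep, and a fatal signal makes the waiter unlink itself and return -EINTR instead of taking the lock. A hedged sketch of how a caller consumes the killable variant through down_write_killable(), which is assumed to be wired up by the matching include/linux/rwsem.h change elsewhere in this series:

#include <linux/errno.h>
#include <linux/rwsem.h>

static DECLARE_RWSEM(example_rwsem);

static int example_update(void)
{
        /*
         * TASK_KILLABLE wait: returns 0 with the lock held, or -EINTR if a
         * fatal signal arrived while sleeping in __down_write_common().
         */
        if (down_write_killable(&example_rwsem))
                return -EINTR;

        /* ... modify data protected by example_rwsem ... */

        up_write(&example_rwsem);
        return 0;
}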
diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c
old mode 100644
new mode 100755
index 1be33ca..1d02b4bd
--- a/kernel/locking/rwsem-xadd.c
+++ b/kernel/locking/rwsem-xadd.c
@@ -80,7 +80,7 @@
debug_check_no_locks_freed((void *)sem, sizeof(*sem));
lockdep_init_map(&sem->dep_map, name, key, 0);
#endif
- sem->count = RWSEM_UNLOCKED_VALUE;
+ atomic_long_set(&sem->count, RWSEM_UNLOCKED_VALUE);
raw_spin_lock_init(&sem->wait_lock);
INIT_LIST_HEAD(&sem->wait_list);
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
@@ -114,12 +114,16 @@
* - the 'active part' of count (&0x0000ffff) reached 0 (but may have changed)
* - the 'waiting part' of count (&0xffff0000) is -ve (and will still be so)
* - there must be someone on the queue
- * - the spinlock must be held by the caller
+ * - the wait_lock must be held by the caller
+ * - tasks are marked for wakeup, the caller must later invoke wake_up_q()
+ * to actually wakeup the blocked task(s) and drop the reference count,
+ * preferably when the wait_lock is released
* - woken process blocks are discarded from the list after having task zeroed
- * - writers are only woken if downgrading is false
+ * - writers are only marked woken if downgrading is false
*/
static struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, enum rwsem_wake_type wake_type)
+__rwsem_mark_wake(struct rw_semaphore *sem,
+ enum rwsem_wake_type wake_type, struct wake_q_head *wake_q)
{
struct rwsem_waiter *waiter;
struct task_struct *tsk;
@@ -128,13 +132,16 @@
waiter = list_entry(sem->wait_list.next, struct rwsem_waiter, list);
if (waiter->type == RWSEM_WAITING_FOR_WRITE) {
- if (wake_type == RWSEM_WAKE_ANY)
- /* Wake writer at the front of the queue, but do not
- * grant it the lock yet as we want other writers
- * to be able to steal it. Readers, on the other hand,
- * will block as they will notice the queued writer.
+ if (wake_type == RWSEM_WAKE_ANY) {
+ /*
+ * Mark writer at the front of the queue for wakeup.
+ * Until the task is actually awoken later by
+ * the caller, other writers are able to steal it.
+ * Readers, on the other hand, will block as they
+ * will notice the queued writer.
*/
- wake_up_process(waiter->task);
+ wake_q_add(wake_q, waiter->task);
+ }
goto out;
}
@@ -146,15 +153,27 @@
if (wake_type != RWSEM_WAKE_READ_OWNED) {
adjustment = RWSEM_ACTIVE_READ_BIAS;
try_reader_grant:
- oldcount = rwsem_atomic_update(adjustment, sem) - adjustment;
+ oldcount = atomic_long_add_return(adjustment, &sem->count) - adjustment;
+
if (unlikely(oldcount < RWSEM_WAITING_BIAS)) {
- /* A writer stole the lock. Undo our reader grant. */
- if (rwsem_atomic_update(-adjustment, sem) &
- RWSEM_ACTIVE_MASK)
+ /*
+ * If the count is still less than RWSEM_WAITING_BIAS
+ * after removing the adjustment, it is assumed that
+ * a writer has stolen the lock. We have to undo our
+ * reader grant.
+ */
+ if (atomic_long_add_return(-adjustment, &sem->count) <
+ RWSEM_WAITING_BIAS)
goto out;
/* Last active locker left. Retry waking readers. */
goto try_reader_grant;
}
+ /*
+ * It is not really necessary to set it to reader-owned here,
+ * but it gives the spinners an early indication that the
+ * readers now have the lock.
+ */
+ rwsem_set_reader_owned(sem);
}
/* Grant an infinite number of read locks to the readers at the front
@@ -179,7 +198,7 @@
adjustment -= RWSEM_WAITING_BIAS;
if (adjustment)
- rwsem_atomic_add(adjustment, sem);
+ atomic_long_add(adjustment, &sem->count);
next = sem->wait_list.next;
loop = woken;
@@ -187,17 +206,15 @@
waiter = list_entry(next, struct rwsem_waiter, list);
next = waiter->list.next;
tsk = waiter->task;
+
+ wake_q_add(wake_q, tsk);
/*
- * Make sure we do not wakeup the next reader before
- * setting the nil condition to grant the next reader;
- * otherwise we could miss the wakeup on the other
- * side and end up sleeping again. See the pairing
- * in rwsem_down_read_failed().
+ * Ensure that the last operation is setting the reader
+ * waiter to nil such that rwsem_down_read_failed() cannot
+ * race with do_exit() by always holding a reference count
+ * to the task to wakeup.
*/
- smp_mb();
- waiter->task = NULL;
- wake_up_process(tsk);
- put_task_struct(tsk);
+ smp_store_release(&waiter->task, NULL);
} while (--loop);
sem->wait_list.next = next;
@@ -216,11 +233,11 @@
long count, adjustment = -RWSEM_ACTIVE_READ_BIAS;
struct rwsem_waiter waiter;
struct task_struct *tsk = current;
+ WAKE_Q(wake_q);
/* set up my own style of waitqueue */
waiter.task = tsk;
waiter.type = RWSEM_WAITING_FOR_READ;
- get_task_struct(tsk);
raw_spin_lock_irq(&sem->wait_lock);
if (list_empty(&sem->wait_list))
@@ -228,7 +245,7 @@
list_add_tail(&waiter.list, &sem->wait_list);
/* we're now waiting on the lock, but no longer actively locking */
- count = rwsem_atomic_update(adjustment, sem);
+ count = atomic_long_add_return(adjustment, &sem->count);
/* If there are no active locks, wake the front queued process(es).
*
@@ -238,9 +255,10 @@
if (count == RWSEM_WAITING_BIAS ||
(count > RWSEM_WAITING_BIAS &&
adjustment != -RWSEM_ACTIVE_READ_BIAS))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
/* wait to be given the lock */
while (true) {
@@ -255,17 +273,29 @@
}
EXPORT_SYMBOL(rwsem_down_read_failed);
+/*
+ * This function must be called with the sem->wait_lock held to prevent
+ * race conditions between checking the rwsem wait list and setting the
+ * sem->count accordingly.
+ */
static inline bool rwsem_try_write_lock(long count, struct rw_semaphore *sem)
{
/*
- * Try acquiring the write lock. Check count first in order
- * to reduce unnecessary expensive cmpxchg() operations.
+ * Avoid trying to acquire write lock if count isn't RWSEM_WAITING_BIAS.
*/
- if (count == RWSEM_WAITING_BIAS &&
- cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS,
- RWSEM_ACTIVE_WRITE_BIAS) == RWSEM_WAITING_BIAS) {
- if (!list_is_singular(&sem->wait_list))
- rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ if (count != RWSEM_WAITING_BIAS)
+ return false;
+
+ /*
+ * Acquire the lock by trying to set it to ACTIVE_WRITE_BIAS. If there
+ * are other tasks on the wait list, we need to add on WAITING_BIAS.
+ */
+ count = list_is_singular(&sem->wait_list) ?
+ RWSEM_ACTIVE_WRITE_BIAS :
+ RWSEM_ACTIVE_WRITE_BIAS + RWSEM_WAITING_BIAS;
+
+ if (atomic_long_cmpxchg_acquire(&sem->count, RWSEM_WAITING_BIAS, count)
+ == RWSEM_WAITING_BIAS) {
rwsem_set_owner(sem);
return true;
}
@@ -279,13 +309,13 @@
*/
static inline bool rwsem_try_write_lock_unqueued(struct rw_semaphore *sem)
{
- long old, count = READ_ONCE(sem->count);
+ long old, count = atomic_long_read(&sem->count);
while (true) {
if (!(count == 0 || count == RWSEM_WAITING_BIAS))
return false;
- old = cmpxchg_acquire(&sem->count, count,
+ old = atomic_long_cmpxchg_acquire(&sem->count, count,
count + RWSEM_ACTIVE_WRITE_BIAS);
if (old == count) {
rwsem_set_owner(sem);
@@ -306,16 +336,11 @@
rcu_read_lock();
owner = READ_ONCE(sem->owner);
- if (!owner) {
- long count = READ_ONCE(sem->count);
+ if (!rwsem_owner_is_writer(owner)) {
/*
- * If sem->owner is not set, yet we have just recently entered the
- * slowpath with the lock being active, then there is a possibility
- * reader(s) may have the lock. To be safe, bail spinning in these
- * situations.
+ * Don't spin if the rwsem is readers owned.
*/
- if (count & RWSEM_ACTIVE_MASK)
- ret = false;
+ ret = !rwsem_owner_is_reader(owner);
goto done;
}
@@ -325,10 +350,15 @@
return ret;
}
-static noinline
-bool rwsem_spin_on_owner(struct rw_semaphore *sem, struct task_struct *owner)
+/*
+ * Return true only if we can still spin on the owner field of the rwsem.
+ */
+static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem)
{
- long count;
+ struct task_struct *owner = READ_ONCE(sem->owner);
+
+ if (!rwsem_owner_is_writer(owner))
+ goto out;
rcu_read_lock();
while (sem->owner == owner) {
@@ -349,22 +379,16 @@
cpu_relax_lowlatency();
}
rcu_read_unlock();
-
- if (READ_ONCE(sem->owner))
- return true; /* new owner, continue spinning */
-
+out:
/*
- * When the owner is not set, the lock could be free or
- * held by readers. Check the counter to verify the
- * state.
- */
- count = READ_ONCE(sem->count);
- return (count == 0 || count == RWSEM_WAITING_BIAS);
+ * If there is a new owner or the owner is not set, we continue
+ * spinning.
+ */
+ return !rwsem_owner_is_reader(READ_ONCE(sem->owner));
}
static bool rwsem_optimistic_spin(struct rw_semaphore *sem)
{
- struct task_struct *owner;
bool taken = false;
preempt_disable();
@@ -376,12 +400,17 @@
if (!osq_lock(&sem->osq))
goto done;
- while (true) {
- owner = READ_ONCE(sem->owner);
- if (owner && !rwsem_spin_on_owner(sem, owner))
- break;
-
- /* wait_lock will be acquired if write_lock is obtained */
+ /*
+ * Optimistically spin on the owner field and attempt to acquire the
+ * lock whenever the owner changes. Spinning will be stopped when:
+ * 1) the owning writer isn't running; or
+ * 2) readers own the lock as we can't determine if they are
+ * actively running or not.
+ */
+ while (rwsem_spin_on_owner(sem)) {
+ /*
+ * Try to acquire the lock
+ */
if (rwsem_try_write_lock_unqueued(sem)) {
taken = true;
break;
@@ -393,7 +422,7 @@
* we're an RT task that will live-lock because we won't let
* the owner complete.
*/
- if (!owner && (need_resched() || rt_task(current)))
+ if (!sem->owner && (need_resched() || rt_task(current)))
break;
/*
@@ -433,15 +462,17 @@
/*
* Wait until we successfully acquire the write lock
*/
-__visible
-struct rw_semaphore __sched *rwsem_down_write_failed(struct rw_semaphore *sem)
+static inline struct rw_semaphore *
+__rwsem_down_write_failed_common(struct rw_semaphore *sem, int state)
{
long count;
bool waiting = true; /* any queued threads before us */
struct rwsem_waiter waiter;
+ struct rw_semaphore *ret = sem;
+ WAKE_Q(wake_q);
/* undo write bias from down_write operation, stop active locking */
- count = rwsem_atomic_update(-RWSEM_ACTIVE_WRITE_BIAS, sem);
+ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count);
/* do optimistic spinning and steal lock if possible */
if (rwsem_optimistic_spin(sem))
@@ -464,21 +495,32 @@
/* we're now waiting on the lock, but no longer actively locking */
if (waiting) {
- count = READ_ONCE(sem->count);
+ count = atomic_long_read(&sem->count);
/*
* If there were already threads queued before us and there are
* no active writers, the lock must be read owned; so we try to
* wake any read locks that were queued ahead of us.
*/
- if (count > RWSEM_WAITING_BIAS)
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READERS);
+ if (count > RWSEM_WAITING_BIAS) {
+ WAKE_Q(wake_q);
+
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q);
+ /*
+ * The wakeup is normally called _after_ the wait_lock
+ * is released, but given that we are proactively waking
+ * readers we can deal with the wake_q overhead as it is
+ * similar to releasing and taking the wait_lock again
+ * for attempting rwsem_try_write_lock().
+ */
+ wake_up_q(&wake_q);
+ }
} else
- count = rwsem_atomic_update(RWSEM_WAITING_BIAS, sem);
+ count = atomic_long_add_return(RWSEM_WAITING_BIAS, &sem->count);
/* wait until we successfully acquire the lock */
- set_current_state(TASK_UNINTERRUPTIBLE);
+ set_current_state(state);
while (true) {
if (rwsem_try_write_lock(count, sem))
break;
@@ -486,21 +528,49 @@
/* Block until there are no active lockers. */
do {
+ if (signal_pending_state(state, current))
+ goto out_nolock;
+
schedule();
- set_current_state(TASK_UNINTERRUPTIBLE);
- } while ((count = sem->count) & RWSEM_ACTIVE_MASK);
+ set_current_state(state);
+ } while ((count = atomic_long_read(&sem->count)) & RWSEM_ACTIVE_MASK);
raw_spin_lock_irq(&sem->wait_lock);
}
__set_current_state(TASK_RUNNING);
-
list_del(&waiter.list);
raw_spin_unlock_irq(&sem->wait_lock);
- return sem;
+ return ret;
+
+out_nolock:
+ __set_current_state(TASK_RUNNING);
+ raw_spin_lock_irq(&sem->wait_lock);
+ list_del(&waiter.list);
+ if (list_empty(&sem->wait_list))
+ atomic_long_add(-RWSEM_WAITING_BIAS, &sem->count);
+ else
+ __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
+ raw_spin_unlock_irq(&sem->wait_lock);
+ wake_up_q(&wake_q);
+
+ return ERR_PTR(-EINTR);
+}
+
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed(struct rw_semaphore *sem)
+{
+ return __rwsem_down_write_failed_common(sem, TASK_UNINTERRUPTIBLE);
}
EXPORT_SYMBOL(rwsem_down_write_failed);
+__visible struct rw_semaphore * __sched
+rwsem_down_write_failed_killable(struct rw_semaphore *sem)
+{
+ return __rwsem_down_write_failed_common(sem, TASK_KILLABLE);
+}
+EXPORT_SYMBOL(rwsem_down_write_failed_killable);
+
/*
* handle waking up a waiter on the semaphore
* - up_read/up_write has decremented the active part of count if we come here
@@ -509,6 +579,7 @@
struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ WAKE_Q(wake_q);
/*
* __rwsem_down_write_failed_common(sem)
@@ -572,9 +643,10 @@
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_ANY);
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_ANY, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
return sem;
}
@@ -589,14 +661,16 @@
struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem)
{
unsigned long flags;
+ WAKE_Q(wake_q);
raw_spin_lock_irqsave(&sem->wait_lock, flags);
/* do nothing if list empty */
if (!list_empty(&sem->wait_list))
- sem = __rwsem_do_wake(sem, RWSEM_WAKE_READ_OWNED);
+ sem = __rwsem_mark_wake(sem, RWSEM_WAKE_READ_OWNED, &wake_q);
raw_spin_unlock_irqrestore(&sem->wait_lock, flags);
+ wake_up_q(&wake_q);
return sem;
}
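
The switch from wake_up_process() to a wake_q follows the usual pattern: waiters are only marked for wakeup under sem->wait_lock via wake_q_add() (which also pins the task), and the expensive wake_up_q() runs after the spinlock is released, so no wakeup is ever issued with the irq-disabled wait_lock held. A generic sketch of that pattern, independent of the rwsem internals (the waiter structure and function name are illustrative):

#include <linux/list.h>
#include <linux/sched.h>
#include <linux/spinlock.h>

/* Illustrative waiter bookkeeping, not the rwsem structures themselves. */
struct example_waiter {
        struct list_head list;
        struct task_struct *task;
};

static void example_wake_all(spinlock_t *lock, struct list_head *waiters)
{
        struct example_waiter *w, *tmp;
        WAKE_Q(wake_q);                 /* on-stack wake queue, as in __rwsem_mark_wake() */

        spin_lock(lock);
        list_for_each_entry_safe(w, tmp, waiters, list) {
                wake_q_add(&wake_q, w->task);   /* pins the task; no wakeup yet */
                list_del(&w->list);
        }
        spin_unlock(lock);

        /* Perform the wakeups only after the lock has been dropped. */
        wake_up_q(&wake_q);
}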
diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c
old mode 100644
new mode 100755
index 205be0c..65ec879
--- a/kernel/locking/rwsem.c
+++ b/kernel/locking/rwsem.c
@@ -22,6 +22,7 @@
rwsem_acquire_read(&sem->dep_map, 0, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read);
@@ -33,8 +34,10 @@
{
int ret = __down_read_trylock(sem);
- if (ret == 1)
+ if (ret == 1) {
rwsem_acquire_read(&sem->dep_map, 0, 1, _RET_IP_);
+ rwsem_set_reader_owned(sem);
+ }
return ret;
}
@@ -105,7 +108,7 @@
* lockdep: a downgraded write will live on as a write
* dependency.
*/
- rwsem_clear_owner(sem);
+ rwsem_set_reader_owned(sem);
__downgrade_write(sem);
}
@@ -119,6 +122,7 @@
rwsem_acquire_read(&sem->dep_map, subclass, 0, _RET_IP_);
LOCK_CONTENDED(sem, __down_read_trylock, __down_read);
+ rwsem_set_reader_owned(sem);
}
EXPORT_SYMBOL(down_read_nested);
diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h
old mode 100644
new mode 100755
index 870ed9a..439a1ce
--- a/kernel/locking/rwsem.h
+++ b/kernel/locking/rwsem.h
@@ -1,12 +1,56 @@
+/*
+ * The owner field of the rw_semaphore structure will be set to
+ * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear
+ * the owner field when it unlocks. A reader, on the other hand, will
+ * not touch the owner field when it unlocks.
+ *
+ * In essence, the owner field now has the following 3 states:
+ * 1) 0
+ * - lock is free or the owner hasn't set the field yet
+ * 2) RWSEM_READER_OWNED
+ * - lock is currently or previously owned by readers (lock is free
+ * or not set by owner yet)
+ * 3) Other non-zero value
+ * - a writer owns the lock
+ */
+#define RWSEM_READER_OWNED ((struct task_struct *)1UL)
+
#ifdef CONFIG_RWSEM_SPIN_ON_OWNER
static inline void rwsem_set_owner(struct rw_semaphore *sem)
{
- sem->owner = current;
+ WRITE_ONCE(sem->owner, current);
+ smp_wmb();
+ isb();
}
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
- sem->owner = NULL;
+ WRITE_ONCE(sem->owner, NULL);
+ smp_wmb();
+ isb();
+}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+ /*
+ * We check the owner value first to make sure that we will only
+ * do a write to the rwsem cacheline when it is really necessary
+ * to minimize cacheline contention.
+ */
+ if (sem->owner != RWSEM_READER_OWNED)
+ WRITE_ONCE(sem->owner, RWSEM_READER_OWNED);
+ smp_wmb();
+ isb();
+}
+
+static inline bool rwsem_owner_is_writer(struct task_struct *owner)
+{
+ return owner && owner != RWSEM_READER_OWNED;
+}
+
+static inline bool rwsem_owner_is_reader(struct task_struct *owner)
+{
+ return owner == RWSEM_READER_OWNED;
}
#else
@@ -17,4 +61,8 @@
static inline void rwsem_clear_owner(struct rw_semaphore *sem)
{
}
+
+static inline void rwsem_set_reader_owned(struct rw_semaphore *sem)
+{
+}
#endif
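
With the reader-owned marker, the optimistic-spin path can classify the lock purely from sem->owner instead of re-reading sem->count. A small illustrative helper showing how the two predicates above combine into the spin/no-spin decision taken by rwsem_can_spin_on_owner() and rwsem_spin_on_owner() in rwsem-xadd.c (the function name is hypothetical):

/*
 * Illustrative only: how the three owner states map onto the decision
 * taken by the optimistic-spin path.
 */
static inline bool example_should_spin(struct rw_semaphore *sem)
{
        struct task_struct *owner = READ_ONCE(sem->owner);

        if (rwsem_owner_is_reader(owner))
                return false;   /* reader-owned: nobody to spin on */

        /*
         * owner is either NULL (free, or not recorded yet) or a writer
         * task; keep spinning and recheck the owner as it changes.
         */
        return true;
}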
diff --git a/kernel/locking/semaphore.c b/kernel/locking/semaphore.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c
old mode 100644
new mode 100755
diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c
old mode 100644
new mode 100755
index 0374a59..c6ced79
--- a/kernel/locking/spinlock_debug.c
+++ b/kernel/locking/spinlock_debug.c
@@ -55,10 +55,10 @@
if (lock->owner && lock->owner != SPINLOCK_OWNER_INIT)
owner = lock->owner;
- printk(KERN_EMERG "BUG: spinlock %s on CPU#%d, %s/%d\n",
+ pr_auto(ASL8, "BUG: spinlock %s on CPU#%d, %s/%d\n",
msg, raw_smp_processor_id(),
current->comm, task_pid_nr(current));
- printk(KERN_EMERG " lock: %pS, .magic: %08x, .owner: %s/%d, "
+ pr_auto(ASL8, " lock: %pS, .magic: %08x, .owner: %s/%d, "
".owner_cpu: %d\n",
lock, lock->magic,
owner ? owner->comm : "<none>",
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
old mode 100644
new mode 100755
diff --git a/kernel/memremap.c b/kernel/memremap.c
old mode 100644
new mode 100755
diff --git a/kernel/module-internal.h b/kernel/module-internal.h
old mode 100644
new mode 100755
diff --git a/kernel/module.c b/kernel/module.c
old mode 100644
new mode 100755
index bcc78f4..5fd5e65
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -62,23 +62,126 @@
#include <uapi/linux/module.h>
#include "module-internal.h"
+#ifdef CONFIG_TIMA_LKMAUTH_CODE_PROT
+#include <asm/tlbflush.h>
+#endif/*CONFIG_TIMA_LKMAUTH_CODE_PROT*/
#define CREATE_TRACE_POINTS
#include <trace/events/module.h>
+#ifdef CONFIG_TIMA_LKMAUTH_CODE_PROT
+#define TIMA_SET_PTE_RO 1
+#define TIMA_SET_PTE_NX 2
+#endif/*CONFIG_TIMA_LKMAUTH_CODE_PROT*/
#ifndef ARCH_SHF_SMALL
#define ARCH_SHF_SMALL 0
#endif
+#ifdef CONFIG_TIMA_LKMAUTH
+/*
+ * TEE-dependent configurations
+ */
+#include <../drivers/gud/gud-exynos7885/MobiCoreDriver/public/mobicore_driver_api.h>
+
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <asm/uaccess.h>
+
+#define TL_TIMA_LKMAUTH_UUID {{ 0xff, 0xff, 0xff, 0xff, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0xb }}
+
+/* Commands for lkmauth tl */
+#define CMD_TIMA_LKMAUTH_LOAD_HASH 0x00000009
+#define CMD_TIMA_LKMAUTH_VERIFY_MODULE 0x00000010
+#define CMD_TIMA_LKMAUTH_LKM_BLOCK 0x00000011
+#define CMD_TIMA_LKMAUTH_UNKNOWN 0x7FFFFFFF
+
+/* Return codes for lkmauth tl */
+#define RET_TL_TIMA_LKMAUTH_OK 0x00000000
+#define RET_TL_TIMA_LKMAUTH_HASH_LOADED 0x00000001
+#define RET_TL_TIMA_LKMAUTH_LKM_BLOCK_FORCE 0x00000002
+
+/* Error codes for lkmauth tl */
+#define RET_TL_TIMA_LKMAUTH_LG_MAXKO 0x00000010
+#define RET_TL_TIMA_LKMAUTH_SHA1_INIT_FAIL 0x00000020
+#define RET_TL_TIMA_LKMAUTH_SHA1_FINAL_FAIL 0x00000030
+#define RET_TL_TIMA_LKMAUTH_VERIFY_FAIL 0x00000040
+
+/* Return codes for lkmauth function */
+#define RET_LKMAUTH_SUCCESS 0
+#define RET_LKMAUTH_FAIL -1
+
+#define HASH_SIZE 20
+#define TIMA_SIGN_LEN 256 /* the rsa signature length of lkm_sec_info */
+#define MC_MAPPING_MAX_SIZE 0x100000
+
+uint8_t *tci = NULL;
+uint8_t *drv_tci = NULL;
+uint8_t lkmauth_tl_loaded = 0;
+uint8_t lkm_sec_info_loaded = 0;
+struct mc_session_handle mchandle;
+struct mc_session_handle drv_mchandle;
+
+/*
+ * TEE-independent configurations
+ */
+#include <linux/kobject.h>
+DEFINE_MUTEX(lkmauth_mutex);
+extern struct device *tima_uevent_dev;
+
+/* Message types for the lkmauth command */
+typedef struct lkmauth_hash_s {
+ uint32_t cmd_id;
+ uint32_t hash_buf_start; /* starting address of buf for ko hashes */
+ uint32_t hash_buf_len; /* length of hash buf, should be multiples of 20 bytes */
+} __attribute__ ((packed)) lkmauth_hash_t;
+
+typedef struct lkmauth_req_s {
+ uint32_t cmd_id;
+ uint32_t module_addr_start;
+ uint32_t module_len;
+ uint32_t min;
+ uint32_t max;
+ char module_name[280];
+ int module_name_len;
+} __attribute__ ((packed)) lkmauth_req_t;
+
+typedef struct lkmauth_rsp_s {
+ /* First 4 bytes should always be command id */
+ uint32_t cmd_id;
+ int ret;
+ union {
+ unsigned char hash[HASH_SIZE];
+ char result_ondemand[256];
+ } __attribute__ ((packed)) result;
+} __attribute__ ((packed)) lkmauth_rsp_t;
+
+typedef struct {
+ union {
+ lkmauth_hash_t lkmauth_hash;
+ lkmauth_req_t lkmauth_req;
+ lkmauth_rsp_t lkmauth_rsp;
+ };
+} tciMessage_t;
+#endif /* End CONFIG_TIMA_LKMAUTH */
+
/*
* Modules' sections will be aligned on page boundaries
* to ensure complete separation of code and data, but
* only when CONFIG_DEBUG_SET_MODULE_RONX=y
*/
+
+#ifdef CONFIG_TIMA_LKMAUTH_CODE_PROT
+# define debug_align(X) ALIGN(X, PAGE_SIZE)
+#else
+#ifdef TIMA_LKM_SET_PAGE_ATTRIB
+#define debug_align(X) ALIGN(X, PAGE_SIZE)
+#else
#ifdef CONFIG_DEBUG_SET_MODULE_RONX
# define debug_align(X) ALIGN(X, PAGE_SIZE)
#else
# define debug_align(X) (X)
#endif
+#endif
+#endif
/*
* Given BASE and SIZE this macro calculates the number of pages the
@@ -2563,6 +2666,489 @@
}
#endif /* CONFIG_KALLSYMS */
+#ifdef CONFIG_TIMA_LKMAUTH
+#ifdef CONFIG_TZDEV // TEEgris
+static int lkmauth(Elf_Ehdr * hdr, int len)
+{
+ int ret = RET_TL_TIMA_LKMAUTH_LKM_BLOCK_FORCE;
+ pr_warn("TIMA: lkmauth--LKM is disapproved by Samsung security policy.\n");
+ return ret;
+}
+#else // Kinibi
+/* read file into the buf, return the file size */
+int read_file_buf(char *filename, void **buf)
+{
+ struct file *f;
+ int file_size = 0;
+ mm_segment_t fs;
+
+ f = filp_open(filename, O_RDONLY, 0);
+ if (!IS_ERR(f)) {
+ // Get current segment descriptor
+ fs = get_fs();
+ // Set segment descriptor associated to kernel space
+ set_fs(get_ds());
+ file_size = f->f_mapping->host->i_size;
+ pr_info("TIMA: lkmauth--File %s has %d bytes.\n", filename,
+ file_size);
+ *buf = vmalloc(file_size);
+ // Read the file
+ f->f_op->read(f, *buf, file_size, &f->f_pos);
+ // Restore segment descriptor
+ set_fs(fs);
+ filp_close(f, NULL);
+ } else {
+ pr_err("TIMA: lkmauth--filp_open error for %s!!.\n", filename);
+ }
+ return file_size;
+}
+
+int send_notification(lkmauth_rsp_t * krsp, int ret)
+{
+ char *envp[3], *status, *result;
+
+ /* Send a notification through uevent. Note that the lkmauth tzapp
+ * should have already raised an alert in TZ Security log.
+ */
+ status = kzalloc(16, GFP_KERNEL);
+ if (!status) {
+ pr_err("TIMA: lkmauth--%s kmalloc failed.\n", __func__);
+ return -1;
+ }
+ snprintf(status, 16, "TIMA_STATUS=%d", ret);
+ envp[0] = status;
+
+ result = kzalloc(256, GFP_KERNEL);
+ if (!result) {
+ pr_err("TIMA: lkmauth--%s kmalloc failed.\n", __func__);
+ kfree(envp[0]);
+ return -1;
+ }
+ snprintf(result, 256, "TIMA_RESULT=%s",
+ krsp->result.result_ondemand);
+ pr_warn("TIMA: %s result (%s) \n", krsp->result.result_ondemand,
+ result);
+ envp[1] = result;
+ envp[2] = NULL;
+
+ kobject_uevent_env(&tima_uevent_dev->kobj, KOBJ_CHANGE, envp);
+ kfree(envp[0]);
+ kfree(envp[1]);
+ return 0;
+}
+
+#ifndef CONFIG_TIMA_LKM_BLOCK
+static int lkmauth(Elf_Ehdr * hdr, int len)
+{
+ int ret = RET_LKMAUTH_FAIL; /* value to be returned for lkmauth */
+ lkmauth_hash_t *khashreq = NULL;
+ lkmauth_req_t *kreq = NULL;
+ lkmauth_rsp_t *krsp = NULL;
+ enum mc_result mc_ret;
+ struct mc_uuid_t uuid = TL_TIMA_LKMAUTH_UUID;
+ struct mc_bulk_map map_info;
+ void *buf;
+ int buf_len;
+ int nb_of_1mb_section;
+ int idx_mapping_section;
+ int mapping_len;
+ uint8_t *hdr_local = (uint8_t *)hdr;
+
+ mutex_lock(&lkmauth_mutex);
+ nb_of_1mb_section = (len + MC_MAPPING_MAX_SIZE - 1) / MC_MAPPING_MAX_SIZE;
+ pr_warn
+ ("TIMA: lkmauth--launch the tl to check kernel module; module len is %d\n",
+ len);
+
+ /* Load the lkmauth tl and handle potential error conditions.
+ */
+ if (!lkmauth_tl_loaded) {
+ mc_ret = mc_open_device(MC_DEVICE_ID_DEFAULT);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot get mobicore handle from kernel. %d\n",
+ mc_ret);
+ ret = RET_LKMAUTH_FAIL;
+ goto lkmauth_ret;
+ }
+ /* open session for lkmauth trustlet */
+ mc_ret =
+ mc_malloc_wsm(MC_DEVICE_ID_DEFAULT, 0, sizeof(tciMessage_t),
+ &tci, 0);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot alloc world shared memory.\n");
+ ret = RET_LKMAUTH_FAIL;
+ goto lkmauth_close_device;
+ }
+ memset(&mchandle, 0, sizeof(struct mc_session_handle));
+ mchandle.device_id = MC_DEVICE_ID_DEFAULT;
+ mc_ret =
+ mc_open_session(&mchandle, &uuid, tci,
+ sizeof(tciMessage_t));
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot open mobicore session from kernel. %d\n",
+ mc_ret);
+ ret = RET_LKMAUTH_FAIL;
+ goto lkmauth_free_wsm;
+ }
+ /* open session for tima driver */
+ mc_ret =
+ mc_malloc_wsm(MC_DEVICE_ID_DEFAULT, 0, 4096, &drv_tci, 0);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot alloc world shared memory for tima driver.\n");
+ ret = RET_LKMAUTH_FAIL; /* lkm authentication failed. */
+ goto lkmauth_close_session; /* leave the function now. */
+ }
+ memset(&drv_mchandle, 0, sizeof(struct mc_session_handle));
+ drv_mchandle.device_id = MC_DEVICE_ID_DEFAULT;
+ lkmauth_tl_loaded = 1; /* both lkmauth tl and tima secure driver are loaded */
+ }
+
+ if (!lkm_sec_info_loaded) {
+ /* load lkm_sec_info */
+ buf_len = read_file_buf("/system/lkm_sec_info", &buf);
+ if (buf_len == 0) {
+ pr_err
+ ("TIMA: lkmauth-- cannot allocate buffer for lkm_sec_info\n");
+ ret = RET_LKMAUTH_FAIL;
+ goto lkmauth_ret;
+ }
+
+ /* map lkm_sec_info buf to tl virtual space */
+ mc_ret = mc_map(&mchandle, (void *)buf, buf_len, &map_info);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot map lkm_sec_info buf to tl virtual space\n");
+ ret = RET_LKMAUTH_FAIL;
+ vfree(buf);
+ goto lkmauth_ret;
+ }
+
+ /* Generate the request cmd to load lkm_sec_info.
+ */
+ khashreq = (struct lkmauth_hash_s *)tci;
+ khashreq->cmd_id = CMD_TIMA_LKMAUTH_LOAD_HASH;
+ /* pr_warn("TIMA: lkmauth -- virtual address of lkm_sec_info buffer in tl is : %x\n", (uint32_t)map_info.secure_virt_addr);
+ */
+ khashreq->hash_buf_start = (uint32_t) map_info.secure_virt_addr;
+ khashreq->hash_buf_len = buf_len;
+
+ /* prepare the response buffer */
+ krsp = (struct lkmauth_rsp_s *)tci;
+
+ /* Send the command to the tl.
+ */
+ mc_ret = mc_notify(&mchandle);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--mc_notify failed.\n");
+ ret = RET_LKMAUTH_FAIL;
+ mc_unmap(&mchandle, (void *)buf, &map_info);
+ vfree(buf);
+ goto lkmauth_ret;
+ }
+
+retry1:
+ mc_ret = mc_wait_notification(&mchandle, -1);
+ if (MC_DRV_ERR_INTERRUPTED_BY_SIGNAL == mc_ret) {
+ usleep_range(1000, 5000);
+ goto retry1;
+ }
+
+ if (mc_ret != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--wait_notify failed.\n");
+ ret = RET_LKMAUTH_FAIL;
+ mc_unmap(&mchandle, buf, &map_info);
+ vfree(buf);
+ goto lkmauth_ret;
+ }
+ pr_warn("TIMA: lkmauth--wait_notify completed.\n");
+
+ /* Process potential error conditions for the tl response.
+ */
+ mc_ret = mc_unmap(&mchandle, buf, &map_info);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot unmap lkm_sec_info buf\n");
+ ret = RET_LKMAUTH_FAIL;
+ vfree(buf);
+ goto lkmauth_ret;
+ }
+
+ vfree(buf);
+
+ /* Parse the tl response for loading lkm_sec_info.
+ */
+ if (krsp->ret == RET_TL_TIMA_LKMAUTH_OK) {
+ pr_info
+ ("TIMA: lkmauth--lkm_sec_info sucessfully loaded\n");
+ ret = RET_LKMAUTH_SUCCESS;
+ lkm_sec_info_loaded = 1;
+ } else if (krsp->ret == RET_TL_TIMA_LKMAUTH_HASH_LOADED) {
+ pr_info("TIMA: lkmauth--lkm_sec_info already loaded\n");
+ ret = RET_LKMAUTH_FAIL;
+ lkm_sec_info_loaded = 1;
+ } else {
+ pr_err("TIMA: lkmauth--lkm_sec_info load error (%d)\n",
+ krsp->ret);
+ ret = RET_LKMAUTH_FAIL;
+ send_notification(krsp, ret);
+ goto lkmauth_ret;
+ }
+ }
+
+ /* map ko buf to tl virtual space */
+ for (idx_mapping_section = 0; idx_mapping_section < nb_of_1mb_section; idx_mapping_section++) {
+ if (idx_mapping_section == nb_of_1mb_section - 1) {
+ mapping_len = len - idx_mapping_section * MC_MAPPING_MAX_SIZE;
+ } else {
+ mapping_len = MC_MAPPING_MAX_SIZE;
+ }
+ mc_ret = mc_map(&mchandle, (void *)hdr_local, mapping_len, &map_info);
+
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot map ko buf to tl virtual space %d\n", mc_ret);
+ ret = RET_LKMAUTH_FAIL;
+ goto lkmauth_ret;
+ }
+
+ /* Generate the request cmd to verify hash of ko.
+ */
+ kreq = (struct lkmauth_req_s *)tci;
+ kreq->cmd_id = CMD_TIMA_LKMAUTH_VERIFY_MODULE;
+ /* pr_warn("TIMA: lkmauth -- virtual address of ko buffer in tl is : %x\n", (uint32_t)map_info.secure_virt_addr);
+ */
+ kreq->module_addr_start = (uint32_t) map_info.secure_virt_addr;
+ kreq->module_len = mapping_len;
+
+ /* prepare the response buffer */
+ krsp = (struct lkmauth_rsp_s *)tci;
+
+ /* Send the command to the tl.
+ */
+ mc_ret = mc_notify(&mchandle);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--mc_notify failed.\n");
+ ret = RET_LKMAUTH_FAIL;
+ mc_unmap(&mchandle, (void *)hdr_local, &map_info);
+ goto lkmauth_ret;
+ }
+
+retry2:
+ mc_ret = mc_wait_notification(&mchandle, -1);
+ if (MC_DRV_ERR_INTERRUPTED_BY_SIGNAL == mc_ret) {
+ usleep_range(1000, 5000);
+ goto retry2;
+ }
+
+ if (mc_ret != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--wait_notify failed.\n");
+ ret = RET_LKMAUTH_FAIL;
+ mc_unmap(&mchandle, (void *)hdr_local, &map_info);
+ goto lkmauth_ret;
+ }
+ pr_warn("TIMA: lkmauth--wait_notify completed.\n");
+
+ mc_ret = mc_unmap(&mchandle, (void *)hdr_local, &map_info);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--cannot unmap ko memory\n");
+ }
+
+ /* Parse the tl response.
+ */
+ if (krsp->ret == 0) {
+ pr_warn("TIMA: lkmauth--section verification succeeded idx : %d\n", idx_mapping_section);
+ hdr_local = hdr_local + MC_MAPPING_MAX_SIZE;
+ continue;
+ } else {
+
+ pr_err("TIMA: lkmauth--verification failed %d\n", krsp->ret);
+ ret = RET_LKMAUTH_FAIL;
+ send_notification(krsp, ret);
+ goto lkmauth_ret;
+ }
+ }
+ pr_warn("TIMA: lkmauth--verification succeeded.\n");
+ ret = RET_LKMAUTH_SUCCESS; /* ret should already be 0 before the assignment. */
+ goto lkmauth_ret;
+
+lkmauth_close_session:
+ if (mc_close_session(&mchandle) != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--failed to close mobicore session.\n");
+ }
+
+lkmauth_free_wsm:
+ if (mc_free_wsm(MC_DEVICE_ID_DEFAULT, tci) != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--failed to free wsm.\n");
+ }
+
+lkmauth_close_device:
+ if (mc_close_device(MC_DEVICE_ID_DEFAULT) != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--failed to shutdown mobicore instance.\n");
+ }
+
+lkmauth_ret:
+ mutex_unlock(&lkmauth_mutex);
+ return ret;
+}
+#else
+static int lkmauth(Elf_Ehdr * hdr, int len)
+{
+ int ret = RET_LKMAUTH_FAIL; /* value to be returned for lkmauth */
+ lkmauth_req_t *kreq = NULL;
+ lkmauth_rsp_t *krsp = NULL;
+ enum mc_result mc_ret;
+ struct mc_uuid_t uuid = TL_TIMA_LKMAUTH_UUID;
+ struct mc_bulk_map map_info;
+ uint8_t *hdr_local = (uint8_t *)hdr;
+
+ mutex_lock(&lkmauth_mutex);
+ pr_warn
+ ("TIMA: lkmauth--launch the tl to check kernel module; module len is %d\n",
+ len);
+
+ /* Load the lkmauth tl and handle potential error conditions.
+ */
+ if (!lkmauth_tl_loaded) {
+ mc_ret = mc_open_device(MC_DEVICE_ID_DEFAULT);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot get mobicore handle from kernel. %d\n",
+ mc_ret);
+ ret = RET_LKMAUTH_FAIL;
+ goto lkmauth_ret;
+ }
+ /* open session for lkmauth trustlet */
+ mc_ret =
+ mc_malloc_wsm(MC_DEVICE_ID_DEFAULT, 0, sizeof(tciMessage_t),
+ &tci, 0);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot alloc world shared memory.\n");
+ ret = RET_LKMAUTH_FAIL;
+ goto lkmauth_close_device;
+ }
+ memset(&mchandle, 0, sizeof(struct mc_session_handle));
+ mchandle.device_id = MC_DEVICE_ID_DEFAULT;
+ mc_ret =
+ mc_open_session(&mchandle, &uuid, tci,
+ sizeof(tciMessage_t));
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot open mobicore session from kernel. %d\n",
+ mc_ret);
+ ret = RET_LKMAUTH_FAIL;
+ goto lkmauth_free_wsm;
+ }
+ /* open session for tima driver */
+ mc_ret =
+ mc_malloc_wsm(MC_DEVICE_ID_DEFAULT, 0, 4096, &drv_tci, 0);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot alloc world shared memory for tima driver.\n");
+ ret = RET_LKMAUTH_FAIL; /* lkm authentication failed. */
+ goto lkmauth_close_session; /* leave the function now. */
+ }
+ memset(&drv_mchandle, 0, sizeof(struct mc_session_handle));
+ drv_mchandle.device_id = MC_DEVICE_ID_DEFAULT;
+ lkmauth_tl_loaded = 1; /* both lkmauth tl and tima secure driver are loaded */
+ }
+
+ /* map ko buf to tl virtual space */
+ mc_ret = mc_map(&mchandle, (void *)hdr_local, len, &map_info);
+
+ if (mc_ret != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--cannot map ko buf to tl virtual space %d\n", mc_ret);
+ ret = RET_LKMAUTH_FAIL;
+ goto lkmauth_ret;
+ }
+
+ /* Generate the request cmd to verify hash of ko.
+ */
+ kreq = (struct lkmauth_req_s *)tci;
+ kreq->cmd_id = CMD_TIMA_LKMAUTH_LKM_BLOCK;
+ /* pr_warn("TIMA: lkmauth -- virtual address of ko buffer in tl is : %x\n", (uint32_t)map_info.secure_virt_addr);
+ */
+ kreq->module_addr_start = (uint32_t) map_info.secure_virt_addr;
+ kreq->module_len = len;
+
+ /* prepare the response buffer */
+ krsp = (struct lkmauth_rsp_s *)tci;
+
+ /* Send the command to the tl.
+ */
+ mc_ret = mc_notify(&mchandle);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--mc_notify failed.\n");
+ ret = RET_LKMAUTH_FAIL;
+ mc_unmap(&mchandle, (void *)hdr_local, &map_info);
+ goto lkmauth_ret;
+ }
+
+retry:
+ mc_ret = mc_wait_notification(&mchandle, -1);
+ if (MC_DRV_ERR_INTERRUPTED_BY_SIGNAL == mc_ret) {
+ usleep_range(1000, 5000);
+ goto retry;
+ }
+
+ if (mc_ret != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--wait_notify failed.\n");
+ ret = RET_LKMAUTH_FAIL;
+ mc_unmap(&mchandle, (void *)hdr_local, &map_info);
+ goto lkmauth_ret;
+ }
+ pr_warn("TIMA: lkmauth--wait_notify completed.\n");
+
+ mc_ret = mc_unmap(&mchandle, (void *)hdr_local, &map_info);
+ if (mc_ret != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--cannot unmap ko memory\n");
+ }
+
+ /* Parse the tl response.
+ */
+ if (krsp->ret == RET_TL_TIMA_LKMAUTH_LKM_BLOCK_FORCE) {
+ pr_warn("TIMA: lkmauth--Succeeded to block lkm through TZ.\n");
+ ret = krsp->ret;
+ send_notification(krsp, ret);
+ } else {
+ pr_err("TIMA: lkmauth--Failed to block lkm through TZ. ret = %x\n", krsp->ret);
+ ret = RET_LKMAUTH_FAIL;
+ send_notification(krsp, ret);
+ }
+ goto lkmauth_ret;
+
+lkmauth_close_session:
+ if (mc_close_session(&mchandle) != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--failed to close mobicore session.\n");
+ }
+
+lkmauth_free_wsm:
+ if (mc_free_wsm(MC_DEVICE_ID_DEFAULT, tci) != MC_DRV_OK) {
+ pr_err("TIMA: lkmauth--failed to free wsm.\n");
+ }
+
+lkmauth_close_device:
+ if (mc_close_device(MC_DEVICE_ID_DEFAULT) != MC_DRV_OK) {
+ pr_err
+ ("TIMA: lkmauth--failed to shutdown mobicore instance.\n");
+ }
+
+lkmauth_ret:
+ mutex_unlock(&lkmauth_mutex);
+ return ret;
+}
+#endif /* CONFIG_TIMA_LKM_BLOCK */
+#endif /* CONFIG_TZDEV */
+#endif /* CONFIG_TIMA_LKMAUTH */
+
static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
{
if (!debug)
@@ -2666,6 +3252,14 @@
info->len - info->hdr->e_shoff))
return -ENOEXEC;
+#ifdef CONFIG_TIMA_LKMAUTH
+ if (lkmauth(info->hdr, info->len) != RET_LKMAUTH_SUCCESS) {
+ pr_err
+ ("TIMA: lkmauth--unable to load kernel module; module len is %lu.\n",
+ info->len);
+ return -ENOEXEC;
+ }
+#endif
return 0;
}
@@ -3243,6 +3837,168 @@
#endif
}
+#ifdef CONFIG_TIMA_LKMAUTH_CODE_PROT
+#ifndef TIMA_KERNEL_L1_MANAGE
+static inline pmd_t *tima_pmd_off_k(unsigned long virt)
+{
+ return pmd_offset(pud_offset(pgd_offset_k(virt), virt), virt);
+}
+
+void tima_set_pte_val(unsigned long virt, int numpages, int flags)
+{
+ unsigned long start = virt;
+ unsigned long end = virt + (numpages << PAGE_SHIFT);
+ unsigned long pmd_end;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ while (virt < end)
+ {
+ pmd = tima_pmd_off_k(virt);
+ pmd_end = min(ALIGN(virt + 1, PMD_SIZE), end);
+
+ if ((pmd_val(*pmd) & PMD_TYPE_MASK) != PMD_TYPE_TABLE) {
+ //printk("Not a pagetable\n");
+ virt = pmd_end;
+ continue;
+ }
+
+ while (virt < pmd_end)
+ {
+ pte = pte_offset_kernel(pmd, virt);
+ if (flags == TIMA_SET_PTE_RO)
+ {
+ /* Make pages read-only */
+ ptep_set_wrprotect(current->mm, virt, pte);
+ }
+ if (flags == TIMA_SET_PTE_NX)
+ {
+ /* Make pages non-executable */
+ ptep_set_nxprotect(current->mm, virt, pte);
+ }
+ virt += PAGE_SIZE;
+ }
+ }
+
+ flush_tlb_kernel_range(start, end);
+}
+#endif
+
+/**
+ * tima_mod_page_change_access - Wrapper function to change access control permissions of pages
+ *
+ * It sends code and data pages to secure side to make code pages readonly and data pages non executable
+ *
+ */
+
+void tima_mod_page_change_access(struct module *mod)
+{
+ unsigned int *vatext, *vadata; /* base virtual address of text and data regions */
+ unsigned int text_count, data_count; /* Number of text and data pages present in core section */
+
+ /* Let's first pick up the core section */
+ vatext = mod->module_core;
+ vadata = (int *)((char *)(mod->module_core) + mod->core_ro_size);
+ text_count = ((char *)vadata - (char *)vatext);
+ data_count = debug_align(mod->core_size) - text_count;
+ text_count = text_count / PAGE_SIZE;
+ data_count = data_count / PAGE_SIZE;
+
+ /* Should be at least a page */
+ if (!text_count)
+ text_count = 1;
+ if (!data_count)
+ data_count = 1;
+ /* Change permission bits for the core section: make code read-only and data non-executable */
+ tima_set_pte_val((unsigned long)vatext, text_count, TIMA_SET_PTE_RO);
+ tima_set_pte_val((unsigned long)vadata, data_count, TIMA_SET_PTE_NX);
+
+ /* Let's pick up the init section */
+ vatext = mod->module_init;
+ vadata = (int *)((char *)(mod->module_init) + mod->init_ro_size);
+ text_count = ((char *)vadata - (char *)vatext);
+ data_count = debug_align(mod->init_size) - text_count;
+ text_count = text_count / PAGE_SIZE;
+ data_count = data_count / PAGE_SIZE;
+
+/* Change permission bits for the init section: make code read-only and data non-executable */
+ tima_set_pte_val((unsigned long)vatext, text_count, TIMA_SET_PTE_RO);
+ tima_set_pte_val((unsigned long)vadata, data_count, TIMA_SET_PTE_NX);
+}
+
+#endif /*CONFIG_TIMA_LKMAUTH_CODE_PROT*/
+
+#ifdef TIMA_LKM_SET_PAGE_ATTRIB
+void tima_mod_send_smc_instruction(unsigned int *vatext, unsigned int *vadata,
+ unsigned int text_count,
+ unsigned int data_count)
+{
+ unsigned long cmd_id = TIMA_PAC_CMD_ID;
+ /*Call SMC instruction */
+#if __GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 6)
+ __asm__ __volatile__(".arch_extension sec\n");
+#endif
+ __asm__ __volatile__("stmfd sp!,{r0-r4,r11}\n"
+ "mov r11, r0\n"
+ "mov r0, %0\n"
+ "mov r1, %1\n"
+ "mov r2, %2\n"
+ "mov r3, %3\n"
+ "mov r4, %4\n"
+ "smc #11\n"
+ "mov r6, #0\n"
+ "pop {r0-r4,r11}\n"
+ "mcr p15, 0, r6, c8, c3, 0\n"
+ "dsb\n"
+ "isb\n"::"r"(cmd_id), "r"(vatext), "r"(text_count),
+ "r"(vadata), "r"(data_count):"r0", "r1", "r2",
+ "r3", "r4", "r11", "cc");
+
+}
+
+/**
+ * tima_mod_page_change_access - Wrapper function to change access control permissions of pages
+ *
+ * It sends code and data pages to secure side to make code pages readonly and data pages non executable
+ *
+ */
+
+void tima_mod_page_change_access(struct module *mod)
+{
+ unsigned int *vatext, *vadata; /* base virtual address of text and data regions */
+ unsigned int text_count, data_count; /* Number of text and data pages present in core section */
+
+ /* Let's first pick up the core section */
+ vatext = mod->module_core;
+ vadata = (int *)((char *)(mod->module_core) + mod->core_ro_size);
+ text_count = ((char *)vadata - (char *)vatext);
+ data_count = debug_align(mod->core_size) - text_count;
+ text_count = text_count / PAGE_SIZE;
+ data_count = data_count / PAGE_SIZE;
+
+ /* Should be at least a page */
+ if (!text_count)
+ text_count = 1;
+ if (!data_count)
+ data_count = 1;
+
+ /* Change permissive bits for core section */
+ tima_mod_send_smc_instruction(vatext, vadata, text_count, data_count);
+
+ /* Let's pick up the init section */
+ vatext = mod->module_init;
+ vadata = (int *)((char *)(mod->module_init) + mod->init_ro_size);
+ text_count = ((char *)vadata - (char *)vatext);
+ data_count = debug_align(mod->init_size) - text_count;
+ text_count = text_count / PAGE_SIZE;
+ data_count = data_count / PAGE_SIZE;
+
+ /* Change permissive bits for init section */
+ tima_mod_send_smc_instruction(vatext, vadata, text_count, data_count);
+}
+
+#endif
+
/* For freeing module_init on success, in case kallsyms traversing */
struct mod_initfree {
struct rcu_head rcu;
@@ -3300,6 +4056,10 @@
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_LIVE, mod);
+#ifdef TIMA_LKM_SET_PAGE_ATTRIB
+ tima_mod_page_change_access(mod);
+#endif
+
/*
* We need to finish all async code before the module init sequence
* is done. This has potential to deadlock. For example, a newly
@@ -3423,6 +4183,10 @@
/* This relies on module_mutex for list integrity. */
module_bug_finalize(info->hdr, info->sechdrs, mod);
+#ifdef CONFIG_TIMA_LKMAUTH_CODE_PROT
+ tima_mod_page_change_access(mod);
+#endif /*CONFIG_TIMA_LKMAUTH_CODE_PROT*/
+
/* Set RO and NX regions for core */
set_section_ro_nx(mod->module_core,
mod->core_text_size,
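
All three lkmauth() variants drive the trustlet through the same TCI handshake: place a command in the world-shared buffer, mc_notify() the session, loop in mc_wait_notification() for as long as the wait is interrupted by a signal, then read the response back out of the same buffer. A condensed, hedged sketch of that round trip (the helper name is hypothetical; unmapping and most error handling are trimmed, and the request/response types are the ones defined above):

/* Hypothetical helper condensing the TCI round trip used by lkmauth(). */
static enum mc_result example_lkmauth_call(struct mc_session_handle *session,
                                           lkmauth_req_t *req, lkmauth_rsp_t **rsp)
{
        enum mc_result mc_ret;

        /* Request and response alias the same world-shared (tci) buffer. */
        *rsp = (lkmauth_rsp_t *)req;

        mc_ret = mc_notify(session);            /* hand the command to the trustlet */
        if (mc_ret != MC_DRV_OK)
                return mc_ret;

        do {                                    /* retry while a signal interrupts the wait */
                mc_ret = mc_wait_notification(session, -1);
                if (mc_ret == MC_DRV_ERR_INTERRUPTED_BY_SIGNAL)
                        usleep_range(1000, 5000);
        } while (mc_ret == MC_DRV_ERR_INTERRUPTED_BY_SIGNAL);

        return mc_ret;                          /* on MC_DRV_OK the caller inspects (*rsp)->ret */
}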
diff --git a/kernel/module_signing.c b/kernel/module_signing.c
old mode 100644
new mode 100755
diff --git a/kernel/notifier.c b/kernel/notifier.c
old mode 100644
new mode 100755
index fd2c9ac..06be999
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -5,6 +5,8 @@
#include <linux/rcupdate.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
+#include <linux/suspend.h>
+#include <linux/exynos-ss.h>
/*
* Notifier list for kernel code which wants to be called
@@ -90,7 +92,11 @@
continue;
}
#endif
+ if (val == PM_SUSPEND_PREPARE || val == PM_POST_SUSPEND)
+ exynos_ss_suspend(nb->notifier_call, NULL, ESS_FLAG_IN);
ret = nb->notifier_call(nb, val, v);
+ if (val == PM_SUSPEND_PREPARE || val == PM_POST_SUSPEND)
+ exynos_ss_suspend(nb->notifier_call, NULL, ESS_FLAG_OUT);
if (nr_calls)
(*nr_calls)++;
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
old mode 100644
new mode 100755
diff --git a/kernel/padata.c b/kernel/padata.c
old mode 100644
new mode 100755
diff --git a/kernel/panic.c b/kernel/panic.c
old mode 100644
new mode 100755
index 1d07cf9..b2251ac
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -24,10 +24,24 @@
#include <linux/init.h>
#include <linux/nmi.h>
#include <linux/console.h>
+#include <linux/exynos-ss.h>
+#include <soc/samsung/exynos-condbg.h>
+#include <asm/core_regs.h>
+#include <sound/samsung/abox.h>
+#include "sched/sched.h"
+
+#include <asm/core_regs.h>
+
+#ifdef CONFIG_SEC_DUMP_SUMMARY
+#include <linux/sec_debug.h>
+#endif
#define PANIC_TIMER_STEP 100
#define PANIC_BLINK_SPD 18
+/* Machine specific panic information string */
+char *mach_panic_string;
+
int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE;
static unsigned long tainted_mask;
static int pause_on_oops;
@@ -61,6 +75,22 @@
cpu_relax();
}
+#ifdef CONFIG_ARM_CCI550_DEBUG_MODE
+#define CCI_BASE (0x12200000)
+#define IMPR_ERR_OFFSET (0x00010)
+
+void __iomem *cci_virt_base;
+
+void show_cci_info(void)
+{
+ u32 data;
+
+ pr_emerg("CCI debug info:\n");
+ data = __raw_readl(cci_virt_base + IMPR_ERR_OFFSET);
+ pr_emerg("\tIMPR_ERR %08x\n", data);
+}
+#endif
+
/**
* panic - halt the system
* @fmt: The text string to print
@@ -77,6 +107,14 @@
long i, i_next = 0;
int state = 0;
+ exynos_trace_stop();
+
+ if (ecd_get_enable() &&
+ ecd_get_debug_panic() &&
+ ecd_get_debug_mode() != MODE_DEBUG) {
+ ecd_printf("Debugging in Panic on ECD\n");
+ ecd_do_break_now();
+ }
/*
* Disable local interrupts. This will prevent panic_smp_self_stop
* from deadlocking the first cpu that invokes the panic, since
@@ -95,15 +133,39 @@
* stop themself or will wait until they are stopped by the 1st CPU
* with smp_send_stop().
*/
- if (!spin_trylock(&panic_lock))
+ if (!spin_trylock(&panic_lock)) {
+ exynos_ss_hook_hardlockup_exit();
panic_smp_self_stop();
+ }
console_verbose();
bust_spinlocks(1);
va_start(args, fmt);
vsnprintf(buf, sizeof(buf), fmt, args);
va_end(args);
- pr_emerg("Kernel panic - not syncing: %s\n", buf);
+
+#ifdef CONFIG_SEC_DEBUG_AUTO_SUMMARY
+ if (buf[strlen(buf) - 1] == '\n')
+ buf[strlen(buf) - 1] = '\0';
+#endif
+
+ ecd_printf("Kernel Panic - not syncing: %s\n", buf);
+ pr_auto(ASL5, "Kernel panic - not syncing: %s\n", buf);
+
+#ifdef CONFIG_ARM_CCI550_DEBUG_MODE
+ show_cci_info();
+#endif
+
+#ifdef CONFIG_RELOCATABLE_KERNEL
+ {
+ extern u64 *__boot_kernel_offset;
+ u64 *kernel_addr = (u64 *) &__boot_kernel_offset;
+ pr_emerg("Kernel loaded at: 0x%llx, offset from compile-time address %llx\n", kernel_addr[1]+kernel_addr[0], kernel_addr[1]-kernel_addr[2]);
+ }
+#endif
+ exynos_ss_prepare_panic();
+ exynos_ss_dump_panic(buf, (size_t)strnlen(buf, sizeof(buf)));
+// exynos_abox_dump_sram();
#ifdef CONFIG_DEBUG_BUGVERBOSE
/*
* Avoid nested stack-dumping if a panic occurs during oops processing
@@ -111,7 +173,10 @@
if (!test_taint(TAINT_DIE) && oops_in_progress <= 1)
dump_stack();
#endif
-
+#ifdef CONFIG_SEC_DUMP_SUMMARY
+ sec_debug_save_panic_info(buf, (unsigned long)__builtin_return_address(0));
+#endif
+ sysrq_sched_debug_show();
/*
* If we have crashed and we have a crash kernel loaded let it handle
* everything else.
@@ -126,7 +191,9 @@
* unfortunately means it may not be hardened to work in a panic
* situation.
*/
- smp_send_stop();
+
+ if (!ecd_get_enable() || ecd_get_debug_mode() != MODE_DEBUG)
+ smp_send_stop();
/*
* Run any panic handlers, including those that might need to
@@ -136,6 +203,8 @@
kmsg_dump(KMSG_DUMP_PANIC);
+ exynos_ss_post_panic();
+
/*
* If you doubt kdump always works fine in any situation,
* "crash_kexec_post_notifiers" offers you a chance to run
@@ -404,7 +473,11 @@
get_random_bytes(&oops_id, sizeof(oops_id));
else
oops_id++;
-
+#ifdef CONFIG_ARM_CCI550_DEBUG_MODE
+ cci_virt_base = ioremap(CCI_BASE, SZ_4K);
+ if (!cci_virt_base)
+ pr_err("Unable to map SRAM to setup the CCI address\n");
+#endif
return 0;
}
late_initcall(init_oops_id);
@@ -412,6 +485,11 @@
void print_oops_end_marker(void)
{
init_oops_id();
+
+ if (mach_panic_string)
+ printk(KERN_WARNING "Board Information: %s\n",
+ mach_panic_string);
+
pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id);
}
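
The CONFIG_ARM_CCI550_DEBUG_MODE hunks split the work the way the panic path requires: the register window is ioremap()'d once from late_initcall context, because new mappings cannot be created safely once panic() is running, and show_cci_info() only performs raw reads of the already-mapped window. A hedged sketch of the same pattern, with an illustrative base address and a NULL check added:

#include <linux/init.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <linux/sizes.h>

#define EXAMPLE_DBG_BASE        0x12200000      /* illustrative physical address */
#define EXAMPLE_ERR_OFFSET      0x00010

static void __iomem *example_dbg_base;

static int __init example_dbg_map(void)
{
        example_dbg_base = ioremap(EXAMPLE_DBG_BASE, SZ_4K);
        if (!example_dbg_base)
                pr_err("example: unable to map debug registers\n");
        return 0;
}
late_initcall(example_dbg_map);

static void example_dbg_show(void)
{
        if (example_dbg_base)
                pr_emerg("IMPR_ERR %08x\n",
                         __raw_readl(example_dbg_base + EXAMPLE_ERR_OFFSET));
}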
diff --git a/kernel/params.c b/kernel/params.c
old mode 100644
new mode 100755
diff --git a/kernel/pid.c b/kernel/pid.c
old mode 100644
new mode 100755
index 5fe7cdb..733ee99
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -531,7 +531,7 @@
if (type != PIDTYPE_PID) {
if (type == __PIDTYPE_TGID)
type = PIDTYPE_PID;
- task = task->group_leader;
+ task = task->group_leader;
}
nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns);
}
@@ -575,10 +575,12 @@
{
unsigned int i, pidhash_size;
+ set_memsize_kernel_type(MEMSIZE_KERNEL_PIDHASH);
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
HASH_EARLY | HASH_SMALL,
&pidhash_shift, NULL,
0, 4096);
+ set_memsize_kernel_type(MEMSIZE_KERNEL_OTHERS);
pidhash_size = 1U << pidhash_shift;
for (i = 0; i < pidhash_size; i++)
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
old mode 100644
new mode 100755
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
old mode 100644
new mode 100755
index 9d76184..4335e7d
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -1,6 +1,7 @@
config SUSPEND
bool "Suspend to RAM and standby"
depends on ARCH_SUSPEND_POSSIBLE
+ select RTC_LIB
default y
---help---
Allow the system to enter sleep states in which main memory is
@@ -28,6 +29,15 @@
of suspend, or they are content with invoking sync() from
user-space before invoking suspend. Say Y if that's your case.
+config WAKELOCK
+ bool "Android's method of preventing suspend"
+ default y
+ ---help---
+ This allows applications to prevent the CPU from suspending while
+ they need it.
+
+ Say Y if you are running an android userspace.
+
config HIBERNATE_CALLBACKS
bool
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
old mode 100644
new mode 100755
index cb880a1..22eb9ed
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -12,3 +12,5 @@
obj-$(CONFIG_PM_WAKELOCKS) += wakelock.o
obj-$(CONFIG_MAGIC_SYSRQ) += poweroff.o
+
+obj-$(CONFIG_SUSPEND) += wakeup_reason.o
diff --git a/kernel/power/autosleep.c b/kernel/power/autosleep.c
old mode 100644
new mode 100755
diff --git a/kernel/power/console.c b/kernel/power/console.c
old mode 100644
new mode 100755
diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
old mode 100644
new mode 100755
index 3124ceb..797f19e
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -647,7 +647,7 @@
*/
int hibernate(void)
{
- int error;
+ int error, nr_calls = 0;
if (!hibernation_available()) {
pr_debug("PM: Hibernation not available.\n");
@@ -662,9 +662,11 @@
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Exit;
+ }
printk(KERN_INFO "PM: Syncing filesystems ... ");
sys_sync();
@@ -714,7 +716,7 @@
/* Don't bother checking whether freezer_test_done is true */
freezer_test_done = false;
Exit:
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
Unlock:
@@ -740,7 +742,7 @@
*/
static int software_resume(void)
{
- int error;
+ int error, nr_calls = 0;
unsigned int flags;
/*
@@ -827,9 +829,11 @@
}
pm_prepare_console();
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
+ if (error) {
+ nr_calls--;
goto Close_Finish;
+ }
pr_debug("PM: Preparing processes for restore.\n");
error = freeze_processes();
@@ -855,7 +859,7 @@
unlock_device_hotplug();
thaw_processes();
Finish:
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
pm_restore_console();
atomic_inc(&snapshot_device_available);
/* For success case, the suspend path will release the lock */
diff --git a/kernel/power/main.c b/kernel/power/main.c
old mode 100644
new mode 100755
index b2dd4d9..99d4c02
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -38,12 +38,40 @@
}
EXPORT_SYMBOL_GPL(unregister_pm_notifier);
-int pm_notifier_call_chain(unsigned long val)
+int __pm_notifier_call_chain(unsigned long val, int nr_to_call, int *nr_calls)
{
- int ret = blocking_notifier_call_chain(&pm_chain_head, val, NULL);
+ int ret;
+
+ ret = __blocking_notifier_call_chain(&pm_chain_head, val, NULL,
+ nr_to_call, nr_calls);
return notifier_to_errno(ret);
}
+int pm_notifier_call_chain(unsigned long val)
+{
+ return __pm_notifier_call_chain(val, -1, NULL);
+}
+
+#ifdef CONFIG_SEC_PM_DEBUG
+void *pm_notifier_call_chain_get_callback(int nr_calls)
+{
+ struct notifier_block *nb, *next_nb;
+ int nr_to_call = nr_calls;
+
+ nb = rcu_dereference_raw(pm_chain_head.head);
+
+ while (nb && nr_to_call) {
+ next_nb = rcu_dereference_raw(nb->next);
+ nb = next_nb;
+ nr_to_call--;
+ }
+
+ if (nb)
+ return (void *)nb->notifier_call;
+ else
+ return ERR_PTR(-ENODATA);
+}
+#endif /* CONFIG_SEC_PM_DEBUG */
/* If set, devices may be suspended and resumed asynchronously. */
int pm_async_enabled = 1;
diff --git a/kernel/power/power.h b/kernel/power/power.h
old mode 100644
new mode 100755
index 25367fc..02d80ef
--- a/kernel/power/power.h
+++ b/kernel/power/power.h
@@ -188,7 +188,12 @@
#ifdef CONFIG_PM_SLEEP
/* kernel/power/main.c */
+extern int __pm_notifier_call_chain(unsigned long val, int nr_to_call,
+ int *nr_calls);
extern int pm_notifier_call_chain(unsigned long val);
+#ifdef CONFIG_SEC_PM_DEBUG
+extern void *pm_notifier_call_chain_get_callback(int nr_calls);
+#endif /* CONFIG_SEC_PM_DEBUG */
#endif
#ifdef CONFIG_HIGHMEM
diff --git a/kernel/power/poweroff.c b/kernel/power/poweroff.c
old mode 100644
new mode 100755
diff --git a/kernel/power/process.c b/kernel/power/process.c
old mode 100644
new mode 100755
index ba2029a..cc17714
--- a/kernel/power/process.c
+++ b/kernel/power/process.c
@@ -18,6 +18,7 @@
#include <linux/workqueue.h>
#include <linux/kmod.h>
#include <trace/events/power.h>
+#include <linux/wakeup_reason.h>
#include <linux/cpuset.h>
/*
@@ -36,6 +37,9 @@
unsigned int elapsed_msecs;
bool wakeup = false;
int sleep_usecs = USEC_PER_MSEC;
+#ifdef CONFIG_PM_SLEEP
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+#endif
do_gettimeofday(&start);
@@ -65,6 +69,11 @@
break;
if (pm_wakeup_pending()) {
+#ifdef CONFIG_PM_SLEEP
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
+#endif
wakeup = true;
break;
}
@@ -84,15 +93,17 @@
do_div(elapsed_msecs64, NSEC_PER_MSEC);
elapsed_msecs = elapsed_msecs64;
- if (todo) {
+ if (wakeup) {
pr_cont("\n");
- pr_err("Freezing of tasks %s after %d.%03d seconds "
- "(%d tasks refusing to freeze, wq_busy=%d):\n",
- wakeup ? "aborted" : "failed",
+ pr_err("Freezing of tasks aborted after %d.%03d seconds",
+ elapsed_msecs / 1000, elapsed_msecs % 1000);
+ } else if (todo) {
+ pr_cont("\n");
+ pr_err("Freezing of tasks failed after %d.%03d seconds"
+ " (%d tasks refusing to freeze, wq_busy=%d):\n",
elapsed_msecs / 1000, elapsed_msecs % 1000,
todo - wq_busy, wq_busy);
- if (!wakeup) {
read_lock(&tasklist_lock);
for_each_process_thread(g, p) {
if (p != current && !freezer_should_skip(p)
@@ -100,7 +111,6 @@
sched_show_task(p);
}
read_unlock(&tasklist_lock);
- }
} else {
pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
elapsed_msecs % 1000);
diff --git a/kernel/power/qos.c b/kernel/power/qos.c
old mode 100644
new mode 100755
index 97b0df7..54dae3c
--- a/kernel/power/qos.c
+++ b/kernel/power/qos.c
@@ -92,6 +92,88 @@
};
+static BLOCKING_NOTIFIER_HEAD(device_throughput_notifier);
+static struct pm_qos_constraints device_tput_constraints = {
+ .list = PLIST_HEAD_INIT(device_tput_constraints.list),
+ .target_value = PM_QOS_DEVICE_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_DEVICE_THROUGHPUT_DEFAULT_VALUE,
+ .type = PM_QOS_FORCE_MAX,
+ .notifiers = &device_throughput_notifier,
+};
+static struct pm_qos_object device_throughput_pm_qos = {
+ .constraints = &device_tput_constraints,
+ .name = "device_throughput",
+};
+
+#ifdef CONFIG_ARM_EXYNOS_DEVFREQ_DEBUG
+static BLOCKING_NOTIFIER_HEAD(device_throughput_max_notifier);
+static struct pm_qos_constraints device_tput_max_constraints = {
+ .list = PLIST_HEAD_INIT(device_tput_max_constraints.list),
+ .target_value = PM_QOS_DEVICE_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_DEVICE_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &device_throughput_max_notifier,
+};
+static struct pm_qos_object device_throughput_max_pm_qos = {
+ .constraints = &device_tput_max_constraints,
+ .name = "device_throughput_max",
+};
+#endif
+
+static BLOCKING_NOTIFIER_HEAD(intcam_throughput_notifier);
+static struct pm_qos_constraints intcam_tput_constraints = {
+ .list = PLIST_HEAD_INIT(intcam_tput_constraints.list),
+ .target_value = PM_QOS_INTCAM_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_INTCAM_THROUGHPUT_DEFAULT_VALUE,
+ .type = PM_QOS_FORCE_MAX,
+ .notifiers = &intcam_throughput_notifier,
+};
+static struct pm_qos_object intcam_throughput_pm_qos = {
+ .constraints = &intcam_tput_constraints,
+ .name = "intcam_throughput",
+};
+
+#ifdef CONFIG_ARM_EXYNOS_DEVFREQ_DEBUG
+static BLOCKING_NOTIFIER_HEAD(intcam_throughput_max_notifier);
+static struct pm_qos_constraints intcam_tput_max_constraints = {
+ .list = PLIST_HEAD_INIT(intcam_tput_max_constraints.list),
+ .target_value = PM_QOS_INTCAM_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_INTCAM_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &intcam_throughput_max_notifier,
+};
+static struct pm_qos_object intcam_throughput_max_pm_qos = {
+ .constraints = &intcam_tput_max_constraints,
+ .name = "intcam_throughput_max",
+};
+#endif
+
+static BLOCKING_NOTIFIER_HEAD(bus_throughput_notifier);
+static struct pm_qos_constraints bus_tput_constraints = {
+ .list = PLIST_HEAD_INIT(bus_tput_constraints.list),
+ .target_value = PM_QOS_BUS_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_BUS_THROUGHPUT_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &bus_throughput_notifier,
+};
+static struct pm_qos_object bus_throughput_pm_qos = {
+ .constraints = &bus_tput_constraints,
+ .name = "bus_throughput",
+};
+
+static BLOCKING_NOTIFIER_HEAD(bus_throughput_max_notifier);
+static struct pm_qos_constraints bus_tput_max_constraints = {
+ .list = PLIST_HEAD_INIT(bus_tput_max_constraints.list),
+ .target_value = PM_QOS_BUS_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_BUS_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &bus_throughput_max_notifier,
+};
+static struct pm_qos_object bus_throughput_max_pm_qos = {
+ .constraints = &bus_tput_max_constraints,
+ .name = "bus_throughput_max",
+};
+
static BLOCKING_NOTIFIER_HEAD(network_throughput_notifier);
static struct pm_qos_constraints network_tput_constraints = {
.list = PLIST_HEAD_INIT(network_tput_constraints.list),
@@ -106,7 +188,6 @@
.name = "network_throughput",
};
-
static BLOCKING_NOTIFIER_HEAD(memory_bandwidth_notifier);
static struct pm_qos_constraints memory_bw_constraints = {
.list = PLIST_HEAD_INIT(memory_bw_constraints.list),
@@ -121,13 +202,224 @@
.name = "memory_bandwidth",
};
+static BLOCKING_NOTIFIER_HEAD(cpu_online_min_notifier);
+static struct pm_qos_constraints cpu_online_min_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_online_min_constraints.list),
+ .target_value = PM_QOS_CPU_ONLINE_MIN_DEFAULT_VALUE,
+ .default_value = PM_QOS_CPU_ONLINE_MIN_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &cpu_online_min_notifier,
+};
+static struct pm_qos_object cpu_online_min_pm_qos = {
+ .constraints = &cpu_online_min_constraints,
+ .name = "cpu_online_min",
+};
+
+static BLOCKING_NOTIFIER_HEAD(cpu_online_max_notifier);
+static struct pm_qos_constraints cpu_online_max_constraints = {
+ .list = PLIST_HEAD_INIT(cpu_online_max_constraints.list),
+ .target_value = PM_QOS_CPU_ONLINE_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_CPU_ONLINE_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &cpu_online_max_notifier,
+};
+static struct pm_qos_object cpu_online_max_pm_qos = {
+ .constraints = &cpu_online_max_constraints,
+ .name = "cpu_online_max",
+};
+
+static BLOCKING_NOTIFIER_HEAD(cluster1_freq_min_notifier);
+static struct pm_qos_constraints cluster1_freq_min_constraints = {
+ .list = PLIST_HEAD_INIT(cluster1_freq_min_constraints.list),
+ .target_value = PM_QOS_CLUSTER1_FREQ_MIN_DEFAULT_VALUE,
+ .default_value = PM_QOS_CLUSTER1_FREQ_MIN_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &cluster1_freq_min_notifier,
+};
+static struct pm_qos_object cluster1_freq_min_pm_qos = {
+ .constraints = &cluster1_freq_min_constraints,
+ .name = "cluster1_freq_min",
+};
+
+static BLOCKING_NOTIFIER_HEAD(cluster1_freq_max_notifier);
+static struct pm_qos_constraints cluster1_freq_max_constraints = {
+ .list = PLIST_HEAD_INIT(cluster1_freq_max_constraints.list),
+ .target_value = PM_QOS_CLUSTER1_FREQ_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_CLUSTER1_FREQ_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &cluster1_freq_max_notifier,
+};
+static struct pm_qos_object cluster1_freq_max_pm_qos = {
+ .constraints = &cluster1_freq_max_constraints,
+ .name = "cluster1_freq_max",
+};
+
+static BLOCKING_NOTIFIER_HEAD(cluster0_freq_min_notifier);
+static struct pm_qos_constraints cluster0_freq_min_constraints = {
+ .list = PLIST_HEAD_INIT(cluster0_freq_min_constraints.list),
+ .target_value = PM_QOS_CLUSTER0_FREQ_MIN_DEFAULT_VALUE,
+ .default_value = PM_QOS_CLUSTER0_FREQ_MIN_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &cluster0_freq_min_notifier,
+};
+static struct pm_qos_object cluster0_freq_min_pm_qos = {
+ .constraints = &cluster0_freq_min_constraints,
+ .name = "cluster0_freq_min",
+};
+
+static BLOCKING_NOTIFIER_HEAD(cluster0_freq_max_notifier);
+static struct pm_qos_constraints cluster0_freq_max_constraints = {
+ .list = PLIST_HEAD_INIT(cluster0_freq_max_constraints.list),
+ .target_value = PM_QOS_CLUSTER0_FREQ_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_CLUSTER0_FREQ_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &cluster0_freq_max_notifier,
+};
+static struct pm_qos_object cluster0_freq_max_pm_qos = {
+ .constraints = &cluster0_freq_max_constraints,
+ .name = "cluster0_freq_max",
+};
+
+static BLOCKING_NOTIFIER_HEAD(display_throughput_notifier);
+static struct pm_qos_constraints display_tput_constraints = {
+ .list = PLIST_HEAD_INIT(display_tput_constraints.list),
+ .target_value = PM_QOS_DISPLAY_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_DISPLAY_THROUGHPUT_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &display_throughput_notifier,
+};
+static struct pm_qos_object display_throughput_pm_qos = {
+ .constraints = &display_tput_constraints,
+ .name = "display_throughput",
+};
+
+#ifdef CONFIG_ARM_EXYNOS_DEVFREQ_DEBUG
+static BLOCKING_NOTIFIER_HEAD(display_throughput_max_notifier);
+static struct pm_qos_constraints display_tput_max_constraints = {
+ .list = PLIST_HEAD_INIT(display_tput_max_constraints.list),
+ .target_value = PM_QOS_DISPLAY_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_DISPLAY_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &display_throughput_max_notifier,
+};
+static struct pm_qos_object display_throughput_max_pm_qos = {
+ .constraints = &display_tput_max_constraints,
+ .name = "display_throughput_max",
+};
+#endif
+
+static BLOCKING_NOTIFIER_HEAD(cam_throughput_notifier);
+static struct pm_qos_constraints cam_tput_constraints = {
+ .list = PLIST_HEAD_INIT(cam_tput_constraints.list),
+ .target_value = PM_QOS_CAM_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_CAM_THROUGHPUT_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &cam_throughput_notifier,
+};
+static struct pm_qos_object cam_throughput_pm_qos = {
+ .constraints = &cam_tput_constraints,
+ .name = "cam_throughput",
+};
+
+static BLOCKING_NOTIFIER_HEAD(aud_throughput_notifier);
+static struct pm_qos_constraints aud_tput_constraints = {
+ .list = PLIST_HEAD_INIT(aud_tput_constraints.list),
+ .target_value = PM_QOS_AUD_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_AUD_THROUGHPUT_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &aud_throughput_notifier,
+};
+static struct pm_qos_object aud_throughput_pm_qos = {
+ .constraints = &aud_tput_constraints,
+ .name = "aud_throughput",
+};
+
+static BLOCKING_NOTIFIER_HEAD(fsys_throughput_notifier);
+static struct pm_qos_constraints fsys_tput_constraints = {
+ .list = PLIST_HEAD_INIT(fsys_tput_constraints.list),
+ .target_value = PM_QOS_FSYS_THROUGHPUT_DEFAULT_VALUE,
+ .default_value = PM_QOS_FSYS_THROUGHPUT_DEFAULT_VALUE,
+ .type = PM_QOS_MAX,
+ .notifiers = &fsys_throughput_notifier,
+};
+static struct pm_qos_object fsys_throughput_pm_qos = {
+ .constraints = &fsys_tput_constraints,
+ .name = "fsys_throughput",
+};
+
+#ifdef CONFIG_ARM_EXYNOS_DEVFREQ_DEBUG
+static BLOCKING_NOTIFIER_HEAD(cam_throughput_max_notifier);
+static struct pm_qos_constraints cam_tput_max_constraints = {
+ .list = PLIST_HEAD_INIT(cam_tput_max_constraints.list),
+ .target_value = PM_QOS_CAM_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_CAM_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &cam_throughput_max_notifier,
+};
+static struct pm_qos_object cam_throughput_max_pm_qos = {
+ .constraints = &cam_tput_max_constraints,
+ .name = "cam_throughput_max",
+};
+
+static BLOCKING_NOTIFIER_HEAD(aud_throughput_max_notifier);
+static struct pm_qos_constraints aud_tput_max_constraints = {
+ .list = PLIST_HEAD_INIT(aud_tput_max_constraints.list),
+ .target_value = PM_QOS_AUD_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_AUD_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &aud_throughput_max_notifier,
+};
+static struct pm_qos_object aud_throughput_max_pm_qos = {
+ .constraints = &aud_tput_max_constraints,
+ .name = "aud_throughput_max",
+};
+
+static BLOCKING_NOTIFIER_HEAD(fsys_throughput_max_notifier);
+static struct pm_qos_constraints fsys_tput_max_constraints = {
+ .list = PLIST_HEAD_INIT(fsys_tput_max_constraints.list),
+ .target_value = PM_QOS_FSYS_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .default_value = PM_QOS_FSYS_THROUGHPUT_MAX_DEFAULT_VALUE,
+ .type = PM_QOS_MIN,
+ .notifiers = &fsys_throughput_max_notifier,
+};
+static struct pm_qos_object fsys_throughput_max_pm_qos = {
+ .constraints = &fsys_tput_max_constraints,
+ .name = "fsys_throughput_max",
+};
+#endif
static struct pm_qos_object *pm_qos_array[] = {
&null_pm_qos,
&cpu_dma_pm_qos,
&network_lat_pm_qos,
+ &cluster0_freq_min_pm_qos,
+ &cluster0_freq_max_pm_qos,
+ &cluster1_freq_min_pm_qos,
+ &cluster1_freq_max_pm_qos,
+ &device_throughput_pm_qos,
+ &intcam_throughput_pm_qos,
+#ifdef CONFIG_ARM_EXYNOS_DEVFREQ_DEBUG
+ &device_throughput_max_pm_qos,
+ &intcam_throughput_max_pm_qos,
+#endif
+ &bus_throughput_pm_qos,
+ &bus_throughput_max_pm_qos,
&network_throughput_pm_qos,
&memory_bandwidth_pm_qos,
+ &cpu_online_min_pm_qos,
+ &cpu_online_max_pm_qos,
+ &display_throughput_pm_qos,
+#ifdef CONFIG_ARM_EXYNOS_DEVFREQ_DEBUG
+ &display_throughput_max_pm_qos,
+#endif
+ &cam_throughput_pm_qos,
+ &aud_throughput_pm_qos,
+ &fsys_throughput_pm_qos,
+#ifdef CONFIG_ARM_EXYNOS_DEVFREQ_DEBUG
+ &cam_throughput_max_pm_qos,
+ &aud_throughput_max_pm_qos,
+ &fsys_throughput_max_pm_qos,
+#endif
};
static ssize_t pm_qos_power_write(struct file *filp, const char __user *buf,
@@ -159,6 +451,7 @@
return plist_first(&c->list)->prio;
case PM_QOS_MAX:
+ case PM_QOS_FORCE_MAX:
return plist_last(&c->list)->prio;
case PM_QOS_SUM:
@@ -234,8 +527,10 @@
state = "Active";
}
tot_reqs++;
- seq_printf(s, "%d: %d: %s\n", tot_reqs,
- (req->node).prio, state);
+ seq_printf(s, "%d: %d: %s(%s:%d)\n", tot_reqs,
+ (req->node).prio, state,
+ req->func,
+ req->line);
}
seq_printf(s, "Type=%s, Value=%d, Requests: active=%d / total=%d\n",
@@ -271,13 +566,15 @@
* otherwise.
*/
int pm_qos_update_target(struct pm_qos_constraints *c, struct plist_node *node,
- enum pm_qos_req_action action, int value)
+ enum pm_qos_req_action action, int value, void *notify_param)
{
unsigned long flags;
int prev_value, curr_value, new_value;
+ struct pm_qos_request *req;
int ret;
spin_lock_irqsave(&pm_qos_lock, flags);
+
prev_value = pm_qos_get_value(c);
if (value == PM_QOS_DEFAULT_VALUE)
new_value = c->default_value;
@@ -310,12 +607,28 @@
spin_unlock_irqrestore(&pm_qos_lock, flags);
trace_pm_qos_update_target(action, prev_value, curr_value);
+
+ /*
+ * Send the class of the PM QoS request as the notifier parameter
+ * when notify_param is NULL.
+ */
+ if (!notify_param) {
+ req = container_of(node, struct pm_qos_request, node);
+ notify_param = (void *)(&req->pm_qos_class);
+ }
+
+ if (c->type == PM_QOS_FORCE_MAX) {
+ blocking_notifier_call_chain(c->notifiers,
+ (unsigned long)curr_value,
+ notify_param);
+ return 1;
+ }
+
if (prev_value != curr_value) {
ret = 1;
if (c->notifiers)
blocking_notifier_call_chain(c->notifiers,
(unsigned long)curr_value,
- NULL);
+ notify_param);
} else {
ret = 0;
}
@@ -323,6 +636,50 @@
}
/**
+ * pm_qos_update_constraints - update the attributes of a constraints object
+ * @pm_qos_class: identification of which qos value is requested
+ * @constraints: constraints data struct holding the new attribute values
+ *
+ * This function overwrites the non-zero attributes of the constraints object
+ * registered for @pm_qos_class with the values from @constraints.
+ */
+int pm_qos_update_constraints(int pm_qos_class,
+ struct pm_qos_constraints *constraints)
+{
+ struct pm_qos_constraints *r_constraints;
+ int ret = -EINVAL;
+ int i;
+
+ if (!constraints) {
+ printk(KERN_ERR "%s: invalid constraints\n",
+ __func__);
+ return ret;
+ }
+
+ for (i = 1; i < PM_QOS_NUM_CLASSES; i++) {
+ if (i != pm_qos_class)
+ continue;
+
+ r_constraints = pm_qos_array[i]->constraints;
+
+ if (constraints->target_value)
+ r_constraints->target_value = constraints->target_value;
+ if (constraints->default_value)
+ r_constraints->default_value = constraints->default_value;
+ if (constraints->type)
+ r_constraints->type = constraints->type;
+ if (constraints->notifiers)
+ r_constraints->notifiers = constraints->notifiers;
+
+ return 0;
+ }
+
+ printk(KERN_ERR "%s: no search PM QoS CLASS(%d)\n",
+ __func__, pm_qos_class);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(pm_qos_update_constraints);
+
+/**
* pm_qos_flags_remove_req - Remove device PM QoS flags request.
* @pqf: Device PM QoS flags set to remove the request from.
* @req: Request to remove from the set.
@@ -387,6 +744,33 @@
}
/**
+ * pm_qos_read_req_value - returns requested qos value
+ * @pm_qos_class: identification of which qos value is requested
+ * @req: request whose currently set value should be returned
+ *
+ * This function returns the value currently requested by @req, for use by
+ * the sysfs node.
+ */
+int pm_qos_read_req_value(int pm_qos_class, struct pm_qos_request *req)
+{
+ struct plist_node *p;
+ unsigned long flags;
+
+ spin_lock_irqsave(&pm_qos_lock, flags);
+
+ plist_for_each(p, &pm_qos_array[pm_qos_class]->constraints->list) {
+ if (req == container_of(p, struct pm_qos_request, node)) {
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+ return p->prio;
+ }
+ }
+
+ spin_unlock_irqrestore(&pm_qos_lock, flags);
+
+ return -ENODATA;
+}
+EXPORT_SYMBOL_GPL(pm_qos_read_req_value);
+
+/**
* pm_qos_request - returns current system wide qos expectation
* @pm_qos_class: identification of which qos value is requested
*
@@ -405,14 +789,14 @@
EXPORT_SYMBOL_GPL(pm_qos_request_active);
static void __pm_qos_update_request(struct pm_qos_request *req,
- s32 new_value)
+ s32 new_value, void *notify_param)
{
trace_pm_qos_update_request(req->pm_qos_class, new_value);
if (new_value != req->node.prio)
pm_qos_update_target(
pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ &req->node, PM_QOS_UPDATE_REQ, new_value, notify_param);
}
/**
@@ -427,7 +811,7 @@
struct pm_qos_request,
work);
- __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE);
+ __pm_qos_update_request(req, PM_QOS_DEFAULT_VALUE, NULL);
}
/**
@@ -443,7 +827,8 @@
* removal.
*/
-void pm_qos_add_request(struct pm_qos_request *req,
+void pm_qos_add_request_trace(char *func, unsigned int line,
+ struct pm_qos_request *req,
int pm_qos_class, s32 value)
{
if (!req) /*guard against callers passing in null */
@@ -454,12 +839,14 @@
return;
}
req->pm_qos_class = pm_qos_class;
+ req->func = func;
+ req->line = line;
INIT_DELAYED_WORK(&req->work, pm_qos_work_fn);
trace_pm_qos_add_request(pm_qos_class, value);
pm_qos_update_target(pm_qos_array[pm_qos_class]->constraints,
- &req->node, PM_QOS_ADD_REQ, value);
+ &req->node, PM_QOS_ADD_REQ, value, NULL);
}
-EXPORT_SYMBOL_GPL(pm_qos_add_request);
+EXPORT_SYMBOL_GPL(pm_qos_add_request_trace);
/**
* pm_qos_update_request - modifies an existing qos request
@@ -482,12 +869,43 @@
return;
}
- cancel_delayed_work_sync(&req->work);
- __pm_qos_update_request(req, new_value);
+ if (delayed_work_pending(&req->work))
+ cancel_delayed_work_sync(&req->work);
+
+ __pm_qos_update_request(req, new_value, NULL);
}
EXPORT_SYMBOL_GPL(pm_qos_update_request);
/**
+ * pm_qos_update_request_param - modifies an existing qos request
+ * @req : handle to list element holding a pm_qos request to use
+ * @new_value: defines the qos request
+ * @notify_param: notifier parameter
+ *
+ * Updates an existing qos request for the pm_qos_class of parameters along
+ * with updating the target pm_qos_class value.
+ *
+ * Attempts are made to make this code callable on hot code paths.
+ */
+void pm_qos_update_request_param(struct pm_qos_request *req,
+ s32 new_value, void *notify_param)
+{
+ if (!req) /*guard against callers passing in null */
+ return;
+
+ if (!pm_qos_request_active(req)) {
+ WARN(1, KERN_ERR "pm_qos_update_request() called for unknown object\n");
+ return;
+ }
+
+ if (delayed_work_pending(&req->work))
+ cancel_delayed_work_sync(&req->work);
+
+ __pm_qos_update_request(req, new_value, notify_param);
+}
+EXPORT_SYMBOL_GPL(pm_qos_update_request_param);
+
+/**
* pm_qos_update_request_timeout - modifies an existing qos request temporarily.
* @req : handle to list element holding a pm_qos request to use
* @new_value: defines the temporal qos request
@@ -504,14 +922,15 @@
"%s called for unknown object.", __func__))
return;
- cancel_delayed_work_sync(&req->work);
+ if (delayed_work_pending(&req->work))
+ cancel_delayed_work_sync(&req->work);
trace_pm_qos_update_request_timeout(req->pm_qos_class,
new_value, timeout_us);
if (new_value != req->node.prio)
pm_qos_update_target(
pm_qos_array[req->pm_qos_class]->constraints,
- &req->node, PM_QOS_UPDATE_REQ, new_value);
+ &req->node, PM_QOS_UPDATE_REQ, new_value, NULL);
schedule_delayed_work(&req->work, usecs_to_jiffies(timeout_us));
}
@@ -535,12 +954,13 @@
return;
}
- cancel_delayed_work_sync(&req->work);
+ if (delayed_work_pending(&req->work))
+ cancel_delayed_work_sync(&req->work);
trace_pm_qos_remove_request(req->pm_qos_class, PM_QOS_DEFAULT_VALUE);
pm_qos_update_target(pm_qos_array[req->pm_qos_class]->constraints,
&req->node, PM_QOS_REMOVE_REQ,
- PM_QOS_DEFAULT_VALUE);
+ PM_QOS_DEFAULT_VALUE, NULL);
memset(req, 0, sizeof(*req));
}
EXPORT_SYMBOL_GPL(pm_qos_remove_request);
@@ -685,7 +1105,6 @@
return count;
}
-
static int __init pm_qos_power_init(void)
{
int ret = 0;
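
The qos.c changes above record the caller's function name and line in each request (surfaced in the per-request debug listing) and let an update forward a parameter to the class notifier chain. A minimal driver-side sketch under two assumptions: that PM_QOS_BUS_THROUGHPUT is the class enum paired with PM_QOS_BUS_THROUGHPUT_DEFAULT_VALUE, and that pm_qos_add_request() is now a header macro wrapping pm_qos_add_request_trace():

    #include <linux/pm_qos.h>

    static struct pm_qos_request bus_qos_req;

    static void example_bus_qos_start(void)
    {
            /* func/line end up in the per-request debug listing */
            pm_qos_add_request_trace((char *)__func__, __LINE__,
                                     &bus_qos_req, PM_QOS_BUS_THROUGHPUT, 800000);
    }

    static void example_bus_qos_update(s32 value, void *notify_param)
    {
            /* notify_param is passed through to the class notifier chain */
            pm_qos_update_request_param(&bus_qos_req, value, notify_param);
    }

    static void example_bus_qos_stop(void)
    {
            pm_qos_remove_request(&bus_qos_req);
    }
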
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
old mode 100644
new mode 100755
diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c
old mode 100644
new mode 100755
index f9fe133..5c6085b
--- a/kernel/power/suspend.c
+++ b/kernel/power/suspend.c
@@ -26,9 +26,11 @@
#include <linux/suspend.h>
#include <linux/syscore_ops.h>
#include <linux/ftrace.h>
+#include <linux/rtc.h>
#include <trace/events/power.h>
#include <linux/compiler.h>
#include <linux/moduleparam.h>
+#include <linux/wakeup_reason.h>
#include "power.h"
@@ -266,16 +268,51 @@
*/
static int suspend_prepare(suspend_state_t state)
{
- int error;
+ int error, nr_calls = 0;
if (!sleep_state_supported(state))
return -EPERM;
pm_prepare_console();
- error = pm_notifier_call_chain(PM_SUSPEND_PREPARE);
- if (error)
+ error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
+ if (error) {
+#ifdef CONFIG_SEC_PM_DEBUG
+ void *callback;
+
+ callback = pm_notifier_call_chain_get_callback(nr_calls - 1);
+
+ if (IS_ERR(callback)) {
+ pr_info("PM_SUSPEND_PREPARE failed: %d\n",
+ nr_calls);
+ log_suspend_abort_reason("PM_SUSPEND_PREPARE failed: "
+ "%d", nr_calls);
+ } else {
+ pr_info("PM_SUSPEND_PREPARE failed: %d (%ps)\n",
+ nr_calls, callback);
+ log_suspend_abort_reason("PM_SUSPEND_PREPARE failed: "
+ "%ps (%d)", callback, nr_calls);
+ }
+#endif /* CONFIG_SEC_PM_DEBUG */
+ nr_calls--;
goto Finish;
+ }
+
+#ifndef CONFIG_SUSPEND_SKIP_SYNC
+ trace_suspend_resume(TPS("sync_filesystems"), 0, true);
+ printk(KERN_INFO "PM: Syncing filesystems ... ");
+ if (intr_sync(NULL)) {
+ printk("canceled.\n");
+ trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+ error = -EBUSY;
+#ifdef CONFIG_SEC_PM_DEBUG
+ log_suspend_abort_reason("intr_sync failed");
+#endif /* CONFIG_SEC_PM_DEBUG */
+ goto Finish;
+ }
+ printk("done.\n");
+ trace_suspend_resume(TPS("sync_filesystems"), 0, false);
+#endif
trace_suspend_resume(TPS("freeze_processes"), 0, true);
error = suspend_freeze_processes();
@@ -285,8 +322,12 @@
suspend_stats.failed_freeze++;
dpm_save_failed_step(SUSPEND_FREEZE);
+
+#ifdef CONFIG_SEC_PM_DEBUG
+ log_suspend_abort_reason("Freezing processes failed: %d", error);
+#endif /* CONFIG_SEC_PM_DEBUG */
Finish:
- pm_notifier_call_chain(PM_POST_SUSPEND);
+ __pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
pm_restore_console();
return error;
}
@@ -312,7 +353,8 @@
*/
static int suspend_enter(suspend_state_t state, bool *wakeup)
{
- int error;
+ char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+ int error, last_dev;
error = platform_suspend_prepare(state);
if (error)
@@ -320,7 +362,11 @@
error = dpm_suspend_late(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
printk(KERN_ERR "PM: late suspend of devices failed\n");
+ log_suspend_abort_reason("%s device failed to power down",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_finish;
}
error = platform_suspend_prepare_late(state);
@@ -329,7 +375,11 @@
error = dpm_suspend_noirq(PMSG_SUSPEND);
if (error) {
+ last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+ last_dev %= REC_FAILED_NUM;
printk(KERN_ERR "PM: noirq suspend of devices failed\n");
+ log_suspend_abort_reason("noirq suspend of %s device failed",
+ suspend_stats.failed_devs[last_dev]);
goto Platform_early_resume;
}
error = platform_suspend_prepare_noirq(state);
@@ -353,8 +403,10 @@
}
error = disable_nonboot_cpus();
- if (error || suspend_test(TEST_CPUS))
+ if (error || suspend_test(TEST_CPUS)) {
+ log_suspend_abort_reason("Disabling non-boot cpus failed");
goto Enable_cpus;
+ }
arch_suspend_disable_irqs();
BUG_ON(!irqs_disabled());
@@ -370,6 +422,9 @@
state, false);
events_check_enabled = false;
} else if (*wakeup) {
+ pm_get_active_wakeup_sources(suspend_abort,
+ MAX_SUSPEND_ABORT_LEN);
+ log_suspend_abort_reason(suspend_abort);
error = -EBUSY;
}
syscore_resume();
@@ -417,6 +472,7 @@
error = dpm_suspend_start(PMSG_SUSPEND);
if (error) {
pr_err("PM: Some devices failed to suspend, or early wake event detected\n");
+ log_suspend_abort_reason("Some devices failed to suspend, or early wake event detected");
goto Recover_platform;
}
suspend_test_finish("suspend devices");
@@ -487,14 +543,6 @@
if (state == PM_SUSPEND_FREEZE)
freeze_begin();
-#ifndef CONFIG_SUSPEND_SKIP_SYNC
- trace_suspend_resume(TPS("sync_filesystems"), 0, true);
- printk(KERN_INFO "PM: Syncing filesystems ... ");
- sys_sync();
- printk("done.\n");
- trace_suspend_resume(TPS("sync_filesystems"), 0, false);
-#endif
-
pr_debug("PM: Preparing system for sleep (%s)\n", pm_states[state]);
pm_suspend_clear_flags();
error = suspend_prepare(state);
@@ -518,6 +566,18 @@
return error;
}
+static void pm_suspend_marker(char *annotation)
+{
+ struct timespec ts;
+ struct rtc_time tm;
+
+ getnstimeofday(&ts);
+ rtc_time_to_tm(ts.tv_sec, &tm);
+ pr_info("PM: suspend %s %d-%02d-%02d %02d:%02d:%02d.%09lu UTC\n",
+ annotation, tm.tm_year + 1900, tm.tm_mon + 1, tm.tm_mday,
+ tm.tm_hour, tm.tm_min, tm.tm_sec, ts.tv_nsec);
+}
+
/**
* pm_suspend - Externally visible function for suspending the system.
* @state: System sleep state to enter.
@@ -532,6 +592,7 @@
if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
return -EINVAL;
+ pm_suspend_marker("entry");
error = enter_state(state);
if (error) {
suspend_stats.fail++;
@@ -539,6 +600,7 @@
} else {
suspend_stats.success++;
}
+ pm_suspend_marker("exit");
return error;
}
EXPORT_SYMBOL(pm_suspend);
diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
old mode 100644
new mode 100755
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
old mode 100644
new mode 100755
diff --git a/kernel/power/user.c b/kernel/power/user.c
old mode 100644
new mode 100755
index f83c187..bc6dde1
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -47,7 +47,7 @@
static int snapshot_open(struct inode *inode, struct file *filp)
{
struct snapshot_data *data;
- int error;
+ int error, nr_calls = 0;
if (!hibernation_available())
return -EPERM;
@@ -74,9 +74,9 @@
swap_type_of(swsusp_resume_device, 0, NULL) : -1;
data->mode = O_RDONLY;
data->free_bitmaps = false;
- error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+ error = __pm_notifier_call_chain(PM_HIBERNATION_PREPARE, -1, &nr_calls);
if (error)
- pm_notifier_call_chain(PM_POST_HIBERNATION);
+ __pm_notifier_call_chain(PM_POST_HIBERNATION, --nr_calls, NULL);
} else {
/*
* Resuming. We may need to wait for the image device to
@@ -86,13 +86,15 @@
data->swap = -1;
data->mode = O_WRONLY;
- error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+ error = __pm_notifier_call_chain(PM_RESTORE_PREPARE, -1, &nr_calls);
if (!error) {
error = create_basic_memory_bitmaps();
data->free_bitmaps = !error;
- }
+ } else
+ nr_calls--;
+
if (error)
- pm_notifier_call_chain(PM_POST_RESTORE);
+ __pm_notifier_call_chain(PM_POST_RESTORE, nr_calls, NULL);
}
if (error)
atomic_inc(&snapshot_device_available);
diff --git a/kernel/power/wakelock.c b/kernel/power/wakelock.c
old mode 100644
new mode 100755
index 1896386..95703f0
--- a/kernel/power/wakelock.c
+++ b/kernel/power/wakelock.c
@@ -203,9 +203,6 @@
size_t len;
int ret = 0;
- if (!capable(CAP_BLOCK_SUSPEND))
- return -EPERM;
-
while (*str && !isspace(*str))
str++;
@@ -249,9 +246,6 @@
size_t len;
int ret = 0;
- if (!capable(CAP_BLOCK_SUSPEND))
- return -EPERM;
-
len = strlen(buf);
if (!len)
return -EINVAL;
diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c
new file mode 100755
index 0000000..018ce50
--- /dev/null
+++ b/kernel/power/wakeup_reason.c
@@ -0,0 +1,225 @@
+/*
+ * kernel/power/wakeup_reason.c
+ *
+ * Logs the reasons which caused the kernel to resume from
+ * the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/wakeup_reason.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+
+
+#define MAX_WAKEUP_REASON_IRQS 32
+static int irq_list[MAX_WAKEUP_REASON_IRQS];
+static int irqcount;
+static bool suspend_abort;
+static char abort_reason[MAX_SUSPEND_ABORT_LEN];
+static struct kobject *wakeup_reason;
+static DEFINE_SPINLOCK(resume_reason_lock);
+
+static ktime_t last_monotime; /* monotonic time before last suspend */
+static ktime_t curr_monotime; /* monotonic time after last suspend */
+static ktime_t last_stime; /* monotonic boottime offset before last suspend */
+static ktime_t curr_stime; /* monotonic boottime offset after last suspend */
+
+static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr,
+ char *buf)
+{
+ int irq_no, buf_offset = 0;
+ struct irq_desc *desc;
+ spin_lock(&resume_reason_lock);
+ if (suspend_abort) {
+ buf_offset = sprintf(buf, "Abort: %s", abort_reason);
+ } else {
+ for (irq_no = 0; irq_no < irqcount; irq_no++) {
+ desc = irq_to_desc(irq_list[irq_no]);
+ if (desc && desc->action && desc->action->name)
+ buf_offset += sprintf(buf + buf_offset, "%d %s\n",
+ irq_list[irq_no], desc->action->name);
+ else
+ buf_offset += sprintf(buf + buf_offset, "%d\n",
+ irq_list[irq_no]);
+ }
+ }
+ spin_unlock(&resume_reason_lock);
+ return buf_offset;
+}
+
+static ssize_t last_suspend_time_show(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ struct timespec sleep_time;
+ struct timespec total_time;
+ struct timespec suspend_resume_time;
+
+ /*
+ * total_time is calculated from the monotonic boottime offsets because,
+ * unlike CLOCK_MONOTONIC, they include the time spent in the suspend state.
+ */
+ total_time = ktime_to_timespec(ktime_sub(curr_stime, last_stime));
+
+ /*
+ * suspend_resume_time is the monotonic (CLOCK_MONOTONIC) interval between
+ * the point just before entering suspend and the point just after resume.
+ */
+ suspend_resume_time = ktime_to_timespec(ktime_sub(curr_monotime, last_monotime));
+
+ /* sleep_time = total_time - suspend_resume_time */
+ sleep_time = timespec_sub(total_time, suspend_resume_time);
+
+ /* Export suspend_resume_time and sleep_time in pair here. */
+ return sprintf(buf, "%lu.%09lu %lu.%09lu\n",
+ suspend_resume_time.tv_sec, suspend_resume_time.tv_nsec,
+ sleep_time.tv_sec, sleep_time.tv_nsec);
+}
+
+static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason);
+static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time);
+
+static struct attribute *attrs[] = {
+ &resume_reason.attr,
+ &suspend_time.attr,
+ NULL,
+};
+static struct attribute_group attr_group = {
+ .attrs = attrs,
+};
+
+/*
+ * Logs the wakeup reason to the kernel log and
+ * stores the IRQ so it can be exposed to userspace via sysfs.
+ */
+void log_wakeup_reason(int irq)
+{
+ struct irq_desc *desc;
+ desc = irq_to_desc(irq);
+ if (desc && desc->action && desc->action->name)
+ printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq,
+ desc->action->name);
+ else
+ printk(KERN_INFO "Resume caused by IRQ %d\n", irq);
+
+ spin_lock(&resume_reason_lock);
+ if (irqcount == MAX_WAKEUP_REASON_IRQS) {
+ spin_unlock(&resume_reason_lock);
+ printk(KERN_WARNING "Resume caused by more than %d IRQs\n",
+ MAX_WAKEUP_REASON_IRQS);
+ return;
+ }
+
+ irq_list[irqcount++] = irq;
+ spin_unlock(&resume_reason_lock);
+}
+
+int check_wakeup_reason(int irq)
+{
+ int irq_no;
+ int ret = false;
+
+ spin_lock(&resume_reason_lock);
+ for (irq_no = 0; irq_no < irqcount; irq_no++)
+ if (irq_list[irq_no] == irq) {
+ ret = true;
+ break;
+ }
+ spin_unlock(&resume_reason_lock);
+ return ret;
+}
+
+void log_suspend_abort_reason(const char *fmt, ...)
+{
+ va_list args;
+
+ spin_lock(&resume_reason_lock);
+
+ /* Suspend abort reason has already been logged. */
+ if (suspend_abort) {
+ spin_unlock(&resume_reason_lock);
+ return;
+ }
+
+ suspend_abort = true;
+ va_start(args, fmt);
+ vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args);
+ va_end(args);
+ spin_unlock(&resume_reason_lock);
+}
+
+/* Detects a suspend and clears all the previous wakeup reasons */
+static int wakeup_reason_pm_event(struct notifier_block *notifier,
+ unsigned long pm_event, void *unused)
+{
+ switch (pm_event) {
+ case PM_SUSPEND_PREPARE:
+ spin_lock(&resume_reason_lock);
+ irqcount = 0;
+ suspend_abort = false;
+ spin_unlock(&resume_reason_lock);
+ /* monotonic time since boot */
+ last_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ last_stime = ktime_get_boottime();
+ break;
+ case PM_POST_SUSPEND:
+ /* monotonic time since boot */
+ curr_monotime = ktime_get();
+ /* monotonic time since boot including the time spent in suspend */
+ curr_stime = ktime_get_boottime();
+ break;
+ default:
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block wakeup_reason_pm_notifier_block = {
+ .notifier_call = wakeup_reason_pm_event,
+};
+
+/* Initializes the sysfs attributes and
+ * registers the pm_event notifier
+ */
+int __init wakeup_reason_init(void)
+{
+ int retval;
+
+ retval = register_pm_notifier(&wakeup_reason_pm_notifier_block);
+ if (retval)
+ printk(KERN_WARNING "[%s] failed to register PM notifier %d\n",
+ __func__, retval);
+
+ wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj);
+ if (!wakeup_reason) {
+ printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n",
+ __func__);
+ return 1;
+ }
+ retval = sysfs_create_group(wakeup_reason, &attr_group);
+ if (retval) {
+ kobject_put(wakeup_reason);
+ printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n",
+ __func__, retval);
+ }
+ return 0;
+}
+
+subsys_initcall(wakeup_reason_init);
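
The new file publishes its data under /sys/kernel/wakeup_reasons/ (the kobject and attribute names created in wakeup_reason_init() above). A small userspace sketch that reads both nodes; the paths are derived from the code, not from separate documentation:

    #include <stdio.h>

    int main(void)
    {
            char line[256];
            FILE *f;

            f = fopen("/sys/kernel/wakeup_reasons/last_resume_reason", "r");
            if (f) {
                    while (fgets(line, sizeof(line), f))
                            fputs(line, stdout);   /* "<irq> <name>" lines or "Abort: ..." */
                    fclose(f);
            }

            f = fopen("/sys/kernel/wakeup_reasons/last_suspend_time", "r");
            if (f) {
                    if (fgets(line, sizeof(line), f))
                            /* "<suspend_resume_time> <sleep_time>" in seconds.nanoseconds */
                            fputs(line, stdout);
                    fclose(f);
            }
            return 0;
    }
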
diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
old mode 100644
new mode 100755
diff --git a/kernel/printk/braille.c b/kernel/printk/braille.c
old mode 100644
new mode 100755
diff --git a/kernel/printk/braille.h b/kernel/printk/braille.h
old mode 100644
new mode 100755
diff --git a/kernel/printk/console_cmdline.h b/kernel/printk/console_cmdline.h
old mode 100644
new mode 100755
diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c
old mode 100644
new mode 100755
index dd689ab..767605f
--- a/kernel/printk/printk.c
+++ b/kernel/printk/printk.c
@@ -46,8 +46,12 @@
#include <linux/utsname.h>
#include <linux/ctype.h>
#include <linux/uio.h>
+#ifdef CONFIG_SEC_DEBUG_AUTO_SUMMARY
+#include <linux/sec_debug.h>
+#endif
#include <asm/uaccess.h>
+#include <asm/cputype.h>
#define CREATE_TRACE_POINTS
#include <trace/events/printk.h>
@@ -55,6 +59,10 @@
#include "console_cmdline.h"
#include "braille.h"
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+extern void printascii(char *);
+#endif
+
int console_printk[4] = {
CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */
MESSAGE_LOGLEVEL_DEFAULT, /* default_message_loglevel */
@@ -232,6 +240,16 @@
u8 facility; /* syslog facility */
u8 flags:5; /* internal record flags */
u8 level:3; /* syslog level */
+#ifdef CONFIG_PRINTK_PROCESS
+ char process[16]; /* process name */
+ pid_t pid; /* process id */
+ u8 cpu; /* cpu id */
+ u8 in_interrupt; /* interrupt context */
+#endif
+#ifdef CONFIG_SEC_DEBUG_AUTO_SUMMARY
+ u8 for_auto_summary;
+ u8 type_auto_summary;
+#endif
};
/*
@@ -266,7 +284,18 @@
static u64 clear_seq;
static u32 clear_idx;
+/* { SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM - the next printk record to read after the last 'clear_knox' command */
+static u64 clear_seq_knox;
+static u32 clear_idx_knox;
+
+#define SYSLOG_ACTION_READ_CLEAR_KNOX 99
+/* } SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
+
+#ifdef CONFIG_PRINTK_PROCESS
+#define PREFIX_MAX 48
+#else
#define PREFIX_MAX 32
+#endif
#define LOG_LINE_MAX (1024 - PREFIX_MAX)
#define LOG_LEVEL(v) ((v) & 0x07)
@@ -393,6 +422,85 @@
return size;
}
+#ifdef CONFIG_PRINTK_PROCESS
+static bool printk_process = 1;
+static size_t print_process(const struct printk_log *msg, char *buf)
+{
+ if (!printk_process)
+ return 0;
+
+ if (!buf)
+ return snprintf(NULL, 0, "%c[%1d:%15s:%5d] ", ' ', 0, " ", 0);
+
+ return sprintf(buf, "%c[%1d:%15s:%5d] ",
+ msg->in_interrupt ? 'I' : ' ',
+ msg->cpu,
+ msg->process,
+ msg->pid);
+}
+#else
+static bool printk_process = 0;
+static size_t print_process(const struct printk_log *msg, char *buf)
+{
+ return 0;
+}
+#endif
+module_param_named(process, printk_process, bool, S_IRUGO | S_IWUSR);
+
+#ifdef CONFIG_SEC_DEBUG_AUTO_SUMMARY
+static void (*func_hook_auto_comm)(int type, const char *buf, size_t size);
+void register_set_auto_comm_buf(void (*func)(int type, const char *buf, size_t size))
+{
+ func_hook_auto_comm = func;
+}
+#endif
+
+#ifdef CONFIG_SEC_DEBUG_INIT_LOG
+static void (*func_hook_init_log)(const char *buf, size_t size);
+void register_init_log_hook_func(void (*func)(const char *buf, size_t size))
+{
+ func_hook_init_log = func;
+}
+#endif
+
+#ifdef CONFIG_EXYNOS_SNAPSHOT
+static size_t hook_size;
+static char hook_text[LOG_LINE_MAX + PREFIX_MAX];
+static void (*func_hook_logbuf)(const char *buf, size_t size);
+static size_t msg_print_text(const struct printk_log *msg, enum log_flags prev,
+ bool syslog, char *buf, size_t size);
+void register_hook_logbuf(void (*func)(const char *buf, size_t size))
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&logbuf_lock, flags);
+ /*
+ * When registering the hooking function, check for messages already
+ * printed to log_buf. If there are any, they are copied to the backup
+ * exynos log buffer.
+ */
+ if (log_first_seq != log_next_seq) {
+ unsigned int step_seq, step_idx, start, end;
+ struct printk_log *msg;
+ start = log_first_seq;
+ end = log_next_seq;
+ step_idx = log_first_idx;
+ for (step_seq = start; step_seq < end; step_seq++) {
+ msg = (struct printk_log *)(log_buf + step_idx);
+ hook_size = msg_print_text(msg, msg->flags,
+ true, hook_text, LOG_LINE_MAX + PREFIX_MAX);
+ func(hook_text, hook_size);
+ step_idx = log_next(step_idx);
+ }
+ }
+ func_hook_logbuf = func;
+ raw_spin_unlock_irqrestore(&logbuf_lock, flags);
+}
+EXPORT_SYMBOL(register_hook_logbuf);
+#endif
+
+
/*
* Define how much of the log buffer we could take at maximum. The value
* must be greater than two. Note that only half of the buffer is available
@@ -462,6 +570,13 @@
memcpy(log_dict(msg), dict, dict_len);
msg->dict_len = dict_len;
msg->facility = facility;
+
+#ifdef CONFIG_SEC_DEBUG_AUTO_SUMMARY
+ msg->for_auto_summary = (level / 10 == 9) ? 1 : 0;
+ msg->type_auto_summary = (level / 10 == 9) ? level - LOGLEVEL_PR_AUTO_BASE : 0;
+ level = (msg->for_auto_summary) ? 0 : level;
+#endif
+
msg->level = level & 7;
msg->flags = flags & 0x1f;
if (ts_nsec > 0)
@@ -471,6 +586,33 @@
memset(log_dict(msg) + dict_len, 0, pad_len);
msg->len = size;
+#ifdef CONFIG_PRINTK_PROCESS
+ if (printk_process) {
+ strncpy(msg->process, current->comm, sizeof(msg->process) - 1);
+ msg->process[sizeof(msg->process) - 1] = 0;
+ msg->pid = task_pid_nr(current);
+ msg->cpu = smp_processor_id();
+ msg->in_interrupt = in_interrupt() ? 1 : 0;
+ }
+#endif
+#ifdef CONFIG_EXYNOS_SNAPSHOT
+ if (func_hook_logbuf) {
+ hook_size = msg_print_text(msg, msg->flags,
+ true, hook_text, LOG_LINE_MAX + PREFIX_MAX);
+ func_hook_logbuf(hook_text, hook_size);
+
+#ifdef CONFIG_SEC_DEBUG_AUTO_SUMMARY
+ if (msg->for_auto_summary && func_hook_auto_comm)
+ func_hook_auto_comm(msg->type_auto_summary, hook_text, hook_size);
+#endif
+
+#ifdef CONFIG_SEC_DEBUG_INIT_LOG
+ if (task_pid_nr(current) == 1 && func_hook_init_log) {
+ func_hook_init_log(hook_text, hook_size);
+ }
+#endif
+ }
+#endif
/* insert message */
log_next_idx += msg->len;
log_next_seq++;
@@ -942,6 +1084,7 @@
if (!new_log_buf_len)
return;
+ set_memsize_kernel_type(MEMSIZE_KERNEL_LOGBUF);
if (early) {
new_log_buf =
memblock_virt_alloc(new_log_buf_len, LOG_ALIGN);
@@ -949,6 +1092,7 @@
new_log_buf = memblock_virt_alloc_nopanic(new_log_buf_len,
LOG_ALIGN);
}
+ set_memsize_kernel_type(MEMSIZE_KERNEL_OTHERS);
if (unlikely(!new_log_buf)) {
pr_err("log_buf_len: %ld bytes not available\n",
@@ -1078,6 +1222,7 @@
}
len += print_time(msg->ts_nsec, buf ? buf + len : NULL);
+ len += print_process(msg, buf ? buf + len : NULL);
return len;
}
@@ -1204,7 +1349,7 @@
return len;
}
-static int syslog_print_all(char __user *buf, int size, bool clear)
+static int syslog_print_all(char __user *buf, int size, bool clear, bool knox)
{
char *text;
int len = 0;
@@ -1219,19 +1364,33 @@
u64 seq;
u32 idx;
enum log_flags prev;
-
- if (clear_seq < log_first_seq) {
- /* messages are gone, move to first available one */
- clear_seq = log_first_seq;
- clear_idx = log_first_idx;
+
+ /* { SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
+ /* messages are gone, move to first available one */
+ if (!knox && clear_seq < log_first_seq) {
+ clear_seq = log_first_seq;
+ clear_idx = log_first_idx;
+ } else if (knox && clear_seq_knox < log_first_seq) {
+ clear_seq_knox = log_first_seq;
+ clear_idx_knox = log_first_idx;
}
+ /* } SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
/*
* Find first record that fits, including all following records,
* into the user-provided buffer for this dump.
*/
- seq = clear_seq;
- idx = clear_idx;
+
+ /* { SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
+ if (!knox) {
+ seq = clear_seq;
+ idx = clear_idx;
+ } else { /* MDM edmaudit */
+ seq = clear_seq_knox;
+ idx = clear_idx_knox;
+ }
+ /* } SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
+
prev = 0;
while (seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
@@ -1241,10 +1400,18 @@
idx = log_next(idx);
seq++;
}
-
+
+ /* { SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
/* move first record forward until length fits into the buffer */
- seq = clear_seq;
- idx = clear_idx;
+ if (!knox) {
+ seq = clear_seq;
+ idx = clear_idx;
+ } else { /* MDM edmaudit */
+ seq = clear_seq_knox;
+ idx = clear_idx_knox;
+ }
+ /* } SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
+
prev = 0;
while (len > size && seq < log_next_seq) {
struct printk_log *msg = log_from_idx(idx);
@@ -1289,10 +1456,18 @@
}
}
+ /* { SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
if (clear) {
- clear_seq = log_next_seq;
- clear_idx = log_next_idx;
+ if (!knox) {
+ clear_seq = log_next_seq;
+ clear_idx = log_next_idx;
+ } else { /* MDM edmaudit */
+ clear_seq_knox = log_next_seq;
+ clear_idx_knox = log_next_idx;
+ }
}
+ /* } SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
+
raw_spin_unlock_irq(&logbuf_lock);
kfree(text);
@@ -1347,11 +1522,11 @@
error = -EFAULT;
goto out;
}
- error = syslog_print_all(buf, len, clear);
+ error = syslog_print_all(buf, len, clear, false);
break;
/* Clear ring buffer */
case SYSLOG_ACTION_CLEAR:
- syslog_print_all(NULL, 0, true);
+ syslog_print_all(NULL, 0, true, false);
break;
/* Disable logging to console */
case SYSLOG_ACTION_CONSOLE_OFF:
@@ -1417,6 +1592,21 @@
case SYSLOG_ACTION_SIZE_BUFFER:
error = log_buf_len;
break;
+ /* { SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM edmaudit Read last kernel messages */
+ case SYSLOG_ACTION_READ_CLEAR_KNOX:
+ error = -EINVAL;
+ if (!buf || len < 0)
+ goto out;
+ error = 0;
+ if (!len)
+ goto out;
+ if (!access_ok(VERIFY_WRITE, buf, len)) {
+ error = -EFAULT;
+ goto out;
+ }
+ error = syslog_print_all(buf, len, /* clear */ true, /* knox */true);
+ break;
+ /* } SecProductFeature_KNOX.SEC_PRODUCT_FEATURE_KNOX_SUPPORT_MDM */
default:
error = -EINVAL;
break;
@@ -1640,6 +1830,8 @@
if (cont.cons == 0 && (console_prev & LOG_NEWLINE)) {
textlen += print_time(cont.ts_nsec, text);
+ *(text+textlen) = ' ';
+ textlen += print_process(NULL, NULL);
size -= textlen;
}
@@ -1674,6 +1866,7 @@
int this_cpu;
int printed_len = 0;
bool in_sched = false;
+
/* cpu currently holding logbuf_lock in this function */
static unsigned int logbuf_cpu = UINT_MAX;
@@ -1746,6 +1939,12 @@
if (level == LOGLEVEL_DEFAULT)
level = kern_level - '0';
/* fallthrough */
+#ifdef CONFIG_SEC_DEBUG_AUTO_SUMMARY
+ case 'B' ... 'J':
+ if (level == LOGLEVEL_DEFAULT)
+ level = LOGLEVEL_PR_AUTO_BASE + (kern_level - 'A'); /* 91 ~ 99 */
+ /* fallthrough */
+#endif
case 'd': /* KERN_DEFAULT */
lflags |= LOG_PREFIX;
}
@@ -1759,6 +1958,10 @@
}
}
+#ifdef CONFIG_EARLY_PRINTK_DIRECT
+ printascii(text);
+#endif
+
if (level == LOGLEVEL_DEFAULT)
level = default_message_loglevel;
@@ -2117,6 +2320,20 @@
}
/**
+ * console_flush - flush dmesg if console isn't suspended
+ *
+ * console_unlock always flushes the dmesg buffer, so just try to
+ * grab&drop the console lock. If that fails we know that the current
+ * holder will eventually drop the console lock and so flush the dmesg
+ * buffers at the earliest possible time.
+ */
+void console_flush(void)
+{
+ if (console_trylock())
+ console_unlock();
+}
+
+/**
* console_cpu_notify - print deferred console messages after CPU hotplug
* @self: notifier struct
* @action: CPU hotplug event
@@ -2135,8 +2352,7 @@
case CPU_DEAD:
case CPU_DOWN_FAILED:
case CPU_UP_CANCELED:
- console_lock();
- console_unlock();
+ console_flush();
}
return NOTIFY_OK;
}
@@ -2239,6 +2455,7 @@
unsigned long flags;
bool wake_klogd = false;
bool do_cond_resched, retry;
+ u64 next_seq_in_this_turn;
if (console_suspended) {
up_console_sem();
@@ -2261,6 +2478,7 @@
/* flush buffered message fragment immediately to console */
console_cont_flush(text, sizeof(text));
again:
+ next_seq_in_this_turn = log_next_seq;
for (;;) {
struct printk_log *msg;
size_t ext_len = 0;
@@ -2285,7 +2503,7 @@
len = 0;
}
skip:
- if (console_seq == log_next_seq)
+ if (console_seq >= next_seq_in_this_turn)
break;
msg = log_from_idx(console_idx);
@@ -3149,12 +3367,22 @@
*/
void dump_stack_print_info(const char *log_lvl)
{
+#ifdef CONFIG_ARM64
+ printk("%sCPU: %d MPIDR: %llx PID: %d Comm: %.20s %s %s %.*s\n",
+ log_lvl, raw_smp_processor_id(), read_cpuid_mpidr(),
+ current->pid, current->comm,
+ print_tainted(), init_utsname()->release,
+ (int)strcspn(init_utsname()->version, " "),
+ init_utsname()->version);
+
+#else
printk("%sCPU: %d PID: %d Comm: %.20s %s %s %.*s\n",
log_lvl, raw_smp_processor_id(), current->pid, current->comm,
print_tainted(), init_utsname()->release,
(int)strcspn(init_utsname()->version, " "),
init_utsname()->version);
+#endif
if (dump_stack_arch_desc_str[0] != '\0')
printk("%sHardware name: %s\n",
log_lvl, dump_stack_arch_desc_str);
@@ -3173,9 +3401,8 @@
{
dump_stack_print_info(log_lvl);
- printk("%stask: %p ti: %p task.ti: %p\n",
- log_lvl, current, current_thread_info(),
- task_thread_info(current));
+ printk("%stask: %p task.stack: %p\n",
+ log_lvl, current, task_stack_page(current));
}
#endif
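
register_hook_logbuf() (built under CONFIG_EXYNOS_SNAPSHOT) hands every formatted printk record to a callback and, at registration time, replays whatever is already in log_buf. A hypothetical consumer sketch; the extern declaration is written out here only because the real prototype lives in a vendor header (likely linux/exynos-ss.h), and the backup buffer and names are illustrative:

    #include <linux/init.h>
    #include <linux/kernel.h>
    #include <linux/string.h>

    extern void register_hook_logbuf(void (*func)(const char *buf, size_t size));

    static char example_backup[1 << 16];
    static size_t example_pos;

    static void example_logbuf_hook(const char *buf, size_t size)
    {
            /* buf holds one fully formatted record (timestamp/process prefix
             * included); the hook runs in atomic context, so it must not sleep */
            if (example_pos + size <= sizeof(example_backup)) {
                    memcpy(example_backup + example_pos, buf, size);
                    example_pos += size;
            }
    }

    static int __init example_hook_init(void)
    {
            register_hook_logbuf(example_logbuf_hook);
            return 0;
    }
    late_initcall(example_hook_init);
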
diff --git a/kernel/profile.c b/kernel/profile.c
old mode 100644
new mode 100755
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
old mode 100644
new mode 100755
index 5e2cd10..dfc12ad
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -26,6 +26,7 @@
#include <linux/hw_breakpoint.h>
#include <linux/cn_proc.h>
#include <linux/compat.h>
+#include <linux/task_integrity.h>
void __ptrace_link(struct task_struct *child, struct task_struct *new_parent,
@@ -1087,6 +1088,7 @@
long ret;
if (request == PTRACE_TRACEME) {
+ five_ptrace(current, request);
ret = ptrace_traceme();
if (!ret)
arch_ptrace_attach(current);
@@ -1099,6 +1101,8 @@
goto out;
}
+ five_ptrace(child, request);
+
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
/*
@@ -1233,6 +1237,7 @@
long ret;
if (request == PTRACE_TRACEME) {
+ five_ptrace(current, request);
ret = ptrace_traceme();
goto out;
}
@@ -1243,6 +1248,8 @@
goto out;
}
+ five_ptrace(child, request);
+
if (request == PTRACE_ATTACH || request == PTRACE_SEIZE) {
ret = ptrace_attach(child, request, addr, data);
/*
diff --git a/kernel/range.c b/kernel/range.c
old mode 100644
new mode 100755
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
old mode 100644
new mode 100755
index 61a1656..032b2c0
--- a/kernel/rcu/Makefile
+++ b/kernel/rcu/Makefile
@@ -1,3 +1,7 @@
+# Any varying coverage in these files is non-deterministic
+# and is generally not a function of system call inputs.
+KCOV_INSTRUMENT := n
+
obj-y += update.o sync.o
obj-$(CONFIG_SRCU) += srcu.o
obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
old mode 100644
new mode 100755
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
old mode 100644
new mode 100755
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
old mode 100644
new mode 100755
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
old mode 100644
new mode 100755
index be922c9..b49cf3a
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -68,6 +68,7 @@
RCU_LOCKDEP_WARN(!gp_ops[rsp->gp_type].held(),
"suspicious rcu_sync_is_idle() usage");
}
+EXPORT_SYMBOL_GPL(rcu_sync_lockdep_assert);
#endif
/**
@@ -83,6 +84,18 @@
}
/**
+ * rcu_sync_enter_start - Force readers onto slow path for multiple updates
+ * @rsp: Pointer to rcu_sync structure to use for synchronization
+ *
+ * Must be called after rcu_sync_init() and before first use.
+ *
+ * Ensures rcu_sync_is_idle() returns false and rcu_sync_{enter,exit}()
+ * pairs turn into NO-OPs.
+ */
+void rcu_sync_enter_start(struct rcu_sync *rsp)
+{
+ rsp->gp_count++;
+ rsp->gp_state = GP_PASSED;
+}
+
+/**
* rcu_sync_enter() - Force readers onto slowpath
* @rsp: Pointer to rcu_sync structure to use for synchronization
*
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
old mode 100644
new mode 100755
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
old mode 100644
new mode 100755
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
old mode 100644
new mode 100755
index 082aede..96a4ed5
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -56,6 +56,7 @@
#include <linux/random.h>
#include <linux/trace_events.h>
#include <linux/suspend.h>
+#include <linux/exynos-ss.h>
#include "tree.h"
#include "rcu.h"
@@ -1264,7 +1265,9 @@
* See Documentation/RCU/stallwarn.txt for info on how to debug
* RCU CPU stall warnings.
*/
- pr_err("INFO: %s detected stalls on CPUs/tasks:",
+
+ exynos_ss_printkl((size_t)rsp->name, (size_t)rsp);
+ pr_auto(ASL1, "INFO: %s detected stalls on CPUs/tasks:",
rsp->name);
print_cpu_stall_info_begin();
rcu_for_each_leaf_node(rsp, rnp) {
@@ -1325,7 +1328,9 @@
* See Documentation/RCU/stallwarn.txt for info on how to debug
* RCU CPU stall warnings.
*/
- pr_err("INFO: %s self-detected stall on CPU", rsp->name);
+
+ exynos_ss_printkl((size_t)rsp->name, (size_t)rsp);
+ pr_auto(ASL1, "INFO: %s self-detected stall on CPU", rsp->name);
print_cpu_stall_info_begin();
print_cpu_stall_info(rsp, smp_processor_id());
print_cpu_stall_info_end();
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
old mode 100644
new mode 100755
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
old mode 100644
new mode 100755
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
old mode 100644
new mode 100755
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
old mode 100644
new mode 100755
diff --git a/kernel/reboot.c b/kernel/reboot.c
old mode 100644
new mode 100755
index bd30a97..227cf7e
--- a/kernel/reboot.c
+++ b/kernel/reboot.c
@@ -43,6 +43,7 @@
int reboot_cpu;
enum reboot_type reboot_type = BOOT_ACPI;
int reboot_force;
+int ignore_fs_panic = 0; /* prevent a kernel panic caused by EIO during shutdown */
/*
* If set, this is used for preparing the system to power off.
@@ -69,7 +70,13 @@
{
blocking_notifier_call_chain(&reboot_notifier_list, SYS_RESTART, cmd);
system_state = SYSTEM_RESTART;
+
+ /* user processes should be frozen before device shutdown */
+ events_check_enabled = false;
+ freeze_processes();
+
usermodehelper_disable();
+ ignore_fs_panic = 1;
device_shutdown();
}
@@ -230,7 +237,13 @@
blocking_notifier_call_chain(&reboot_notifier_list,
(state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL);
system_state = state;
+
+ /* user processes should be frozen before device shutdown */
+ events_check_enabled = false;
+ freeze_processes();
+
usermodehelper_disable();
+ ignore_fs_panic = 1;
device_shutdown();
}
/**
diff --git a/kernel/relay.c b/kernel/relay.c
old mode 100644
new mode 100755
diff --git a/kernel/resource.c b/kernel/resource.c
old mode 100644
new mode 100755
index 41718cd..73348f5
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -172,7 +172,7 @@
static int __init ioresources_init(void)
{
proc_create("ioports", 0, NULL, &proc_ioports_operations);
- proc_create("iomem", 0, NULL, &proc_iomem_operations);
+ proc_create("iomem", S_IRUSR, NULL, &proc_iomem_operations);
return 0;
}
__initcall(ioresources_init);
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
old mode 100644
new mode 100755
index 6768797..9937813
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -2,6 +2,10 @@
CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE)
endif
+# These files are disabled because they produce non-interesting flaky coverage
+# that is not a function of syscall inputs. E.g. involuntary context switches.
+KCOV_INSTRUMENT := n
+
ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
# According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
# needed for x86 only. Why this used to be enabled for all architectures is beyond
@@ -14,8 +18,12 @@
obj-y += core.o loadavg.o clock.o cputime.o
obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o
obj-y += wait.o completion.o idle.o
-obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o
+obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_SCHED_TUNE) += tune.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
+obj-$(CONFIG_CPU_FREQ) += cpufreq.o
+obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c
old mode 100644
new mode 100755
diff --git a/kernel/sched/auto_group.h b/kernel/sched/auto_group.h
old mode 100644
new mode 100755
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
old mode 100644
new mode 100755
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
old mode 100644
new mode 100755
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
old mode 100644
new mode 100755
index d061895..e89bf27
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -74,6 +74,22 @@
#include <linux/binfmts.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
+#include <linux/exynos-ss.h>
+
+#include <linux/sched/sysctl.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/mutex.h>
+#include <linux/workqueue.h>
+#include <linux/cpufreq.h>
+#include <linux/platform_device.h>
+#include <linux/err.h>
+#include <linux/of.h>
+#include <linux/sysfs.h>
+#include <linux/sec_sysfs.h>
+#include <linux/types.h>
+#include <linux/sched/rt.h>
+#include <linux/cpumask.h>
#include <asm/switch_to.h>
#include <asm/tlb.h>
@@ -83,6 +99,10 @@
#include <asm/paravirt.h>
#endif
+#ifdef CONFIG_SEC_DEBUG
+#include <linux/sec_debug.h>
+#endif
+
#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"
@@ -90,6 +110,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#define HEAVY_TASK_LOAD_THRESHOLD 1000
+
DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
@@ -627,10 +649,7 @@
rcu_read_lock();
for_each_domain(cpu, sd) {
for_each_cpu(i, sched_domain_span(sd)) {
- if (cpu == i)
- continue;
-
- if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
+ if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) {
cpu = i;
goto unlock;
}
@@ -1297,7 +1316,7 @@
if (task_cpu(p) != new_cpu) {
if (p->sched_class->migrate_task_rq)
- p->sched_class->migrate_task_rq(p);
+ p->sched_class->migrate_task_rq(p, new_cpu);
p->se.nr_migrations++;
perf_event_task_migrate(p);
}
@@ -1338,16 +1357,12 @@
struct rq *src_rq, *dst_rq;
int ret = -EAGAIN;
- if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
- return -EAGAIN;
-
src_rq = cpu_rq(arg->src_cpu);
dst_rq = cpu_rq(arg->dst_cpu);
double_raw_lock(&arg->src_task->pi_lock,
&arg->dst_task->pi_lock);
double_rq_lock(src_rq, dst_rq);
-
if (task_cpu(arg->dst_task) != arg->dst_cpu)
goto unlock;
@@ -1381,6 +1396,8 @@
struct migration_swap_arg arg;
int ret = -EINVAL;
+ get_online_cpus();
+
arg = (struct migration_swap_arg){
.src_task = cur,
.src_cpu = task_cpu(cur),
@@ -1391,10 +1408,6 @@
if (arg.src_cpu == arg.dst_cpu)
goto out;
- /*
- * These three tests are all lockless; this is OK since all of them
- * will be re-checked with proper locks held further down the line.
- */
if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
goto out;
@@ -1408,6 +1421,7 @@
ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
out:
+ put_online_cpus();
return ret;
}
@@ -1945,33 +1959,25 @@
success = 1; /* we're going to change ->state */
cpu = task_cpu(p);
- /*
- * Ensure we load p->on_rq _after_ p->state, otherwise it would
- * be possible to, falsely, observe p->on_rq == 0 and get stuck
- * in smp_cond_load_acquire() below.
- *
- * sched_ttwu_pending() try_to_wake_up()
- * [S] p->on_rq = 1; [L] P->state
- * UNLOCK rq->lock -----.
- * \
- * +--- RMB
- * schedule() /
- * LOCK rq->lock -----'
- * UNLOCK rq->lock
- *
- * [task p]
- * [S] p->state = UNINTERRUPTIBLE [L] p->on_rq
- *
- * Pairs with the UNLOCK+LOCK on rq->lock from the
- * last wakeup of our task and the schedule that got our task
- * current.
- */
- smp_rmb();
if (p->on_rq && ttwu_remote(p, wake_flags))
goto stat;
#ifdef CONFIG_SMP
/*
+ * In some cases the target task p is on the current CPU (on_cpu == 1)
+ * but not on the runqueue (on_rq == 0).
+ *
+ * p->on_cpu is cleared only after schedule() finishes, so waiting for it
+ * here could deadlock; in that case abort and skip the wakeup.
+ */
+ if (cpu == task_cpu(current)) {
+ if (unlikely(WARN_ON(current == p))) {
+ exynos_ss_printkl((size_t)current, (size_t)current);
+ goto stat;
+ }
+ }
+
+ /*
* Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
* possible to, falsely, observe p->on_cpu == 0.
*
@@ -2126,7 +2132,6 @@
static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
{
p->on_rq = 0;
-
p->se.on_rq = 0;
p->se.exec_start = 0;
p->se.sum_exec_runtime = 0;
@@ -2134,6 +2139,30 @@
p->se.nr_migrations = 0;
p->se.vruntime = 0;
INIT_LIST_HEAD(&p->se.group_node);
+/*
+ * Load-tracking only depends on SMP, FAIR_GROUP_SCHED dependency below may be
+ * removed when useful for applications beyond shares distribution (e.g.
+ * load-balance).
+ */
+#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
+ p->se.avg.period_contrib = 0;
+ p->se.avg.load_sum = 0;
+ p->se.avg.load_avg = 0;
+ p->se.avg.util_sum = 0;
+ p->se.avg.util_avg = 0;
+#ifdef CONFIG_SCHED_HMP
+ p->se.avg.hmp_load_sum = 0;
+ p->se.avg.hmp_load_avg = 0;
+ p->se.avg.hmp_last_up_migration = 0;
+ p->se.avg.hmp_last_down_migration = 0;
+ trace_sched_task_runnable_ratio(p, p->se.avg.hmp_load_avg);
+#endif
+ trace_sched_task_load_contrib(p, p->se.avg.load_avg);
+#endif
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+ p->se.cfs_rq = NULL;
+#endif
#ifdef CONFIG_SCHEDSTATS
memset(&p->se.statistics, 0, sizeof(p->se.statistics));
@@ -2403,6 +2432,12 @@
raw_spin_lock_irqsave(&p->pi_lock, flags);
/* Initialize new task's runnable average */
init_entity_runnable_average(&p->se);
+ init_rt_entity_runnable_average(&p->rt);
+#ifdef CONFIG_SCHED_HMP
+ trace_sched_task_runnable_ratio(p, p->se.avg.hmp_load_avg);
+#endif
+ trace_sched_task_load_contrib(p, p->se.avg.load_avg);
+
#ifdef CONFIG_SMP
/*
* Fork balancing, do it here and not earlier because:
@@ -2710,7 +2745,7 @@
atomic_inc(&oldmm->mm_count);
enter_lazy_tlb(oldmm, next);
} else
- switch_mm_irqs_off(oldmm, mm, next);
+ switch_mm(oldmm, mm, next);
if (!prev->mm) {
prev->active_mm = NULL;
@@ -2903,11 +2938,93 @@
#ifdef CONFIG_SMP
rq->idle_balance = idle_cpu(cpu);
- trigger_load_balance(rq);
+ trigger_load_balance(rq, cpu);
#endif
rq_last_tick_reset(rq);
}
+#ifdef NR_CPUS
+static unsigned int heavy_cpu_count = NR_CPUS;
+#else
+static unsigned int heavy_cpu_count = 8;
+#endif
+
+static ssize_t heavy_task_cpu_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ int count = 0;
+ long unsigned int task_util;
+ long unsigned int cfs_load;
+ long unsigned int no_task;
+ long unsigned int remaining_load;
+ long unsigned int avg_load;
+ int cpu;
+
+ for_each_cpu(cpu, cpu_online_mask) {
+ struct rq *rq = cpu_rq(cpu);
+ struct task_struct *p = rq->curr;
+ task_util = (long unsigned int)p->se.avg.util_avg;
+ cfs_load = (long unsigned int)rq->cfs.runnable_load_avg;
+ no_task = (long unsigned int)rq->cfs.h_nr_running;
+
+ if (task_util > HEAVY_TASK_LOAD_THRESHOLD) {
+ count++;
+ } else if (task_util <= HEAVY_TASK_LOAD_THRESHOLD && no_task > 1) {
+ remaining_load = cfs_load - task_util;
+ avg_load = remaining_load / (no_task - 1);
+ if (avg_load > HEAVY_TASK_LOAD_THRESHOLD)
+ count++;
+ }
+ }
+
+ heavy_cpu_count = count;
+
+ return snprintf(buf, 4, "%u\n", heavy_cpu_count);
+}
+
+static ssize_t heavy_task_cpu_store(struct device *dev,
+ struct device_attribute *attr, const char *buf, size_t size)
+{
+ sscanf(buf, "%u", &heavy_cpu_count);
+
+ return size;
+}
+
+static DEVICE_ATTR(heavy_task_cpu, 0664, heavy_task_cpu_show, heavy_task_cpu_store);
+
+static struct attribute *bench_mark_attributes[] = {
+ &dev_attr_heavy_task_cpu.attr,
+ NULL
+};
+
+static const struct attribute_group bench_mark_attr_group = {
+ .attrs = bench_mark_attributes,
+};
+
+int __init sched_heavy_cpu_init(void)
+{
+ int ret = 0;
+ struct device *dev;
+
+ dev = sec_device_create(NULL, "sec_heavy_cpu");
+
+ if (IS_ERR(dev)) {
+ dev_err(dev, "%s: fail to create sec_dev\n", __func__);
+ return PTR_ERR(dev);
+ }
+ ret = sysfs_create_group(&dev->kobj, &bench_mark_attr_group);
+ if (ret) {
+ dev_err(dev, "failed to create sysfs group\n");
+ }
+
+ return 0;
+}
+late_initcall(sched_heavy_cpu_init);
+
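A brief, hypothetical walk-through of the counting rule implemented in heavy_task_cpu_show() above (all numbers are made up, not taken from this patch):

/*
 * On a CPU whose current task has util_avg = 300, with cfs
 * runnable_load_avg = 2500 and h_nr_running = 3: the current task is not
 * heavy (300 <= 1000), but the remaining load 2500 - 300 = 2200 is split
 * over the 2 other runnable tasks, giving 1100 > 1000, so the CPU is
 * still counted as heavy.
 */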
#ifdef CONFIG_NO_HZ_FULL
/**
* scheduler_tick_max_deferment
@@ -3211,6 +3328,7 @@
raw_spin_unlock_irq(&rq->lock);
}
+ exynos_ss_task(cpu, rq->curr);
balance_callback(rq);
}
@@ -3703,6 +3821,10 @@
set_load_weight(p);
}
+#ifdef CONFIG_SCHED_HMP
+extern struct cpumask hmp_slow_cpu_mask;
+#endif
+
/* Actually do priority change: must hold pi & rq lock. */
static void __setscheduler(struct rq *rq, struct task_struct *p,
const struct sched_attr *attr, bool keep_boost)
@@ -3720,9 +3842,13 @@
if (dl_prio(p->prio))
p->sched_class = &dl_sched_class;
- else if (rt_prio(p->prio))
+ else if (rt_prio(p->prio)) {
p->sched_class = &rt_sched_class;
- else
+#ifdef CONFIG_SCHED_HMP
+ if (cpumask_equal(&p->cpus_allowed, cpu_all_mask))
+ do_set_cpus_allowed(p, &hmp_slow_cpu_mask);
+#endif
+ } else
p->sched_class = &fair_sched_class;
}
@@ -4430,11 +4556,13 @@
struct task_struct *p;
int retval;
+ get_online_cpus();
rcu_read_lock();
p = find_process_by_pid(pid);
if (!p) {
rcu_read_unlock();
+ put_online_cpus();
return -ESRCH;
}
@@ -4510,6 +4638,7 @@
free_cpumask_var(cpus_allowed);
out_put_task:
put_task_struct(p);
+ put_online_cpus();
return retval;
}
@@ -4554,6 +4683,7 @@
unsigned long flags;
int retval;
+ get_online_cpus();
rcu_read_lock();
retval = -ESRCH;
@@ -4566,11 +4696,12 @@
goto out_unlock;
raw_spin_lock_irqsave(&p->pi_lock, flags);
- cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
+ cpumask_and(mask, &p->cpus_allowed, cpu_online_mask);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
out_unlock:
rcu_read_unlock();
+ put_online_cpus();
return retval;
}
@@ -4978,16 +5109,14 @@
/*
* reset the NMI-timeout, listing all files on a slow
* console might take a lot of time:
- * Also, reset softlockup watchdogs on all CPUs, because
- * another CPU might be blocked waiting for us to process
- * an IPI.
*/
touch_nmi_watchdog();
- touch_all_softlockup_watchdogs();
if (!state_filter || (p->state & state_filter))
sched_show_task(p);
}
+ touch_all_softlockup_watchdogs();
+
#ifdef CONFIG_SCHED_DEBUG
sysrq_sched_debug_show();
#endif
@@ -5555,6 +5684,7 @@
case CPU_UP_PREPARE:
rq->calc_load_update = calc_load_update;
+ account_reset_rq(rq);
break;
case CPU_ONLINE:
@@ -5897,19 +6027,6 @@
call_rcu_sched(&old_rd->rcu, free_rootdomain);
}
-void sched_get_rd(struct root_domain *rd)
-{
- atomic_inc(&rd->refcount);
-}
-
-void sched_put_rd(struct root_domain *rd)
-{
- if (!atomic_dec_and_test(&rd->refcount))
- return;
-
- call_rcu_sched(&rd->rcu, free_rootdomain);
-}
-
static int init_rootdomain(struct root_domain *rd)
{
memset(rd, 0, sizeof(*rd));
@@ -5923,12 +6040,6 @@
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
-#ifdef HAVE_RT_PUSH_IPI
- rd->rto_cpu = -1;
- raw_spin_lock_init(&rd->rto_lock);
- init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
-#endif
-
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_dlo_mask;
@@ -6143,9 +6254,6 @@
* Build an iteration mask that can exclude certain CPUs from the upwards
* domain traversal.
*
- * Only CPUs that can arrive at this group should be considered to continue
- * balancing.
- *
* Asymmetric node setups can result in situations where the domain tree is of
* unequal depth, make sure to skip domains that already cover the entire
* range.
@@ -6157,31 +6265,18 @@
*/
static void build_group_mask(struct sched_domain *sd, struct sched_group *sg)
{
- const struct cpumask *sg_span = sched_group_cpus(sg);
+ const struct cpumask *span = sched_domain_span(sd);
struct sd_data *sdd = sd->private;
struct sched_domain *sibling;
int i;
- for_each_cpu(i, sg_span) {
+ for_each_cpu(i, span) {
sibling = *per_cpu_ptr(sdd->sd, i);
-
- /*
- * Can happen in the asymmetric case, where these siblings are
- * unused. The mask will not be empty because those CPUs that
- * do have the top domain _should_ span the domain.
- */
- if (!sibling->child)
- continue;
-
- /* If we would not end up here, we can't continue from here */
- if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
+ if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
continue;
cpumask_set_cpu(i, sched_group_mask(sg));
}
-
- /* We must not have empty masks here */
- WARN_ON_ONCE(cpumask_empty(sched_group_mask(sg)));
}
/*
@@ -6477,6 +6572,7 @@
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
* SD_SHARE_POWERDOMAIN - describes shared power domain
+ * SD_NO_LOAD_BALANCE - describes domain-wise load balance
*
* Odd one out:
* SD_ASYM_PACKING - describes SMT quirks
@@ -6486,7 +6582,8 @@
SD_SHARE_PKG_RESOURCES | \
SD_NUMA | \
SD_ASYM_PACKING | \
- SD_SHARE_POWERDOMAIN)
+ SD_SHARE_POWERDOMAIN | \
+ SD_NO_LOAD_BALANCE)
static struct sched_domain *
sd_init(struct sched_domain_topology_level *tl, int cpu)
@@ -6509,6 +6606,9 @@
"wrong sd_flags in topology description\n"))
sd_flags &= ~TOPOLOGY_SD_FLAGS;
+ if (!(sd_flags & SD_NO_LOAD_BALANCE))
+ sd_flags |= SD_LOAD_BALANCE;
+
*sd = (struct sched_domain){
.min_interval = sd_weight,
.max_interval = 2*sd_weight,
@@ -6522,7 +6622,7 @@
.wake_idx = 0,
.forkexec_idx = 0,
- .flags = 1*SD_LOAD_BALANCE
+ .flags = 0*SD_LOAD_BALANCE
| 1*SD_BALANCE_NEWIDLE
| 1*SD_BALANCE_EXEC
| 1*SD_BALANCE_FORK
@@ -6586,6 +6686,12 @@
return sd;
}
+int __weak cpu_cpu_flags(void)
+{
+ return 0;
+}
+
+
/*
* Topology list, bottom-up.
*/
@@ -6596,7 +6702,7 @@
#ifdef CONFIG_SCHED_MC
{ cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
#endif
- { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+ { cpu_cpu_mask, cpu_cpu_flags, SD_INIT_NAME(DIE) },
{ NULL, },
};
@@ -7307,16 +7413,17 @@
* operation in the resume sequence, just build a single sched
* domain, ignoring cpusets.
*/
- partition_sched_domains(1, NULL, NULL);
- if (--num_cpus_frozen)
+ num_cpus_frozen--;
+ if (likely(num_cpus_frozen)) {
+ partition_sched_domains(1, NULL, NULL);
break;
+ }
/*
* This is the last CPU online operation. So fall through and
* restore the original sched domains by considering the
* cpuset configurations.
*/
- cpuset_force_rebuild();
case CPU_ONLINE:
cpuset_update_active_cpus(true);
@@ -7337,7 +7444,7 @@
int cpus;
switch (action) {
- case CPU_DOWN_PREPARE:
+ case CPU_DOWN_LATE_PREPARE:
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
@@ -7371,17 +7478,14 @@
sched_init_numa();
- /*
- * There's no userspace yet to cause hotplug operations; hence all the
- * cpu masks are stable and all blatant races in the below code cannot
- * happen.
- */
+ get_online_cpus();
mutex_lock(&sched_domains_mutex);
init_sched_domains(cpu_active_mask);
cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
if (cpumask_empty(non_isolated_cpus))
cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
mutex_unlock(&sched_domains_mutex);
+ put_online_cpus();
hotcpu_notifier(sched_domains_numa_masks_update, CPU_PRI_SCHED_ACTIVE);
hotcpu_notifier(cpuset_cpu_active, CPU_PRI_CPUSET_ACTIVE);
@@ -7606,6 +7710,14 @@
return (nested == preempt_offset);
}
+static int __might_sleep_init_called;
+int __init __might_sleep_init(void)
+{
+ __might_sleep_init_called = 1;
+ return 0;
+}
+early_initcall(__might_sleep_init);
+
void __might_sleep(const char *file, int line, int preempt_offset)
{
/*
@@ -7630,8 +7742,10 @@
rcu_sleep_check(); /* WARN_ON_ONCE() by default, no rate limit reqd. */
if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
- !is_idle_task(current)) ||
- system_state != SYSTEM_RUNNING || oops_in_progress)
+ !is_idle_task(current)) || oops_in_progress)
+ return;
+ if (system_state != SYSTEM_RUNNING &&
+ (!__might_sleep_init_called || system_state != SYSTEM_BOOTING))
return;
if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
return;
@@ -7859,7 +7973,7 @@
tg = autogroup_task_group(tsk, tg);
tsk->sched_task_group = tg;
-#ifdef CONFIG_FAIR_GROUP_SCHED
+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_RT_GROUP_SCHED)
if (tsk->sched_class->task_move_group)
tsk->sched_class->task_move_group(tsk);
else
@@ -8270,20 +8384,11 @@
if (IS_ERR(tg))
return ERR_PTR(-ENOMEM);
+ sched_online_group(tg, parent);
+
return &tg->css;
}
-/* Expose task group only after completing cgroup initialization */
-static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
-{
- struct task_group *tg = css_tg(css);
- struct task_group *parent = css_tg(css->parent);
-
- if (parent)
- sched_online_group(tg, parent);
- return 0;
-}
-
static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
{
struct task_group *tg = css_tg(css);
@@ -8658,7 +8763,6 @@
struct cgroup_subsys cpu_cgrp_subsys = {
.css_alloc = cpu_cgroup_css_alloc,
- .css_online = cpu_cgroup_css_online,
.css_released = cpu_cgroup_css_released,
.css_free = cpu_cgroup_css_free,
.fork = cpu_cgroup_fork,
diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c
old mode 100644
new mode 100755
diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h
old mode 100644
new mode 100755
diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c
old mode 100644
new mode 100755
diff --git a/kernel/sched/cpudeadline.h b/kernel/sched/cpudeadline.h
old mode 100644
new mode 100755
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
new file mode 100755
index 0000000..1141954
--- /dev/null
+++ b/kernel/sched/cpufreq.c
@@ -0,0 +1,63 @@
+/*
+ * Scheduler code and data structures related to cpufreq.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include "sched.h"
+
+DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_add_update_util_hook - Populate the CPU's update_util_data pointer.
+ * @cpu: The CPU to set the pointer for.
+ * @data: New pointer value.
+ * @func: Callback function to set for the CPU.
+ *
+ * Set and publish the update_util_data pointer for the given CPU.
+ *
+ * The update_util_data pointer of @cpu is set to @data and the callback
+ * function pointer in the target struct update_util_data is set to @func.
+ * That function will be called by cpufreq_update_util() from RCU-sched
+ * read-side critical sections, so it must not sleep. @data will always be
+ * passed to it as the first argument which allows the function to get to the
+ * target update_util_data structure and its container.
+ *
+ * The update_util_data pointer of @cpu must be NULL when this function is
+ * called or it will WARN() and return with no effect.
+ */
+void cpufreq_add_update_util_hook(int cpu, struct update_util_data *data,
+ void (*func)(struct update_util_data *data, u64 time,
+ unsigned long util, unsigned long max))
+{
+ if (WARN_ON(!data || !func))
+ return;
+
+ if (WARN_ON(per_cpu(cpufreq_update_util_data, cpu)))
+ return;
+
+ data->func = func;
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data);
+}
+EXPORT_SYMBOL_GPL(cpufreq_add_update_util_hook);
+
+/**
+ * cpufreq_remove_update_util_hook - Clear the CPU's update_util_data pointer.
+ * @cpu: The CPU to clear the pointer for.
+ *
+ * Clear the update_util_data pointer for the given CPU.
+ *
+ * Callers must use RCU-sched callbacks to free any memory that might be
+ * accessed via the old update_util_data pointer or invoke synchronize_sched()
+ * right after this function to avoid use-after-free.
+ */
+void cpufreq_remove_update_util_hook(int cpu)
+{
+ rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
+}
+EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
new file mode 100755
index 0000000..5664dcf
--- /dev/null
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -0,0 +1,662 @@
+/*
+ * CPUFreq governor based on scheduler-provided CPU utilization data.
+ *
+ * Copyright (C) 2016, Intel Corporation
+ * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cpufreq.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <trace/events/power.h>
+#include <trace/events/sched.h>
+
+#include "sched.h"
+#include "tune.h"
+
+struct sugov_tunables {
+ struct gov_attr_set attr_set;
+ unsigned int rate_limit_us;
+#ifdef CONFIG_FREQVAR_SCHEDTUNE
+ struct freqvar_boost_data freqvar_boost;
+#endif
+};
+
+struct sugov_policy {
+ struct cpufreq_policy *policy;
+
+ struct sugov_tunables *tunables;
+ struct list_head tunables_hook;
+
+ raw_spinlock_t update_lock; /* For shared policies */
+ u64 last_freq_update_time;
+ s64 freq_update_delay_ns;
+ unsigned int next_freq;
+ unsigned int max_util;
+ bool pending;
+
+ /* The next fields are only needed if fast switch cannot be used. */
+ struct irq_work irq_work;
+ struct work_struct work;
+ struct mutex work_lock;
+ bool work_in_progress;
+
+ bool need_freq_update;
+};
+
+struct sugov_cpu {
+ struct update_util_data update_util;
+ struct sugov_policy *sg_policy;
+
+ /* The fields below are only needed when sharing a policy. */
+ unsigned long util;
+ unsigned long max;
+ u64 last_update;
+};
+
+static DEFINE_PER_CPU(struct sugov_cpu, sugov_cpu);
+
+/************************ Governor internals ***********************/
+static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max);
+
+static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
+{
+ s64 delta_ns;
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, smp_processor_id());
+ bool up_scale = (sg_cpu->util > sg_policy->max_util);
+
+ if (up_scale) {
+ if (!sg_policy->work_in_progress)
+ return true;
+ else {
+ sg_policy->next_freq = sugov_next_freq_shared(sg_policy,
+ sg_cpu->util, sg_cpu->max);
+ sg_policy->pending = true;
+ return false;
+ }
+ }
+
+ if (sg_policy->work_in_progress)
+ return false;
+
+ if (unlikely(sg_policy->need_freq_update)) {
+ sg_policy->need_freq_update = false;
+ /*
+ * This happens when limits change, so forget the previous
+ * next_freq value and force an update.
+ */
+ sg_policy->next_freq = UINT_MAX;
+ return true;
+ }
+
+ delta_ns = time - sg_policy->last_freq_update_time;
+ return delta_ns >= sg_policy->freq_update_delay_ns;
+}
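A sketch of the rate limiting implemented above; the 500 us figure is only an illustrative assumption, not the governor's default:

/*
 * With rate_limit_us = 500, freq_update_delay_ns = 500000, so an update
 * arriving 200000 ns after the last committed one is dropped
 * (delta_ns < freq_update_delay_ns) unless the per-CPU utilization exceeds
 * sg_policy->max_util, in which case the up-scale branch above either
 * bypasses the rate limit or records the request in sg_policy->pending.
 */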
+
+static int sugov_select_scaling_cpu(void)
+{
+ int cpu;
+ cpumask_t mask;
+
+ cpumask_clear(&mask);
+ cpumask_and(&mask, cpu_coregroup_mask(0), cpu_online_mask);
+
+ /* An idle core of the boot cluster is selected as the scaling cpu */
+ for_each_cpu(cpu, &mask)
+ if (idle_cpu(cpu))
+ return cpu;
+
+ return cpumask_weight(&mask) - 1;
+}
+
+static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
+ unsigned int next_freq)
+{
+ sg_policy->last_freq_update_time = time;
+
+ if (sg_policy->next_freq != next_freq) {
+ sg_policy->next_freq = next_freq;
+ sg_policy->work_in_progress = true;
+ irq_work_queue_on(&sg_policy->irq_work,
+ sugov_select_scaling_cpu());
+ }
+}
+
+/**
+ * get_next_freq - Compute a new frequency for a given cpufreq policy.
+ * @policy: cpufreq policy object to compute the new frequency for.
+ * @util: Current CPU utilization.
+ * @max: CPU capacity.
+ *
+ * If the utilization is frequency-invariant, choose the new frequency to be
+ * proportional to it, that is
+ *
+ * next_freq = C * max_freq * util / max
+ *
+ * Otherwise, approximate the would-be frequency-invariant utilization by
+ * util_raw * (curr_freq / max_freq) which leads to
+ *
+ * next_freq = C * curr_freq * util_raw / max
+ *
+ * Take C = 1.25 for the frequency tipping point at (util / max) = 0.8.
+ */
+static unsigned int get_next_freq(struct cpufreq_policy *policy,
+ unsigned long util, unsigned long max)
+{
+ unsigned int freq = arch_scale_freq_invariant() ?
+ policy->cpuinfo.max_freq : policy->cur;
+
+ return (freq + (freq >> 2)) * util / max;
+}
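A quick numerical check of the formula above, with made-up values:

/*
 * freq = 1000000 kHz, util = 512, max = 1024:
 * next = (1000000 + (1000000 >> 2)) * 512 / 1024
 *      = 1250000 * 512 / 1024
 *      = 625000 kHz,
 * i.e. 1.25 * freq * util / max, matching the C = 1.25 tipping point.
 */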
+
+static void sugov_update_single(struct update_util_data *hook, u64 time,
+ unsigned long util, unsigned long max)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned int next_f;
+
+ if (!sugov_should_update_freq(sg_policy, time))
+ return;
+
+ next_f = util == ULONG_MAX ? policy->cpuinfo.max_freq :
+ get_next_freq(policy, util, max);
+
+ trace_sched_freq_commit(time, util, max, next_f);
+
+ sugov_update_commit(sg_policy, time, next_f);
+}
+
+static unsigned int sugov_next_freq_shared(struct sugov_policy *sg_policy,
+ unsigned long util, unsigned long max)
+{
+ struct cpufreq_policy *policy = sg_policy->policy;
+ unsigned int max_f = policy->cpuinfo.max_freq;
+ u64 last_freq_update_time = sg_policy->last_freq_update_time;
+ unsigned int j;
+
+ if (util == ULONG_MAX)
+ goto return_max;
+
+ for_each_cpu(j, policy->cpus) {
+ struct sugov_cpu *j_sg_cpu;
+ unsigned long j_util, j_max;
+ s64 delta_ns;
+
+ if (j == smp_processor_id())
+ continue;
+
+ j_sg_cpu = &per_cpu(sugov_cpu, j);
+ /*
+ * If the CPU utilization was last updated before the previous
+ * frequency update and the time elapsed between the last update
+ * of the CPU utilization and the last frequency update is long
+ * enough, don't take the CPU into account as it probably is
+ * idle now.
+ */
+ delta_ns = last_freq_update_time - j_sg_cpu->last_update;
+ if (delta_ns > TICK_NSEC)
+ continue;
+
+ j_util = j_sg_cpu->util;
+ if (j_util == ULONG_MAX)
+ goto return_max;
+
+ j_max = j_sg_cpu->max;
+ if (j_util * max > j_max * util) {
+ util = j_util;
+ max = j_max;
+ }
+ }
+
+ sg_policy->max_util = util;
+ return get_next_freq(policy, util, max);
+
+return_max:
+ sg_policy->max_util = max;
+ return max_f;
+
+}
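The j_util * max > j_max * util test above is the cross-multiplied form of comparing utilization ratios without integer division; a hypothetical example:

/*
 * util = 300, max = 1024 vs. j_util = 400, j_max = 512:
 * 400 * 1024 = 409600 > 512 * 300 = 153600, so the sibling CPU's higher
 * relative utilization (400/512 ~= 0.78 vs. 300/1024 ~= 0.29) wins and
 * (util, max) is replaced by (j_util, j_max).
 */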
+
+static void sugov_update_shared(struct update_util_data *hook, u64 time,
+ unsigned long util, unsigned long max)
+{
+ struct sugov_cpu *sg_cpu = container_of(hook, struct sugov_cpu, update_util);
+ struct sugov_policy *sg_policy = sg_cpu->sg_policy;
+ unsigned int next_f;
+
+ raw_spin_lock(&sg_policy->update_lock);
+
+ sg_cpu->util = util;
+ sg_cpu->max = max;
+ sg_cpu->last_update = time;
+
+ if (sugov_should_update_freq(sg_policy, time)) {
+ next_f = sugov_next_freq_shared(sg_policy, util, max);
+
+ trace_sched_freq_commit(time, util, max, next_f);
+
+ sugov_update_commit(sg_policy, time, next_f);
+ }
+
+ raw_spin_unlock(&sg_policy->update_lock);
+}
+
+static void sugov_work(struct work_struct *work)
+{
+ struct sugov_policy *sg_policy = container_of(work, struct sugov_policy, work);
+
+ mutex_lock(&sg_policy->work_lock);
+
+ __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
+ CPUFREQ_RELATION_L);
+ /* if a frequency up-scaling request is pending, retry scaling */
+ if (sg_policy->pending) {
+ __cpufreq_driver_target(sg_policy->policy, sg_policy->next_freq,
+ CPUFREQ_RELATION_L);
+ sg_policy->pending = false;
+ }
+
+ mutex_unlock(&sg_policy->work_lock);
+
+ sg_policy->work_in_progress = false;
+}
+
+static void sugov_irq_work(struct irq_work *irq_work)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = container_of(irq_work, struct sugov_policy, irq_work);
+ schedule_work_on(smp_processor_id(), &sg_policy->work);
+}
+
+/************************** sysfs interface ************************/
+
+static struct sugov_tunables *global_tunables;
+static DEFINE_MUTEX(global_tunables_lock);
+
+static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr_set)
+{
+ return container_of(attr_set, struct sugov_tunables, attr_set);
+}
+
+static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+ return sprintf(buf, "%u\n", tunables->rate_limit_us);
+}
+
+static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
+ size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct sugov_policy *sg_policy;
+ unsigned int rate_limit_us;
+
+ if (kstrtouint(buf, 10, &rate_limit_us))
+ return -EINVAL;
+
+ tunables->rate_limit_us = rate_limit_us;
+
+ list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
+ sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+
+ return count;
+}
+#ifdef CONFIG_FREQVAR_SCHEDTUNE
+static unsigned int *get_tokenized_data(const char *buf, int *num_tokens)
+{
+ const char *cp;
+ int i;
+ int ntokens = 1;
+ unsigned int *tokenized_data;
+ int err = -EINVAL;
+
+ cp = buf;
+ while ((cp = strpbrk(cp + 1, " :")))
+ ntokens++;
+
+ if (!(ntokens & 0x1))
+ goto err;
+
+ tokenized_data = kmalloc(ntokens * sizeof(unsigned int), GFP_KERNEL);
+ if (!tokenized_data) {
+ err = -ENOMEM;
+ goto err;
+ }
+
+ cp = buf;
+ i = 0;
+ while (i < ntokens) {
+ if (sscanf(cp, "%u", &tokenized_data[i++]) != 1)
+ goto err_kfree;
+
+ cp = strpbrk(cp, " :");
+ if (!cp)
+ break;
+ cp++;
+ }
+
+ if (i != ntokens)
+ goto err_kfree;
+
+ *num_tokens = ntokens;
+ return tokenized_data;
+
+err_kfree:
+ kfree(tokenized_data);
+err:
+ return ERR_PTR(err);
+}
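A short usage sketch for the tokenizer above; the input string is purely illustrative:

/*
 * get_tokenized_data() splits on spaces and colons and requires an odd
 * token count, e.g. "10 500000:20 800000:30" gives ntokens = 5 and
 * tokenized_data = {10, 500000, 20, 800000, 30}, while "10 500000"
 * (2 tokens) fails with ERR_PTR(-EINVAL).
 */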
+
+static ssize_t freqvar_boost_show(struct gov_attr_set *attr_set, char *buf)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct freqvar_boost_data *data = &tunables->freqvar_boost;
+ struct freqvar_boost_table *pos = data->table;
+ int ret = 0;
+
+ for (; pos->frequency != CPUFREQ_TABLE_END; pos++)
+ ret += sprintf(buf + ret, "%8d ratio:%3d \n", pos->frequency,
+ pos->boost / SCHEDTUNE_LOAD_BOOST_UTIT);
+ return ret;
+}
+
+static ssize_t freqvar_boost_store(struct gov_attr_set *attr_set, const char *buf,
+ size_t count)
+{
+ struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+ struct freqvar_boost_data *data = &tunables->freqvar_boost;
+ int *new_table = NULL;
+ int ntokens;
+
+ new_table = get_tokenized_data(buf, &ntokens);
+ if (IS_ERR(new_table))
+ return PTR_RET(new_table);
+
+ schedtune_freqvar_update_table(new_table, ntokens, data->table);
+
+ kfree(new_table);
+
+ return count;
+}
+static struct governor_attr freqvar_boost = __ATTR_RW(freqvar_boost);
+#endif /* CONFIG_FREQVAR_SCHEDTUNE */
+
+static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+
+static struct attribute *sugov_attributes[] = {
+ &rate_limit_us.attr,
+#ifdef CONFIG_FREQVAR_SCHEDTUNE
+ &freqvar_boost.attr,
+#endif
+ NULL
+};
+
+static struct kobj_type sugov_tunables_ktype = {
+ .default_attrs = sugov_attributes,
+ .sysfs_ops = &governor_sysfs_ops,
+};
+
+/********************** cpufreq governor interface *********************/
+
+struct cpufreq_governor schedutil_gov;
+
+static struct sugov_policy *sugov_policy_alloc(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+
+ sg_policy = kzalloc(sizeof(*sg_policy), GFP_KERNEL);
+ if (!sg_policy)
+ return NULL;
+
+ sg_policy->policy = policy;
+ init_irq_work(&sg_policy->irq_work, sugov_irq_work);
+ INIT_WORK(&sg_policy->work, sugov_work);
+ mutex_init(&sg_policy->work_lock);
+ raw_spin_lock_init(&sg_policy->update_lock);
+ return sg_policy;
+}
+
+static void sugov_policy_free(struct sugov_policy *sg_policy)
+{
+ mutex_destroy(&sg_policy->work_lock);
+ kfree(sg_policy);
+}
+
+static struct sugov_tunables *sugov_tunables_alloc(struct sugov_policy *sg_policy)
+{
+ struct sugov_tunables *tunables;
+
+ tunables = kzalloc(sizeof(*tunables), GFP_KERNEL);
+ if (tunables) {
+ gov_attr_set_init(&tunables->attr_set, &sg_policy->tunables_hook);
+ if (!have_governor_per_policy())
+ global_tunables = tunables;
+ }
+ return tunables;
+}
+
+static void sugov_tunables_free(struct sugov_tunables *tunables)
+{
+ if (!have_governor_per_policy())
+ global_tunables = NULL;
+
+ kfree(tunables);
+}
+
+static int sugov_init(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy;
+ struct sugov_tunables *tunables;
+ unsigned int lat;
+ int ret = 0;
+
+ /* State should be equivalent to EXIT */
+ if (policy->governor_data)
+ return -EBUSY;
+
+ sg_policy = sugov_policy_alloc(policy);
+ if (!sg_policy)
+ return -ENOMEM;
+
+ mutex_lock(&global_tunables_lock);
+
+ if (global_tunables) {
+ if (WARN_ON(have_governor_per_policy())) {
+ ret = -EINVAL;
+ goto free_sg_policy;
+ }
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = global_tunables;
+
+ gov_attr_set_get(&global_tunables->attr_set, &sg_policy->tunables_hook);
+ goto out;
+ }
+
+ tunables = sugov_tunables_alloc(sg_policy);
+ if (!tunables) {
+ ret = -ENOMEM;
+ goto free_sg_policy;
+ }
+
+ tunables->rate_limit_us = DEFAULT_LATENCY_MULTIPLIER;
+ lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC;
+ if (lat)
+ tunables->rate_limit_us *= lat;
+
+ /* init freqvar_boost */
+ schedtune_freqvar_boost_init(policy, &tunables->freqvar_boost);
+
+ policy->governor_data = sg_policy;
+ sg_policy->tunables = tunables;
+
+ ret = kobject_init_and_add(&tunables->attr_set.kobj, &sugov_tunables_ktype,
+ get_governor_parent_kobj(policy), "%s",
+ schedutil_gov.name);
+ if (ret)
+ goto fail;
+
+ out:
+ mutex_unlock(&global_tunables_lock);
+
+ return 0;
+
+ fail:
+ policy->governor_data = NULL;
+ schedtune_freqvar_boost_exit(policy, &tunables->freqvar_boost);
+ sugov_tunables_free(tunables);
+
+ free_sg_policy:
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_policy_free(sg_policy);
+ pr_err("initialization failed (error %d)\n", ret);
+ return ret;
+}
+
+static int sugov_exit(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ struct sugov_tunables *tunables = sg_policy->tunables;
+ unsigned int count;
+
+ mutex_lock(&global_tunables_lock);
+
+ count = gov_attr_set_put(&tunables->attr_set, &sg_policy->tunables_hook);
+ policy->governor_data = NULL;
+ if (!count) {
+ schedtune_freqvar_boost_exit(policy, &tunables->freqvar_boost);
+ sugov_tunables_free(tunables);
+ }
+
+ mutex_unlock(&global_tunables_lock);
+
+ sugov_policy_free(sg_policy);
+ return 0;
+}
+
+static int sugov_start(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+ sg_policy->last_freq_update_time = 0;
+ sg_policy->next_freq = UINT_MAX;
+ sg_policy->max_util = 0;
+ sg_policy->pending = false;
+ sg_policy->work_in_progress = false;
+ sg_policy->need_freq_update = false;
+
+ for_each_cpu(cpu, policy->cpus) {
+ struct sugov_cpu *sg_cpu = &per_cpu(sugov_cpu, cpu);
+
+ sg_cpu->sg_policy = sg_policy;
+ if (policy_is_shared(policy)) {
+ sg_cpu->util = 0;
+ sg_cpu->max = 0;
+ sg_cpu->last_update = 0;
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ sugov_update_shared);
+ } else {
+ cpufreq_add_update_util_hook(cpu, &sg_cpu->update_util,
+ sugov_update_single);
+ }
+ }
+ return 0;
+}
+
+static int sugov_stop(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+ unsigned int cpu;
+
+ for_each_cpu(cpu, policy->cpus)
+ cpufreq_remove_update_util_hook(cpu);
+
+ synchronize_sched();
+
+ irq_work_sync(&sg_policy->irq_work);
+ cancel_work_sync(&sg_policy->work);
+ return 0;
+}
+
+static int sugov_limits(struct cpufreq_policy *policy)
+{
+ struct sugov_policy *sg_policy = policy->governor_data;
+
+ mutex_lock(&sg_policy->work_lock);
+
+ if (policy->max < policy->cur)
+ __cpufreq_driver_target(policy, policy->max,
+ CPUFREQ_RELATION_H);
+ else if (policy->min > policy->cur)
+ __cpufreq_driver_target(policy, policy->min,
+ CPUFREQ_RELATION_L);
+
+ mutex_unlock(&sg_policy->work_lock);
+
+ sg_policy->need_freq_update = true;
+ return 0;
+}
+
+int sugov_governor(struct cpufreq_policy *policy, unsigned int event)
+{
+ if (event == CPUFREQ_GOV_POLICY_INIT) {
+ return sugov_init(policy);
+ } else if (policy->governor_data) {
+ switch (event) {
+ case CPUFREQ_GOV_POLICY_EXIT:
+ return sugov_exit(policy);
+ case CPUFREQ_GOV_START:
+ return sugov_start(policy);
+ case CPUFREQ_GOV_STOP:
+ return sugov_stop(policy);
+ case CPUFREQ_GOV_LIMITS:
+ return sugov_limits(policy);
+ }
+ }
+ return -EINVAL;
+}
+
+struct cpufreq_governor schedutil_gov = {
+ .name = "schedutil",
+ .governor = sugov_governor,
+ .owner = THIS_MODULE,
+};
+
+static int __init sugov_module_init(void)
+{
+ return cpufreq_register_governor(&schedutil_gov);
+}
+
+static void __exit sugov_module_exit(void)
+{
+ cpufreq_unregister_governor(&schedutil_gov);
+}
+
+MODULE_AUTHOR("Rafael J. Wysocki <rafael.j.wysocki@intel.com>");
+MODULE_DESCRIPTION("Utilization-based CPU frequency selection");
+MODULE_LICENSE("GPL");
+
+#ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+struct cpufreq_governor *cpufreq_default_governor(void)
+{
+ return &schedutil_gov;
+}
+
+fs_initcall(sugov_module_init);
+#else
+module_init(sugov_module_init);
+#endif
+module_exit(sugov_module_exit);
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
old mode 100644
new mode 100755
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
old mode 100644
new mode 100755
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c
old mode 100644
new mode 100755
index a1aecbe..98bcac3
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -1,10 +1,13 @@
+#include <linux/cpufreq.h>
#include <linux/export.h>
#include <linux/sched.h>
#include <linux/tsacct_kern.h>
#include <linux/kernel_stat.h>
#include <linux/static_key.h>
#include <linux/context_tracking.h>
+#include <linux/cpufreq_times.h>
#include "sched.h"
+#include "walt.h"
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
@@ -49,6 +52,10 @@
unsigned long flags;
s64 delta;
int cpu;
+#ifdef CONFIG_SCHED_WALT
+ u64 wallclock;
+ bool account = true;
+#endif
if (!sched_clock_irqtime)
return;
@@ -56,6 +63,9 @@
local_irq_save(flags);
cpu = smp_processor_id();
+#ifdef CONFIG_SCHED_WALT
+ wallclock = sched_clock_cpu(cpu);
+#endif
delta = sched_clock_cpu(cpu) - __this_cpu_read(irq_start_time);
__this_cpu_add(irq_start_time, delta);
@@ -70,8 +80,16 @@
__this_cpu_add(cpu_hardirq_time, delta);
else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
__this_cpu_add(cpu_softirq_time, delta);
+#ifdef CONFIG_SCHED_WALT
+ else
+ account = false;
+#endif
irq_time_write_end();
+#ifdef CONFIG_SCHED_WALT
+ if (account)
+ walt_account_irqtime(cpu, curr, delta, wallclock);
+#endif
local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(irqtime_account_irq);
@@ -149,6 +167,12 @@
/* Account for user time used */
acct_account_cputime(p);
+
+ /* Account power usage for user time */
+ acct_update_power(p, cputime);
+
+ /* Account per-frequency cpu time for user time */
+ cpufreq_acct_update_power(p, cputime);
}
/*
@@ -199,6 +223,12 @@
/* Account for system time used */
acct_account_cputime(p);
+
+ /* Account power usage for system time */
+ acct_update_power(p, cputime);
+
+ /* Account per-frequency cpu time for system time */
+ cpufreq_acct_update_power(p, cputime);
}
/*
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
old mode 100644
new mode 100755
index e12b0a4..19a38f7
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -18,6 +18,8 @@
#include <linux/slab.h>
+#include "walt.h"
+
struct dl_bandwidth def_dl_bandwidth;
static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -43,6 +45,24 @@
return !RB_EMPTY_NODE(&dl_se->rb_node);
}
+static void add_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ u64 se_bw = dl_se->dl_bw;
+
+ dl_rq->avg_bw += se_bw;
+}
+
+static void clear_average_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
+{
+ u64 se_bw = dl_se->dl_bw;
+
+ dl_rq->avg_bw -= se_bw;
+ if (dl_rq->avg_bw < 0) {
+ WARN_ON(1);
+ dl_rq->avg_bw = 0;
+ }
+}
+
static inline int is_leftmost(struct task_struct *p, struct dl_rq *dl_rq)
{
struct sched_dl_entity *dl_se = &p->dl;
@@ -565,6 +585,9 @@
struct dl_rq *dl_rq = dl_rq_of_se(dl_se);
struct rq *rq = rq_of_dl_rq(dl_rq);
+ if (dl_se->dl_new)
+ add_average_bw(dl_se, dl_rq);
+
/*
* The arrival of a new instance needs special treatment, i.e.,
* the actual scheduling parameters have to be "renewed".
@@ -837,6 +860,10 @@
if (!dl_task(curr) || !on_dl_rq(dl_se))
return;
+ /* Kick cpufreq (see the comment in linux/cpufreq.h). */
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_trigger_update(rq_clock(rq));
+
/*
* Consumed budget is computed considering the time as
* observed by schedulable tasks (excluding time spent
@@ -849,6 +876,11 @@
if (unlikely((s64)delta_exec <= 0))
return;
+#ifdef CPU_FREQ_DEFAULT_GOV_SCHEDUTIL
+ /* kick cpufreq (see the comment in kernel/sched/sched.h). */
+ cpufreq_update_this_cpu(rq, SCHED_CPUFREQ_DL);
+#endif
+
schedstat_set(curr->se.statistics.exec_max,
max(curr->se.statistics.exec_max, delta_exec));
@@ -858,8 +890,6 @@
curr->se.exec_start = rq_clock_task(rq);
cpuacct_charge(curr, delta_exec);
- sched_rt_avg_update(rq, delta_exec);
-
dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec;
if (dl_runtime_exceeded(dl_se)) {
dl_se->dl_throttled = 1;
@@ -977,6 +1007,7 @@
WARN_ON(!dl_prio(prio));
dl_rq->dl_nr_running++;
add_nr_running(rq_of_dl_rq(dl_rq), 1);
+ walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
inc_dl_deadline(dl_rq, deadline);
inc_dl_migration(dl_se, dl_rq);
@@ -991,6 +1022,7 @@
WARN_ON(!dl_rq->dl_nr_running);
dl_rq->dl_nr_running--;
sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+ walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
dec_dl_deadline(dl_rq, dl_se->deadline);
dec_dl_migration(dl_se, dl_rq);
@@ -1367,6 +1399,8 @@
static void task_dead_dl(struct task_struct *p)
{
struct dl_bw *dl_b = dl_bw_of(task_cpu(p));
+ struct dl_rq *dl_rq = dl_rq_of_se(&p->dl);
+ struct rq *rq = rq_of_dl_rq(dl_rq);
/*
* Since we are TASK_DEAD we won't slip out of the domain!
@@ -1375,6 +1409,8 @@
/* XXX we should retain the bw until 0-lag */
dl_b->total_bw -= p->dl.dl_bw;
raw_spin_unlock_irq(&dl_b->lock);
+
+ clear_average_bw(&p->dl, &rq->dl);
}
static void set_curr_task_dl(struct rq *rq)
@@ -1569,6 +1605,7 @@
!cpumask_test_cpu(later_rq->cpu,
&task->cpus_allowed) ||
task_running(rq, task) ||
+ !dl_task(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, later_rq);
later_rq = NULL;
@@ -1682,7 +1719,11 @@
}
deactivate_task(rq, next_task, 0);
+ clear_average_bw(&next_task->dl, &rq->dl);
+ next_task->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(next_task, later_rq->cpu);
+ next_task->on_rq = TASK_ON_RQ_QUEUED;
+ add_average_bw(&next_task->dl, &later_rq->dl);
activate_task(later_rq, next_task, 0);
ret = 1;
@@ -1770,7 +1811,11 @@
resched = true;
deactivate_task(src_rq, p, 0);
+ clear_average_bw(&p->dl, &src_rq->dl);
+ p->on_rq = TASK_ON_RQ_MIGRATING;
set_task_cpu(p, this_cpu);
+ p->on_rq = TASK_ON_RQ_QUEUED;
+ add_average_bw(&p->dl, &this_rq->dl);
activate_task(this_rq, p, 0);
dmin = p->dl.deadline;
@@ -1876,6 +1921,8 @@
if (!start_dl_timer(p))
__dl_clear_params(p);
+ clear_average_bw(&p->dl, &rq->dl);
+
/*
* Since this might be the only -deadline task on the rq,
* this is the right place to try to pull some other one
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
old mode 100644
new mode 100755
index 6415117..5b314f2
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -155,7 +155,7 @@
rcu_read_lock();
for_each_process_thread(g, p) {
- if (task_cpu(p) != rq_cpu)
+ if (!p->on_rq || task_cpu(p) != rq_cpu)
continue;
print_task(m, rq, p);
@@ -597,6 +597,32 @@
P(se.statistics.nr_wakeups_affine_attempts);
P(se.statistics.nr_wakeups_passive);
P(se.statistics.nr_wakeups_idle);
+ /* eas */
+ /* select_idle_sibling() */
+ P(se.statistics.nr_wakeups_sis_attempts);
+ P(se.statistics.nr_wakeups_sis_idle);
+ P(se.statistics.nr_wakeups_sis_cache_affine);
+ P(se.statistics.nr_wakeups_sis_suff_cap);
+ P(se.statistics.nr_wakeups_sis_idle_cpu);
+ P(se.statistics.nr_wakeups_sis_count);
+ /* select_energy_cpu_brute() */
+ P(se.statistics.nr_wakeups_secb_attempts);
+ P(se.statistics.nr_wakeups_secb_sync);
+ P(se.statistics.nr_wakeups_secb_idle_bt);
+ P(se.statistics.nr_wakeups_secb_insuff_cap);
+ P(se.statistics.nr_wakeups_secb_no_nrg_sav);
+ P(se.statistics.nr_wakeups_secb_nrg_sav);
+ P(se.statistics.nr_wakeups_secb_count);
+ /* find_best_target() */
+ P(se.statistics.nr_wakeups_fbt_attempts);
+ P(se.statistics.nr_wakeups_fbt_no_cpu);
+ P(se.statistics.nr_wakeups_fbt_no_sd);
+ P(se.statistics.nr_wakeups_fbt_pref_idle);
+ P(se.statistics.nr_wakeups_fbt_count);
+ /* cas */
+ /* select_task_rq_fair() */
+ P(se.statistics.nr_wakeups_cas_attempts);
+ P(se.statistics.nr_wakeups_cas_count);
{
u64 avg_atom, avg_per_cpu;
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
new file mode 100755
index 0000000..b0656b7
--- /dev/null
+++ b/kernel/sched/energy.c
@@ -0,0 +1,124 @@
+/*
+ * Obtain energy cost data from DT and populate relevant scheduler data
+ * structures.
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) "sched-energy: " fmt
+
+#define DEBUG
+
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched_energy.h>
+#include <linux/stddef.h>
+
+struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+static void free_resources(void)
+{
+ int cpu, sd_level;
+ struct sched_group_energy *sge;
+
+ for_each_possible_cpu(cpu) {
+ for_each_possible_sd_level(sd_level) {
+ sge = sge_array[cpu][sd_level];
+ if (sge) {
+ kfree(sge->cap_states);
+ kfree(sge->idle_states);
+ kfree(sge);
+ }
+ }
+ }
+}
+
+void init_sched_energy_costs(void)
+{
+ struct device_node *cn, *cp;
+ struct capacity_state *cap_states;
+ struct idle_state *idle_states;
+ struct sched_group_energy *sge;
+ const struct property *prop;
+ int sd_level, i, nstates, cpu;
+ const __be32 *val;
+
+ for_each_possible_cpu(cpu) {
+ cn = of_get_cpu_node(cpu, NULL);
+ if (!cn) {
+ pr_warn("CPU device node missing for CPU %d\n", cpu);
+ return;
+ }
+
+ if (!of_find_property(cn, "sched-energy-costs", NULL)) {
+ pr_warn("CPU device node has no sched-energy-costs\n");
+ return;
+ }
+
+ for_each_possible_sd_level(sd_level) {
+ cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
+ if (!cp)
+ break;
+
+ prop = of_find_property(cp, "busy-cost-data", NULL);
+ if (!prop || !prop->value) {
+ pr_warn("No busy-cost data, skipping sched_energy init\n");
+ goto out;
+ }
+
+ sge = kcalloc(1, sizeof(struct sched_group_energy),
+ GFP_NOWAIT);
+
+ nstates = (prop->length / sizeof(u32)) / 2;
+ cap_states = kcalloc(nstates,
+ sizeof(struct capacity_state),
+ GFP_NOWAIT);
+
+ for (i = 0, val = prop->value; i < nstates; i++) {
+ cap_states[i].cap = be32_to_cpup(val++);
+ cap_states[i].power = be32_to_cpup(val++);
+ }
+
+ sge->nr_cap_states = nstates;
+ sge->cap_states = cap_states;
+
+ prop = of_find_property(cp, "idle-cost-data", NULL);
+ if (!prop || !prop->value) {
+ pr_warn("No idle-cost data, skipping sched_energy init\n");
+ goto out;
+ }
+
+ nstates = (prop->length / sizeof(u32));
+ idle_states = kcalloc(nstates,
+ sizeof(struct idle_state),
+ GFP_NOWAIT);
+
+ for (i = 0, val = prop->value; i < nstates; i++)
+ idle_states[i].power = be32_to_cpup(val++);
+
+ sge->nr_idle_states = nstates;
+ sge->idle_states = idle_states;
+
+ sge_array[cpu][sd_level] = sge;
+ }
+ }
+
+ pr_info("Sched-energy-costs installed from DT\n");
+ return;
+
+out:
+ free_resources();
+}
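For reference, a sketch of the device-tree layout this parser expects; the property names come from the code above, while node names and values are purely illustrative:

/*
 *   cpu@0 {
 *       sched-energy-costs = <&CORE_COST &CLUSTER_COST>;  // one phandle per sd level
 *   };
 *   CORE_COST: core-cost {
 *       busy-cost-data = <178 115>, <1024 1024>;  // <capacity power> pairs
 *       idle-cost-data = <6>, <0>;                // one power value per idle state
 *   };
 */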
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
old mode 100644
new mode 100755
index c2af250..b034bf2
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -30,10 +30,30 @@
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>
+#include <linux/of.h>
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+#include <linux/cpuset.h>
+#endif
#include <trace/events/sched.h>
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+#include <linux/sysfs.h>
+#include <linux/vmalloc.h>
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Include cpufreq and ipa headers to add a notifier so that cpu
+ * frequency scaling can track the current CPU frequency and limits.
+ */
+#include <linux/cpufreq.h>
+//#include <linux/ipa.h>
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
#include "sched.h"
+#include "tune.h"
+
+#ifdef CONFIG_SCHED_HMP
+LIST_HEAD(hmp_domains);
+#endif
/*
* Targeted preemption latency for CPU-bound tasks:
@@ -671,6 +691,7 @@
/* Give new sched_entity start runnable values to heavy its load in infant time */
void init_entity_runnable_average(struct sched_entity *se)
{
+ struct task_struct *p = current;
struct sched_avg *sa = &se->avg;
sa->last_update_time = 0;
@@ -682,11 +703,21 @@
sa->period_contrib = 1023;
sa->load_avg = scale_load_down(se->load.weight);
sa->load_sum = sa->load_avg * LOAD_AVG_MAX;
- sa->util_avg = scale_load_down(SCHED_LOAD_SCALE);
- sa->util_sum = sa->util_avg * LOAD_AVG_MAX;
+
+ /* The new task inherited a utilization of current */
+ sa->util_avg = p->se.avg.util_avg;
+ sa->util_sum = p->se.avg.util_sum;
+ trace_sched_entity_initial_util(0, sa->util_avg, sa->util_sum);
+
/* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
+#ifdef CONFIG_SCHED_HMP
+ sa->hmp_load_avg = p->se.avg.hmp_load_avg;
+ sa->hmp_load_sum = p->se.avg.hmp_load_sum;
+#endif
}
+static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq);
+static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq);
#else
void init_entity_runnable_average(struct sched_entity *se)
{
@@ -1191,6 +1222,8 @@
{
if (env->best_task)
put_task_struct(env->best_task);
+ if (p)
+ get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1258,30 +1291,20 @@
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
- bool assigned = false;
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
- * No need to move the exiting task or idle task.
+ * No need to move the exiting task, and this ensures that ->curr
+ * wasn't reaped and thus get_task_struct() in task_numa_assign()
+ * is safe under RCU read lock.
+ * Note that rcu_read_lock() itself can't protect from the final
+ * put_task_struct() after the last schedule().
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
- else {
- /*
- * The task_struct must be protected here to protect the
- * p->numa_faults access in the task_weight since the
- * numa_faults could already be freed in the following path:
- * finish_task_switch()
- * --> put_task_struct()
- * --> __put_task_struct()
- * --> task_numa_free()
- */
- get_task_struct(cur);
- }
-
raw_spin_unlock_irq(&dst_rq->lock);
/*
@@ -1365,7 +1388,6 @@
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
- put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1391,16 +1413,9 @@
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
- assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
- /*
- * The dst_rq->curr isn't assigned. The protection for task_struct is
- * finished.
- */
- if (cur && !assigned)
- put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
@@ -2481,7 +2496,7 @@
* Approximate:
* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
*/
-static __always_inline u64 decay_load(u64 val, u64 n)
+u64 decay_load(u64 val, u64 n)
{
unsigned int local_n;
@@ -2516,7 +2531,7 @@
* We can compute this reasonably efficiently by combining:
* y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
*/
-static u32 __compute_runnable_contrib(u64 n)
+u32 __compute_runnable_contrib(u64 n)
{
u32 contrib = 0;
@@ -2543,6 +2558,306 @@
#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+
+#define HMP_VARIABLE_SCALE_SHIFT 16ULL
+struct hmp_global_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct kobject *kobj,
+ struct attribute *attr, char *buf);
+ ssize_t (*store)(struct kobject *a, struct attribute *b,
+ const char *c, size_t count);
+ int *value;
+ int (*to_sysfs)(int);
+ int (*from_sysfs)(int);
+};
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+#ifdef CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+#define HMP_DATA_SYSFS_MAX 23
+#else
+#define HMP_DATA_SYSFS_MAX 22
+#endif
+#else
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+#define HMP_DATA_SYSFS_MAX 17
+#else
+#define HMP_DATA_SYSFS_MAX 16
+#endif
+#endif
+#else
+#ifdef CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+#define HMP_DATA_SYSFS_MAX 22
+#else
+#define HMP_DATA_SYSFS_MAX 21
+#endif
+#else
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+#define HMP_DATA_SYSFS_MAX 16
+#else
+#define HMP_DATA_SYSFS_MAX 15
+#endif
+#endif
+#endif
+
+struct hmp_data_struct {
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ int freqinvar_load_scale_enabled;
+#endif
+ int multiplier; /* used to scale the time delta */
+ int semiboost_multiplier;
+ int rq_multiplier;
+ struct attribute_group attr_group;
+ struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
+ struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
+} hmp_data = {.multiplier = 1 << HMP_VARIABLE_SCALE_SHIFT,
+ .semiboost_multiplier = 2 << HMP_VARIABLE_SCALE_SHIFT,
+ .rq_multiplier = 4 << HMP_VARIABLE_SCALE_SHIFT};
+
+static u64 hmp_variable_scale_convert(u64 delta);
+static u64 hmp_rq_variable_scale_convert(u64 delta);
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* Frequency-Invariant Load Modification:
+ * Loads are calculated as in PJT's patch; however, we also scale the current
+ * contribution in line with the frequency of the CPU that the task was
+ * executed on.
+ * In this version, we use a simple linear scale derived from the maximum
+ * frequency reported by CPUFreq. As an example:
+ *
+ * Consider that we ran a task for 100% of the previous interval.
+ *
+ * Our CPU was under asynchronous frequency control through one of the
+ * CPUFreq governors.
+ *
+ * The CPUFreq governor reports that it is able to scale the CPU between
+ * 500MHz and 1GHz.
+ *
+ * During the period, the CPU was running at 1GHz.
+ *
+ * In this case, our load contribution for that period is calculated as
+ * 1 * (number_of_active_microseconds)
+ *
+ * This results in our task being able to accumulate maximum load as normal.
+ *
+ *
+ * Consider now that our CPU was executing at 500MHz.
+ *
+ * We now scale the load contribution such that it is calculated as
+ * 0.5 * (number_of_active_microseconds)
+ *
+ * Our task can only record 50% maximum load during this period.
+ *
+ * This represents the task consuming 50% of the CPU's *possible* compute
+ * capacity. However the task did consume 100% of the CPU's *available*
+ * compute capacity which is the value seen by the CPUFreq governor and
+ * user-side CPU Utilization tools.
+ *
+ * Restricting tracked load to be scaled by the CPU's frequency accurately
+ * represents the consumption of possible compute capacity and allows the
+ * HMP migration's simple threshold migration strategy to interact more
+ * predictably with CPUFreq's asynchronous compute capacity changes.
+ */
+#define SCHED_FREQSCALE_SHIFT 10
+struct cpufreq_extents {
+ u32 curr_scale;
+ u32 cpufreq_min;
+ u32 cpufreq_max;
+ u32 thermal_min;
+ u32 thermal_max;
+ u32 min;
+ u32 max;
+ u32 flags;
+};
+/* Flag set when the governor in use only allows one frequency.
+ * Disables scaling.
+ */
+#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
+
+static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
+
+unsigned long exynos_scale_freq_capacity(struct sched_domain *sd, int cpu)
+{
+ /* retrieve scale factor for load */
+ if (hmp_data.freqinvar_load_scale_enabled)
+ return freq_scale[cpu].curr_scale;
+ else
+ return SCHED_CAPACITY_SCALE;
+}
+
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
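A minimal userspace sketch of the linear frequency-invariance described in the comment block above. How freq_scale[].curr_scale is refreshed is not part of this hunk, so the freq-to-scale mapping below (curr_khz * 1024 / max_khz) is an assumption consistent with the 500MHz/1GHz example.

    /* Sketch of the linear frequency-invariant scaling described above. */
    #include <stdio.h>

    #define SCHED_FREQSCALE_SHIFT 10

    /* assumed mapping: scale = curr * 1024 / max (not shown in this hunk) */
    static unsigned int freq_to_scale(unsigned int curr_khz, unsigned int max_khz)
    {
        return (curr_khz << SCHED_FREQSCALE_SHIFT) / max_khz;
    }

    int main(void)
    {
        unsigned int max = 1000000;                      /* 1 GHz in kHz */
        unsigned int freqs[] = { 500000, 750000, 1000000 };
        unsigned long active_us = 1000;                  /* ran the whole period */

        for (int i = 0; i < 3; i++) {
            unsigned int scale = freq_to_scale(freqs[i], max);
            unsigned long contrib = (active_us * scale) >> SCHED_FREQSCALE_SHIFT;

            /* at 500 MHz only half of the possible capacity was consumed */
            printf("freq=%7u kHz scale=%4u contrib=%4lu/%lu\n",
                   freqs[i], scale, contrib, active_us);
        }
        return 0;
    }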
+
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Migration thresholds should be in the range [0..1023]
+ * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
+ * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
+ * The values set below (700, 256) offer good responsiveness, but may need
+ * tweaking to suit particular needs.
+ */
+
+unsigned int hmp_up_threshold = 700;
+unsigned int hmp_down_threshold = 256;
+
+unsigned int hmp_semiboost_up_threshold = 400;
+unsigned int hmp_semiboost_down_threshold = 150;
+
+#if defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+/* Ex: 256 = /4, 512 = /2, 1024 = x1, 1536 = x1.5, 2048 = x2 */
+u64 hmp_up_compst_ratio = 512; /* HMP UP COMPENSATION RATIO */
+u64 hmp_down_compst_ratio = 2048; /* HMP DOWN COMPENSATION RATIO */
+#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+unsigned int hmp_packing_enabled = 1;
+unsigned int hmp_packing_threshold = 460; /* 45% of the NICE_0_LOAD */
+
+#ifdef CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING
+#include <linux/pm_qos.h>
+#include <linux/irq_work.h>
+static void hmp_do_tbsoftlanding(int cpu, unsigned long load);
+static void hmp_tbsoftlanding_update_thr(void);
+
+typedef enum {
+ TBSL_LV_HIGH,
+ TBSL_LV_MID,
+ TBSL_LV_LOW,
+ TBSL_LV_END,
+} tbsl_level;
+
+struct tbsl_dat {
+ struct pm_qos_request pm_qos;
+ int freq;
+ int threshold;
+};
+
+static struct {
+ struct workqueue_struct *workqueue;
+ struct work_struct work;
+ struct irq_work irq_work;
+ struct tbsl_dat data[TBSL_LV_END];
+ int enabled;
+ int timeout;
+ int threshold;
+ unsigned int last_lv;
+} hmp_tbsoftlanding;
+#endif
+
+/*
+ * Needed to determine heaviest tasks etc.
+ */
+static inline unsigned int hmp_cpu_is_fastest(int cpu);
+static inline unsigned int hmp_cpu_is_slowest(int cpu);
+static inline struct hmp_domain *hmp_slower_domain(int cpu);
+static inline struct hmp_domain *hmp_faster_domain(int cpu);
+#endif
+
+static inline unsigned long task_util(struct task_struct *p);
+#ifdef CONFIG_SCHED_TUNE
+static unsigned long
+schedtune_margin(unsigned long signal, unsigned long boost)
+{
+ unsigned long long margin = 0;
+
+ /*
+ * Signal proportional compensation (SPC)
+ *
+ * The Boost (B) value is used to compute a Margin (M) which is
+ * proportional to the complement of the original Signal (S):
+ * M = B * (SCHED_LOAD_SCALE - S)
+ * The obtained M could be used by the caller to "boost" S.
+ */
+ margin = SCHED_LOAD_SCALE - signal;
+ margin *= boost;
+
+ /*
+ * Fast integer division by constant:
+ * Constant : (C) = 100
+ * Precision : 0.1% (P) = 0.1
+ * Reference : C * 100 / P (R) = 100000
+ *
+ * Thus:
+ * Shift bits : ceil(log(R,2)) (S) = 17
+ * Mult const : round(2^S/C) (M) = 1311
+ *
+ *
+ */
+ margin *= 1311;
+ margin >>= 17;
+
+ return margin;
+}
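The block comment inside schedtune_margin() replaces a divide-by-100 with a multiply-and-shift. A small standalone sketch of that fixed-point step, assuming SCHED_LOAD_SCALE = 1024 and boost expressed in percent:

    /* Userspace sketch of the SPC margin fixed-point math used above. */
    #include <stdio.h>

    #define SCHED_LOAD_SCALE 1024UL

    static unsigned long spc_margin(unsigned long signal, unsigned long boost)
    {
        unsigned long long margin = (SCHED_LOAD_SCALE - signal) * boost;

        /* divide by 100 as (x * 1311) >> 17, accurate to ~0.1% */
        margin *= 1311;
        margin >>= 17;
        return (unsigned long)margin;
    }

    int main(void)
    {
        unsigned long signal, boost;

        for (boost = 0; boost <= 100; boost += 25)
            for (signal = 0; signal <= SCHED_LOAD_SCALE; signal += 256) {
                unsigned long exact = (SCHED_LOAD_SCALE - signal) * boost / 100;
                printf("signal=%4lu boost=%3lu margin=%4lu exact=%4lu\n",
                       signal, boost, spc_margin(signal, boost), exact);
            }
        return 0;
    }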
+
+static inline unsigned int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ unsigned int boost = 0;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ boost += schedtune_cpu_boost(cpu);
+#else
+ boost += get_sysctl_sched_cfs_boost();
+#endif
+ if (boost == 0)
+ return 0;
+
+ return schedtune_margin(util, boost);
+}
+
+static inline unsigned long
+schedtune_task_margin(struct task_struct *task)
+{
+ unsigned int boost;
+ unsigned long util;
+ unsigned long margin;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ boost = schedtune_task_boost(task);
+#else
+ boost = get_sysctl_sched_cfs_boost();
+#endif
+ if (boost == 0)
+ return 0;
+
+ util = task_util(task);
+ margin = schedtune_margin(util, boost);
+
+ return margin;
+}
+#else /* CONFIG_SCHED_TUNE */
+static inline unsigned int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+ return 0;
+}
+
+static inline unsigned int
+schedtune_task_margin(struct task_struct *task)
+{
+ return 0;
+}
+#endif
+
+static inline unsigned long
+boosted_cpu_util(unsigned long util, int cpu)
+{
+ unsigned long margin = schedtune_cpu_margin(util, cpu);
+
+ trace_sched_boost_cpu(cpu, util, margin);
+
+ return util + margin;
+}
+
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+ unsigned long util = task_util(task);
+ unsigned long margin = schedtune_task_margin(task);
+
+ return util + margin;
+}
+
/*
* We can represent the historical contribution to runnable average as the
* coefficients of a geometric series. To do this we sub-divide our runnable
@@ -2571,6 +2886,12 @@
* load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
* = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
*/
+/*
+ * If cpu capacity were realized appropriately for the arch,
+ * hmp_load_sum would have to be multiplied by cpu_capacity.
+ * Currently, util_sum is the decayed accumulation of running time
+ * multiplied by SCHED_CAPACITY_SCALE (same as NICE_0_LOAD).
+ */
static __always_inline int
__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
unsigned long weight, int running, struct cfs_rq *cfs_rq)
@@ -2579,8 +2900,35 @@
u32 contrib;
unsigned int delta_w, scaled_delta_w, decayed = 0;
unsigned long scale_freq, scale_cpu;
+#ifdef CONFIG_FREQVAR_SCHEDTUNE
+ unsigned int boost_vector = 1024;
+ unsigned int cap;
+
+ /*
+ * Boost the task load (util_sum/avg); the load of the cfs_rq is not
+ * boosted. The boost ratio changes with the frequency scale.
+ * 1024 is the default boost_vector and has no effect; a boost_vector
+ * of 2048 adds a contribution twice as large as the original load.
+ */
+ if (cfs_rq && cfs_rq->nr_running)
+ cap = 1024 - (sa->util_avg / cfs_rq->nr_running);
+ else
+ cap = 1024 - sa->util_avg;
+ boost_vector += (cap * schedtune_freqvar_boost(cpu)) >> SCHED_CAPACITY_SHIFT;
+#endif
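A sketch of the boost_vector arithmetic above; schedtune_freqvar_boost() is the kernel hook and is replaced here by a plain parameter (an assumption, since that helper lives outside this hunk):

    /* The less utilized the rq, the more headroom and the larger the boost. */
    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT 10

    static unsigned int boost_vector(unsigned int util_avg, unsigned int nr_running,
                                     unsigned int freqvar_boost)
    {
        unsigned int cap, vec = 1024;   /* 1024 == no boost */

        if (nr_running)
            cap = 1024 - util_avg / nr_running;
        else
            cap = 1024 - util_avg;
        vec += (cap * freqvar_boost) >> SCHED_CAPACITY_SHIFT;
        return vec;
    }

    int main(void)
    {
        /* lightly loaded rq, boost 512 (50%): contribution grows by ~45% */
        printf("%u\n", boost_vector(100, 1, 512));   /* 1486 */
        /* nearly full rq: almost no headroom, almost no extra boost */
        printf("%u\n", boost_vector(1000, 1, 512));  /* 1036 */
        return 0;
    }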
delta = now - sa->last_update_time;
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+ /*
+ * delta can be scaled for certain purposes (e.g. scheduler-based
+ * hotplug out). Use hmp_rq_variable_scale_convert() for an rq
+ * (cfs_rq) and hmp_variable_scale_convert() for a task entity.
+ */
+ if (cfs_rq)
+ delta = hmp_rq_variable_scale_convert(delta);
+ else
+ delta = hmp_variable_scale_convert(delta);
+#endif
/*
* This should only happen when time goes backwards, which it
* unfortunately does during sched clock init when we swap over to TSC.
@@ -2619,13 +2967,26 @@
scaled_delta_w = cap_scale(delta_w, scale_freq);
if (weight) {
sa->load_sum += weight * scaled_delta_w;
+#ifdef CONFIG_SCHED_HMP
+ sa->hmp_load_sum += scaled_delta_w;
+#endif
if (cfs_rq) {
cfs_rq->runnable_load_sum +=
weight * scaled_delta_w;
}
}
- if (running)
+ if (running) {
+#ifndef CONFIG_FREQVAR_SCHEDTUNE
sa->util_sum += scaled_delta_w * scale_cpu;
+#else
+ /* applying utilization boost */
+ sa->util_sum += (scaled_delta_w * scale_cpu * boost_vector)
+ >> SCHED_CAPACITY_SHIFT;
+ trace_schedtune_boost_util(cfs_rq ? "RQ" : "TASK", scaled_delta_w * scale_cpu,
+ (scaled_delta_w * scale_cpu * boost_vector) >> SCHED_CAPACITY_SHIFT,
+ boost_vector);
+#endif
+ }
delta -= delta_w;
@@ -2634,6 +2995,9 @@
delta %= 1024;
sa->load_sum = decay_load(sa->load_sum, periods + 1);
+#ifdef CONFIG_SCHED_HMP
+ sa->hmp_load_sum = decay_load(sa->hmp_load_sum, periods + 1);
+#endif
if (cfs_rq) {
cfs_rq->runnable_load_sum =
decay_load(cfs_rq->runnable_load_sum, periods + 1);
@@ -2645,17 +3009,34 @@
contrib = cap_scale(contrib, scale_freq);
if (weight) {
sa->load_sum += weight * contrib;
+#ifdef CONFIG_SCHED_HMP
+ sa->hmp_load_sum += contrib;
+#endif
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * contrib;
}
- if (running)
+ if (running) {
+#ifndef CONFIG_FREQVAR_SCHEDTUNE
sa->util_sum += contrib * scale_cpu;
+#else
+ /* applying utilization boost */
+ sa->util_sum += (contrib * scale_cpu * boost_vector)
+ >> SCHED_CAPACITY_SHIFT;
+ trace_schedtune_boost_util(cfs_rq ? "RQ" : "TASK", contrib * scale_cpu,
+ (contrib * scale_cpu * boost_vector) >> SCHED_CAPACITY_SHIFT,
+ boost_vector);
+#endif
+ }
}
/* Remainder of delta accrued against u_0` */
scaled_delta = cap_scale(delta, scale_freq);
if (weight) {
sa->load_sum += weight * scaled_delta;
+#ifdef CONFIG_SCHED_HMP
+ sa->hmp_load_sum += scaled_delta;
+#endif
+
if (cfs_rq)
cfs_rq->runnable_load_sum += weight * scaled_delta;
}
@@ -2666,10 +3047,21 @@
if (decayed) {
sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+#ifdef CONFIG_SCHED_HMP
+ sa->hmp_load_avg = sa->hmp_load_sum * scale_load_down(NICE_0_LOAD);
+ sa->hmp_load_avg = div_u64(sa->hmp_load_avg, LOAD_AVG_MAX);
+#endif
if (cfs_rq) {
cfs_rq->runnable_load_avg =
div_u64(cfs_rq->runnable_load_sum, LOAD_AVG_MAX);
}
+#ifdef CONFIG_SCHED_HMP
+ else {
+ if(!hmp_cpu_is_fastest(cpu) &&
+ sa->hmp_load_avg > hmp_up_threshold)
+ cpu_rq(smp_processor_id())->next_balance = jiffies;
+ }
+#endif
sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
}
@@ -2691,46 +3083,113 @@
}
}
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+
+ /*
+ * We are supposed to update the task to "current" time, then it's up
+ * to date and ready to go to the new CPU/cfs_rq. But we have difficulty
+ * getting what the current time is, so simply throw away the
+ * out-of-date time. This results in the wakee task being less decayed,
+ * but giving the wakee more load sounds not bad.
+ */
+ if (se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)),
+ &se->avg, 0, 0, NULL);
+ se->avg.last_update_time = n_last_update_time;
+ }
+}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {}
#endif /* CONFIG_FAIR_GROUP_SCHED */
static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
-/*
- * Unsigned subtract and clamp on underflow.
- *
- * Explicitly do a load-store to ensure the intermediate value never hits
- * memory. This allows lockless observations without ever seeing the negative
- * values.
- */
-#define sub_positive(_ptr, _val) do { \
- typeof(_ptr) ptr = (_ptr); \
- typeof(*ptr) val = (_val); \
- typeof(*ptr) res, var = READ_ONCE(*ptr); \
- res = var - val; \
- if (res > var) \
- res = 0; \
- WRITE_ONCE(*ptr, res); \
-} while (0)
+static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
+{
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+
+ if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) {
+ unsigned long max = rq->cpu_capacity_orig;
+ unsigned long req_cap = boosted_cpu_util(cfs_rq->avg.util_avg, cpu);
+
+ /*
+ * There are a few boundary cases this might miss but it should
+ * get called often enough that that should (hopefully) not be
+ * a real problem -- added to that it only calls on the local
+ * CPU, so if we enqueue remotely we'll miss an update, but
+ * the next tick/schedule should update.
+ *
+ * It will not get called when we go idle, because the idle
+ * thread is a different class (!fair), nor will the utilization
+ * number include things like RT tasks.
+ *
+ * As is, the util number is not freq-invariant (we'd have to
+ * implement arch_scale_freq_capacity() for that).
+ *
+ * See cpu_util().
+ */
+
+ trace_sched_rq_util_avg(cpu_of(rq_of(cfs_rq)), cfs_rq->avg.util_avg,
+ req_cap);
+
+ cpufreq_update_util(rq_clock(rq), min(req_cap, max), max);
+ }
+}
/* Group cfs_rq's load_avg is used for task_h_load and update_cfs_share */
-static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
+static inline int
+update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq, bool update_freq)
{
struct sched_avg *sa = &cfs_rq->avg;
- int decayed, removed = 0;
+ int decayed, removed_load = 0, removed_util = 0;
if (atomic_long_read(&cfs_rq->removed_load_avg)) {
s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0);
- sub_positive(&sa->load_avg, r);
- sub_positive(&sa->load_sum, r * LOAD_AVG_MAX);
- removed = 1;
+ sa->load_avg = max_t(long, sa->load_avg - r, 0);
+ sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+#ifdef CONFIG_SCHED_HMP
+ sa->hmp_load_avg = max_t(long, sa->hmp_load_avg - r, 0);
+#endif
+ removed_load = 1;
}
if (atomic_long_read(&cfs_rq->removed_util_avg)) {
long r = atomic_long_xchg(&cfs_rq->removed_util_avg, 0);
- sub_positive(&sa->util_avg, r);
- sub_positive(&sa->util_sum, r * LOAD_AVG_MAX);
+ sa->util_avg = max_t(long, sa->util_avg - r, 0);
+ sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ removed_util = 1;
}
decayed = __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
@@ -2741,7 +3200,10 @@
cfs_rq->load_last_update_time_copy = sa->last_update_time;
#endif
- return decayed || removed;
+ if (update_freq && (decayed || removed_util))
+ cfs_rq_util_change(cfs_rq);
+
+ return decayed || removed_load;
}
/* Update task and its cfs_rq load average */
@@ -2749,19 +3211,36 @@
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 now = cfs_rq_clock_task(cfs_rq);
- int cpu = cpu_of(rq_of(cfs_rq));
+ struct rq *rq = rq_of(cfs_rq);
+ int cpu = cpu_of(rq);
+ int decayed;
/*
* Track task load average for carrying it to new CPU after migrated, and
* track group sched_entity load average for task_h_load calc in migration
*/
- __update_load_avg(now, cpu, &se->avg,
+ decayed = __update_load_avg(now, cpu, &se->avg,
se->on_rq * scale_load_down(se->load.weight),
cfs_rq->curr == se, NULL);
- if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg)
+ if (decayed) {
+#ifdef CONFIG_SCHED_HMP
+ trace_sched_rq_runnable_ratio(cpu_of(rq_of(cfs_rq)), cfs_rq->avg.hmp_load_avg);
+#endif
+ trace_sched_rq_runnable_load(cpu_of(rq_of(cfs_rq)), cfs_rq->runnable_load_avg);
+
+ if(entity_is_task(se)) {
+#ifdef CONFIG_SCHED_HMP
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.hmp_load_avg);
+#endif
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg);
+ trace_sched_task_util_contrib(task_of(se), se->avg.util_avg);
+ }
+ }
+
+ if (update_cfs_rq_load_avg(now, cfs_rq, true) && update_tg)
update_tg_load_avg(cfs_rq, 0);
-}
+ }
static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
@@ -2773,13 +3252,21 @@
* have aged the average right before clearing @last_update_time.
*/
if (se->avg.last_update_time) {
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
- &se->avg, 0, 0, NULL);
+ if(__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+ &se->avg, 0, 0, NULL)) {
/*
* XXX: we could have just aged the entire load away if we've been
* absent from the fair class for too long.
*/
+ if(entity_is_task(se)) {
+#ifdef CONFIG_SCHED_HMP
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.hmp_load_avg);
+#endif
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg);
+ trace_sched_task_util_contrib(task_of(se), se->avg.util_avg);
+ }
+ }
}
skip_aging:
@@ -2788,18 +3275,40 @@
cfs_rq->avg.load_sum += se->avg.load_sum;
cfs_rq->avg.util_avg += se->avg.util_avg;
cfs_rq->avg.util_sum += se->avg.util_sum;
+#ifdef CONFIG_SCHED_HMP
+ cfs_rq->avg.hmp_load_avg += se->avg.hmp_load_avg;
+ cfs_rq->avg.hmp_load_sum += se->avg.hmp_load_sum;
+ trace_sched_rq_runnable_ratio(cpu_of(rq_of(cfs_rq)), cfs_rq->avg.hmp_load_avg);
+#endif
+
+ cfs_rq_util_change(cfs_rq);
}
static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- __update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
+ if(__update_load_avg(cfs_rq->avg.last_update_time, cpu_of(rq_of(cfs_rq)),
&se->avg, se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ cfs_rq->curr == se, NULL)) {
+ if(entity_is_task(se)) {
+#ifdef CONFIG_SCHED_HMP
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.hmp_load_avg);
+#endif
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg);
+ trace_sched_task_util_contrib(task_of(se), se->avg.util_avg);
+ }
+ }
- sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
- sub_positive(&cfs_rq->avg.load_sum, se->avg.load_sum);
- sub_positive(&cfs_rq->avg.util_avg, se->avg.util_avg);
- sub_positive(&cfs_rq->avg.util_sum, se->avg.util_sum);
+ cfs_rq->avg.load_avg = max_t(long, cfs_rq->avg.load_avg - se->avg.load_avg, 0);
+ cfs_rq->avg.load_sum = max_t(s64, cfs_rq->avg.load_sum - se->avg.load_sum, 0);
+ cfs_rq->avg.util_avg = max_t(long, cfs_rq->avg.util_avg - se->avg.util_avg, 0);
+ cfs_rq->avg.util_sum = max_t(s32, cfs_rq->avg.util_sum - se->avg.util_sum, 0);
+#ifdef CONFIG_SCHED_HMP
+ cfs_rq->avg.hmp_load_avg = max_t(long, cfs_rq->avg.hmp_load_avg - se->avg.hmp_load_avg, 0);
+ cfs_rq->avg.hmp_load_sum = max_t(s64, cfs_rq->avg.hmp_load_sum - se->avg.hmp_load_sum, 0);
+ trace_sched_rq_runnable_ratio(cpu_of(rq_of(cfs_rq)), cfs_rq->avg.hmp_load_avg);
+#endif
+
+ cfs_rq_util_change(cfs_rq);
}
/* Add the load generated by se into cfs_rq's load average */
@@ -2812,15 +3321,24 @@
migrated = !sa->last_update_time;
if (!migrated) {
- __update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
+ if(__update_load_avg(now, cpu_of(rq_of(cfs_rq)), sa,
se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL);
+ cfs_rq->curr == se, NULL)) {
+ if(entity_is_task(se)) {
+#ifdef CONFIG_SCHED_HMP
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.hmp_load_avg);
+#endif
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg);
+ trace_sched_task_util_contrib(task_of(se), se->avg.util_avg);
+ }
+ }
}
- decayed = update_cfs_rq_load_avg(now, cfs_rq);
+ decayed = update_cfs_rq_load_avg(now, cfs_rq, !migrated);
cfs_rq->runnable_load_avg += sa->load_avg;
cfs_rq->runnable_load_sum += sa->load_sum;
+ trace_sched_rq_runnable_load(cpu_of(rq_of(cfs_rq)), cfs_rq->runnable_load_avg);
if (migrated)
attach_entity_load_avg(cfs_rq, se);
@@ -2837,6 +3355,7 @@
cfs_rq->runnable_load_avg =
max_t(long, cfs_rq->runnable_load_avg - se->avg.load_avg, 0);
+ trace_sched_rq_runnable_load(cpu_of(rq_of(cfs_rq)), cfs_rq->runnable_load_avg);
cfs_rq->runnable_load_sum =
max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0);
}
@@ -2845,7 +3364,7 @@
* Task first catches up with cfs_rq, and then subtract
* itself from the cfs_rq (task must be off the queue now).
*/
-void remove_entity_load_avg(struct sched_entity *se)
+static void remove_entity_load_avg(struct sched_entity *se)
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
u64 last_update_time;
@@ -2865,6 +3384,14 @@
__update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL);
atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg);
atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg);
+
+ if(entity_is_task(se)) {
+ trace_sched_task_load_contrib(task_of(se), se->avg.load_avg);
+ trace_sched_task_util_contrib(task_of(se), se->avg.util_avg);
+#ifdef CONFIG_SCHED_HMP
+ trace_sched_task_runnable_ratio(task_of(se), se->avg.hmp_load_avg);
+#endif
+ }
}
/*
@@ -2963,6 +3490,7 @@
}
trace_sched_stat_blocked(tsk, delta);
+ trace_sched_blocked_reason(tsk);
/*
* Blocking time is in units of nanosecs, so shift by
@@ -3137,6 +3665,7 @@
if (se != cfs_rq->curr)
__dequeue_entity(cfs_rq, se);
se->on_rq = 0;
+
account_entity_dequeue(cfs_rq, se);
/*
@@ -3934,26 +4463,6 @@
if (!cfs_bandwidth_used())
return;
- /* Synchronize hierarchical throttle counter: */
- if (unlikely(!cfs_rq->throttle_uptodate)) {
- struct rq *rq = rq_of(cfs_rq);
- struct cfs_rq *pcfs_rq;
- struct task_group *tg;
-
- cfs_rq->throttle_uptodate = 1;
-
- /* Get closest up-to-date node, because leaves go first: */
- for (tg = cfs_rq->tg->parent; tg; tg = tg->parent) {
- pcfs_rq = tg->cfs_rq[cpu_of(rq)];
- if (pcfs_rq->throttle_uptodate)
- break;
- }
- if (tg) {
- cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
- }
- }
-
/* an active group must be handled by the update_curr()->put() path */
if (!cfs_rq->runtime_enabled || cfs_rq->curr)
return;
@@ -4235,8 +4744,10 @@
update_cfs_shares(cfs_rq);
}
- if (!se)
+ if (!se) {
add_nr_running(rq, 1);
+ schedtune_enqueue_task(p, cpu_of(rq));
+ }
hrtick_update(rq);
}
@@ -4270,14 +4781,15 @@
/* Don't dequeue parent if it has other entities besides us */
if (cfs_rq->load.weight) {
- /* Avoid re-evaluating load for this entity: */
- se = parent_entity(se);
/*
* Bias pick_next to pick a task from this cfs_rq, as
* p is sleeping when it is within its sched_slice.
*/
- if (task_sleep && se && !throttled_hierarchy(cfs_rq))
- set_next_buddy(se);
+ if (task_sleep && parent_entity(se))
+ set_next_buddy(parent_entity(se));
+
+ /* avoid re-evaluating load for this entity */
+ se = parent_entity(se);
break;
}
flags |= DEQUEUE_SLEEP;
@@ -4294,8 +4806,10 @@
update_cfs_shares(cfs_rq);
}
- if (!se)
+ if (!se) {
sub_nr_running(rq, 1);
+ schedtune_dequeue_task(p, cpu_of(rq));
+ }
hrtick_update(rq);
}
@@ -4528,10 +5042,12 @@
return cpu_rq(cpu)->cpu_capacity;
}
+#ifndef CONFIG_SCHED_USE_FLUID_RT
static unsigned long capacity_orig_of(int cpu)
{
return cpu_rq(cpu)->cpu_capacity_orig;
}
+#endif
static unsigned long cpu_avg_load_per_task(int cpu)
{
@@ -4644,24 +5160,19 @@
return wl;
for_each_sched_entity(se) {
- struct cfs_rq *cfs_rq = se->my_q;
- long W, w = cfs_rq_load_avg(cfs_rq);
+ long w, W;
- tg = cfs_rq->tg;
+ tg = se->my_q->tg;
/*
* W = @wg + \Sum rw_j
*/
- W = wg + atomic_long_read(&tg->load_avg);
-
- /* Ensure \Sum rw_j >= rw_i */
- W -= cfs_rq->tg_load_avg_contrib;
- W += w;
+ W = wg + calc_tg_weight(tg, se->my_q);
/*
* w = rw_i + @wl
*/
- w += wl;
+ w = cfs_rq_load_avg(se->my_q) + wl;
/*
* wl = S * s'_i; see (2)
@@ -4796,6 +5307,37 @@
return 1;
}
+static inline unsigned long task_util(struct task_struct *p)
+{
+ return p->se.avg.util_avg;
+}
+
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+static unsigned int capacity_margin = 1280; /* ~20% margin */
+static inline unsigned long boosted_task_util(struct task_struct *task);
+
+static inline bool __task_fits(struct task_struct *p, int cpu, int util)
+{
+ unsigned long capacity = capacity_of(cpu);
+
+ util += boosted_task_util(p);
+
+ return (capacity * 1024) > (util * capacity_margin);
+}
+
+static inline bool task_fits_max(struct task_struct *p, int cpu)
+{
+ return __task_fits(p, cpu, 0);
+}
+
+static int cpu_util(int cpu);
+
+static inline bool task_fits_spare(struct task_struct *p, int cpu)
+{
+ return __task_fits(p, cpu, cpu_util(cpu));
+}
+#endif
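A quick standalone check of the headroom test used by __task_fits(): with capacity_margin = 1280, a task fits when its (boosted) utilization stays below roughly 80% of the cpu's capacity.

    #include <stdio.h>
    #include <stdbool.h>

    /* capacity * 1024 > util * margin  <=>  util < capacity * 1024 / margin */
    static bool task_fits(unsigned long util, unsigned long capacity,
                          unsigned long margin)
    {
        return capacity * 1024 > util * margin;
    }

    int main(void)
    {
        unsigned long capacity = 1024, margin = 1280;   /* cutoff ~819 */

        printf("util 800 fits: %d\n", task_fits(800, capacity, margin)); /* 1 */
        printf("util 900 fits: %d\n", task_fits(900, capacity, margin)); /* 0 */
        return 0;
    }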
+
/*
* find_idlest_group finds and returns the least busy CPU group within the
* domain.
@@ -4808,6 +5350,10 @@
unsigned long min_load = ULONG_MAX, this_load = 0;
int load_idx = sd->forkexec_idx;
int imbalance = 100 + (sd->imbalance_pct-100)/2;
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ struct sched_group *fit_group = NULL;
+ unsigned long fit_capacity = ULONG_MAX;
+#endif
if (sd_flag & SD_BALANCE_WAKE)
load_idx = sd->wake_idx;
@@ -4836,6 +5382,16 @@
load = target_load(i, load_idx);
avg_load += load;
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ /*
+ * Look for the most energy-efficient group that can
+ * fit the task.
+ */
+ if (capacity_of(i) < fit_capacity && task_fits_spare(p, i)) {
+ fit_capacity = capacity_of(i);
+ fit_group = group;
+ }
+#endif
}
/* Adjust by relative CPU capacity of the group */
@@ -4849,6 +5405,11 @@
}
} while (group = group->next, group != sd->groups);
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ if (fit_group)
+ return fit_group;
+#endif
+
if (!idlest || 100*this_load < imbalance*min_load)
return NULL;
return idlest;
@@ -4869,7 +5430,11 @@
/* Traverse only the allowed CPUs */
for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ if (task_fits_spare(p, i)) {
+#else
if (idle_cpu(i)) {
+#endif
struct rq *rq = cpu_rq(i);
struct cpuidle_state *idle = idle_get_state(rq);
if (idle && idle->exit_latency < min_exit_latency) {
@@ -4881,7 +5446,12 @@
min_exit_latency = idle->exit_latency;
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ } else if (idle_cpu(i) &&
+ (!idle || idle->exit_latency == min_exit_latency) &&
+#else
} else if ((!idle || idle->exit_latency == min_exit_latency) &&
+#endif
rq->idle_stamp > latest_idle_timestamp) {
/*
* If equal or no active idle state, then
@@ -4890,6 +5460,15 @@
*/
latest_idle_timestamp = rq->idle_stamp;
shallowest_idle_cpu = i;
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ } else if (shallowest_idle_cpu == -1) {
+ /*
+ * If we haven't found an idle CPU yet
+ * pick a non-idle one that can fit the task as
+ * fallback.
+ */
+ shallowest_idle_cpu = i;
+#endif
}
} else if (shallowest_idle_cpu == -1) {
load = weighted_cpuload(i);
@@ -4948,6 +5527,1292 @@
return target;
}
+#ifdef CONFIG_SCHED_HMP
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * The cpu types are distinguished using a list of hmp_domains
+ * which each represent one cpu type using a cpumask.
+ * The list is assumed to be ordered by compute capacity, with the
+ * fastest domain first.
+ */
+DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+static const int hmp_max_tasks = 5;
+
+extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
+
+/* Setup hmp_domains */
+static int __init hmp_cpu_mask_setup(void)
+{
+ char buf[64];
+ struct hmp_domain *domain;
+ struct list_head *pos;
+ int dc, cpu;
+
+ pr_debug("Initializing HMP scheduler:\n");
+
+ /* Initialize hmp_domains using platform code */
+ arch_get_hmp_domains(&hmp_domains);
+ if (list_empty(&hmp_domains)) {
+ pr_debug("HMP domain list is empty!\n");
+ return 0;
+ }
+
+ /* Print hmp_domains */
+ dc = 0;
+ list_for_each(pos, &hmp_domains) {
+ domain = list_entry(pos, struct hmp_domain, hmp_domains);
+ /* cpulist_scnprintf() no longer exists; format the mask via %*pbl */
+ scnprintf(buf, sizeof(buf), "%*pbl", cpumask_pr_args(&domain->possible_cpus));
+ pr_debug(" HMP domain %d: %s\n", dc, buf);
+
+ for_each_cpu(cpu, &(domain->possible_cpus)) {
+ per_cpu(hmp_cpu_domain, cpu) = domain;
+ }
+ dc++;
+ }
+
+ return 1;
+}
+
+static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
+{
+ struct hmp_domain *domain;
+ struct list_head *pos;
+
+ list_for_each(pos, &hmp_domains) {
+ domain = list_entry(pos, struct hmp_domain, hmp_domains);
+ if(cpumask_test_cpu(cpu, &domain->possible_cpus))
+ return domain;
+ }
+ return NULL;
+}
+
+static void hmp_online_cpu(int cpu)
+{
+ struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
+
+ if(domain)
+ cpumask_set_cpu(cpu, &domain->cpus);
+}
+
+static void hmp_offline_cpu(int cpu)
+{
+ struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
+
+ if(domain)
+ cpumask_clear_cpu(cpu, &domain->cpus);
+}
+
+/* must hold the runqueue lock of the queue se is currently on */
+
+static struct sched_entity *hmp_get_heaviest_task(struct sched_entity* se, int migrate_up)
+{
+ int num_tasks = hmp_max_tasks;
+ struct sched_entity *max_se = se;
+ unsigned long int max_ratio = se->avg.hmp_load_avg;
+ const struct cpumask *hmp_target_mask = NULL;
+
+ if (migrate_up) {
+ struct hmp_domain *hmp;
+ if(hmp_cpu_is_fastest(cpu_of(se->cfs_rq->rq)))
+ return max_se;
+
+ hmp = hmp_faster_domain(cpu_of(se->cfs_rq->rq));
+ hmp_target_mask = &hmp->cpus;
+ }
+
+ /* The currently running task is not on the runqueue */
+ se = __pick_first_entity(cfs_rq_of(se));
+
+ while(num_tasks && se) {
+ if (entity_is_task(se)) {
+ if(se->avg.hmp_load_avg > max_ratio &&
+ (hmp_target_mask &&
+ cpumask_intersects(hmp_target_mask,
+ tsk_cpus_allowed(task_of(se))))) {
+ max_se = se;
+ max_ratio = se->avg.hmp_load_avg;
+ }
+ }
+ se = __pick_next_entity(se);
+ num_tasks--;
+ }
+ return max_se;
+}
+
+static struct sched_entity *hmp_get_lightest_task(struct sched_entity* se, int migrate_down)
+{
+ int num_tasks = hmp_max_tasks;
+ struct sched_entity *min_se = se;
+ unsigned long int min_ratio = ULONG_MAX;
+ const struct cpumask *hmp_target_mask = NULL;
+
+ if (migrate_down) {
+ struct hmp_domain *hmp;
+ if(hmp_cpu_is_slowest(cpu_of(se->cfs_rq->rq)))
+ return min_se;
+
+ hmp = hmp_slower_domain(cpu_of(se->cfs_rq->rq));
+ hmp_target_mask = &hmp->cpus;
+ }
+
+ /* The currently running task is not on the runqueue */
+ se = __pick_first_entity(cfs_rq_of(se));
+
+ while(num_tasks && se) {
+ if (entity_is_task(se)) {
+ if(se->avg.hmp_load_avg < min_ratio &&
+ (hmp_target_mask &&
+ cpumask_intersects(hmp_target_mask,
+ tsk_cpus_allowed(task_of(se))))) {
+ min_se = se;
+ min_ratio = se->avg.hmp_load_avg;
+ }
+ }
+ se = __pick_next_entity(se);
+ num_tasks--;
+ }
+ return min_se;
+}
+
+/*
+ * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
+ * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
+ * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
+ */
+static int hmp_boostpulse_duration = 1000000; /* microseconds */
+static u64 hmp_boostpulse_endtime;
+static int hmp_boost_val;
+static int hmp_family_boost_val;
+static int hmp_semiboost_val;
+static int hmp_boostpulse;
+static int hmp_active_down_migration;
+static int hmp_aggressive_up_migration;
+static int hmp_aggressive_yield;
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+static int hmp_selective_boost_val;
+#endif
+static DEFINE_RAW_SPINLOCK(hmp_boost_lock);
+static DEFINE_RAW_SPINLOCK(hmp_family_boost_lock);
+static DEFINE_RAW_SPINLOCK(hmp_semiboost_lock);
+static DEFINE_RAW_SPINLOCK(hmp_sysfs_lock);
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+static DEFINE_RAW_SPINLOCK(hmp_selective_boost_lock);
+#endif
+
+#define BOOT_BOOST_DURATION 40000000 /* microseconds */
+#define YIELD_CORRECTION_TIME 10000000 /* nanoseconds */
+
+unsigned int hmp_next_up_threshold = 4096;
+unsigned int hmp_next_down_threshold = 4096;
+
+static inline int hmp_boost(void)
+{
+ u64 now = ktime_to_us(ktime_get());
+ int ret;
+
+ if (hmp_boost_val || now < hmp_boostpulse_endtime)
+ ret = 1;
+ else
+ ret = 0;
+
+ return ret;
+}
+
+static inline int hmp_family_boost(void)
+{
+ if (hmp_family_boost_val)
+ return 1;
+ return 0;
+}
+
+static inline int hmp_semiboost(void)
+{
+ if (hmp_semiboost_val)
+ return 1;
+ return 0;
+}
+
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+static inline int hmp_selective_boost(void)
+{
+ if (hmp_selective_boost_val)
+ return 1;
+ return 0;
+}
+#endif
+
+static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
+static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
+ int *min_cpu, struct cpumask *affinity);
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+static int hmp_selective_migration(int prev_cpu, struct sched_entity *se);
+#endif
+
+/* Check if cpu is in fastest hmp_domain */
+static inline unsigned int hmp_cpu_is_fastest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return pos == hmp_domains.next;
+}
+
+/* Check if cpu is in slowest hmp_domain */
+static inline unsigned int hmp_cpu_is_slowest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_is_last(pos, &hmp_domains);
+}
+
+/* Next (slower) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_slower_domain(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_entry(pos->next, struct hmp_domain, hmp_domains);
+}
+
+/* Previous (faster) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_faster_domain(int cpu)
+{
+ struct list_head *pos;
+
+ if (hmp_cpu_is_fastest(cpu))
+ return hmp_cpu_domain(cpu);
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+
+ return list_entry(pos->prev, struct hmp_domain, hmp_domains);
+}
+
+/*
+ * Selects a cpu in previous (faster) hmp_domain
+ */
+static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
+ int cpu, int *lowest_ratio)
+{
+ int lowest_cpu = NR_CPUS;
+ struct hmp_domain *hmp;
+ if (hmp_cpu_is_fastest(cpu))
+ hmp = hmp_cpu_domain(cpu);
+ else
+ hmp = hmp_faster_domain(cpu);
+
+ *lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
+ tsk_cpus_allowed(tsk));
+
+ return lowest_cpu;
+}
+
+/*
+ * Selects a cpu in the next (slower) hmp_domain
+ */
+static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
+ int cpu)
+{
+ int lowest_cpu=NR_CPUS;
+ struct hmp_domain *hmp;
+ __always_unused int lowest_ratio;
+
+ if (hmp_cpu_is_slowest(cpu))
+ hmp = hmp_cpu_domain(cpu);
+ else
+ hmp = hmp_slower_domain(cpu);
+
+ lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
+ tsk_cpus_allowed(tsk));
+
+ return lowest_cpu;
+}
+
+#if defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) && defined(CONFIG_SCHED_HMP)
+/* Load of migrated task is fitted into target cluster */
+static inline void hmp_load_compensation(struct sched_entity *se, u64 compst_ratio)
+{
+ se->avg.load_avg = se->avg.load_avg * compst_ratio >> SCHED_CAPACITY_SHIFT;
+ se->avg.load_sum = se->avg.load_sum * compst_ratio >> SCHED_CAPACITY_SHIFT;
+ se->avg.util_avg = se->avg.util_avg * compst_ratio >> SCHED_CAPACITY_SHIFT;
+ se->avg.util_sum = se->avg.util_sum * compst_ratio >> SCHED_CAPACITY_SHIFT;
+ se->avg.hmp_load_avg = se->avg.hmp_load_avg * compst_ratio >> SCHED_CAPACITY_SHIFT;
+ se->avg.hmp_load_sum = se->avg.hmp_load_sum * compst_ratio >> SCHED_CAPACITY_SHIFT;
+
+ trace_sched_hmp_migration_compensation(se->avg.load_avg,
+ se->avg.util_avg, se->avg.hmp_load_avg);
+}
+
+/* Compensate the load of a migrated task when up/down-migration happens */
+static inline void hmp_load_migration(struct sched_entity * se,
+ int src, int dst)
+{
+ u64 compst_ratio = 0;
+
+ /* Check whether dst cpu belongs to the same cluster as src */
+ if (cpumask_test_cpu(dst, cpu_coregroup_mask(src)))
+ return;
+
+ compst_ratio = hmp_cpu_is_fastest(dst) ? hmp_up_compst_ratio
+ : hmp_down_compst_ratio;
+
+ hmp_load_compensation(se, compst_ratio);
+}
+#endif
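A sketch of the shift-based compensation applied by hmp_load_compensation(), using the ratios set earlier (512 halves a signal on up-migration, 2048 doubles it on down-migration, per the "256 = /4, 512 = /2, ..." table):

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT 10

    /* load * ratio / 1024 */
    static unsigned long compensate(unsigned long load, unsigned long ratio)
    {
        return (load * ratio) >> SCHED_CAPACITY_SHIFT;
    }

    int main(void)
    {
        unsigned long load_avg = 800;

        printf("up   (x0.5): %lu\n", compensate(load_avg, 512));   /* 400 */
        printf("down (x2.0): %lu\n", compensate(load_avg, 2048));  /* 1600 */
        return 0;
    }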
+
+static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
+{
+ /* hack - always use clock from first online CPU */
+ u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+ se->avg.hmp_last_up_migration = now;
+ se->avg.hmp_last_down_migration = 0;
+ cpu_rq(cpu)->hmp_last_up_migration = now;
+ cpu_rq(cpu)->hmp_last_down_migration = 0;
+}
+
+static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
+{
+ /* hack - always use clock from first online CPU */
+ u64 now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+ se->avg.hmp_last_down_migration = now;
+ se->avg.hmp_last_up_migration = 0;
+ cpu_rq(cpu)->hmp_last_up_migration = now;
+ cpu_rq(cpu)->hmp_last_down_migration = 0;
+}
+
+#ifdef CONFIG_HMP_VARIABLE_SCALE
+/*
+ * Heterogeneous multiprocessor (HMP) optimizations
+ *
+ * These functions allow changing the growth rate of the hmp_load_ratio:
+ * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms.
+ * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
+ *
+ * These functions also allow changing the up and down thresholds of HMP
+ * using /sys/kernel/hmp/{up,down}_threshold.
+ * Both must be between 0 and 1023. The threshold that is compared
+ * to the hmp_load_ratio is up_threshold/1024 and down_threshold/1024.
+ *
+ * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
+ * task with a load of 0 will reach the threshold after 64ms of busy looping.
+ *
+ * Changing load_avg_period_ms has the same effect as changing the
+ * default scaling factor Y=1002/1024 in the hmp_load_ratio computation to
+ * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the latter
+ * could trigger overflows.
+ * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
+ * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
+ * could overflow for a weight > 2^12, even though the load_avg_contrib
+ * should still be a 32-bit result. This would not happen when multiplying
+ * the delta time by 1/22 and setting load_avg_period_ms = 706.
+ */
+
+
+/*
+ * Scaling the delta time ends up increasing or decreasing the
+ * growth rate of the per-entity hmp_load_ratio.
+ * The scale factor hmp_data.multiplier is a fixed-point
+ * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
+ */
+static u64 hmp_variable_scale_convert(u64 delta)
+{
+ u64 high = delta >> 32ULL;
+ u64 low = delta & 0xffffffffULL;
+
+ if (hmp_semiboost()) {
+ low *= hmp_data.semiboost_multiplier;
+ high *= hmp_data.semiboost_multiplier;
+ } else {
+ low *= hmp_data.multiplier;
+ high *= hmp_data.multiplier;
+ }
+ return (low >> HMP_VARIABLE_SCALE_SHIFT)
+ + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
+}
+
+static u64 hmp_rq_variable_scale_convert(u64 delta)
+{
+ u64 high = delta >> 32ULL;
+ u64 low = delta & 0xffffffffULL;
+
+ low *= hmp_data.rq_multiplier;
+ high *= hmp_data.rq_multiplier;
+
+ return (low >> HMP_VARIABLE_SCALE_SHIFT)
+ + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
+}
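The two convert helpers split delta into 32-bit halves so the multiply cannot overflow 64 bits before the shift. A userspace check that this equals a plain (delta * multiplier) >> SHIFT; HMP_VARIABLE_SCALE_SHIFT is not visible in this hunk, so a value of 16 is assumed here:

    #include <stdio.h>
    #include <stdint.h>

    #define HMP_VARIABLE_SCALE_SHIFT 16ULL

    static uint64_t split_convert(uint64_t delta, uint64_t mult)
    {
        uint64_t high = delta >> 32;
        uint64_t low = delta & 0xffffffffULL;

        low *= mult;
        high *= mult;
        return (low >> HMP_VARIABLE_SCALE_SHIFT) +
               (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
    }

    int main(void)
    {
        uint64_t delta = 123456789012345ULL;            /* ns-scale delta */
        uint64_t mult = 3 << HMP_VARIABLE_SCALE_SHIFT;  /* scale x3 */

        /* reference uses a gcc/clang __int128 so the product cannot overflow */
        unsigned __int128 ref = (unsigned __int128)delta * mult;

        printf("split=%llu ref=%llu\n",
               (unsigned long long)split_convert(delta, mult),
               (unsigned long long)(uint64_t)(ref >> HMP_VARIABLE_SCALE_SHIFT));
        return 0;
    }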
+
+static ssize_t hmp_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ ssize_t ret = 0;
+ struct hmp_global_attr *hmp_attr =
+ container_of(attr, struct hmp_global_attr, attr);
+ int temp = *(hmp_attr->value);
+ if (hmp_attr->to_sysfs != NULL)
+ temp = hmp_attr->to_sysfs(temp);
+ ret = sprintf(buf, "%d\n", temp);
+ return ret;
+}
+
+static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
+ const char *buf, size_t count)
+{
+ int temp;
+ ssize_t ret = count;
+ struct hmp_global_attr *hmp_attr =
+ container_of(attr, struct hmp_global_attr, attr);
+ char *str = vmalloc(count + 1);
+ if (str == NULL)
+ return -ENOMEM;
+ memcpy(str, buf, count);
+ str[count] = 0;
+ if (sscanf(str, "%d", &temp) < 1)
+ ret = -EINVAL;
+ else {
+ if (hmp_attr->from_sysfs != NULL) {
+ temp = hmp_attr->from_sysfs(temp);
+ if (temp < 0)
+ ret = temp;
+ } else {
+ *(hmp_attr->value) = temp;
+ }
+ }
+ vfree(str);
+ return ret;
+}
+
+static int hmp_period_to_sysfs(int value)
+{
+ return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
+}
+
+static int hmp_period_from_sysfs(int value)
+{
+ hmp_data.multiplier = (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
+ return 0;
+}
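A sketch of the period-to-multiplier mapping above (LOAD_AVG_PERIOD = 32): 32 ms leaves deltas unscaled, 16 ms doubles them, 64 ms halves them. As in the earlier conversion sketch, HMP_VARIABLE_SCALE_SHIFT of 16 is assumed since its definition is not shown in this hunk.

    #include <stdio.h>

    #define LOAD_AVG_PERIOD 32
    #define HMP_VARIABLE_SCALE_SHIFT 16

    int main(void)
    {
        int periods[] = { 16, 32, 64 };

        for (int i = 0; i < 3; i++) {
            /* same formula as hmp_period_from_sysfs() */
            int mult = (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / periods[i];

            printf("period=%2d ms -> multiplier=%d (x%.2f)\n",
                   periods[i], mult,
                   (double)mult / (1 << HMP_VARIABLE_SCALE_SHIFT));
        }
        return 0;
    }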
+
+static int hmp_semiboost_period_from_sysfs(int value)
+{
+ hmp_data.semiboost_multiplier = (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
+ return 0;
+}
+
+/* max value for threshold is 1024 */
+static int hmp_up_threshold_from_sysfs(int value)
+{
+ if ((value > 1024) || (value < 0))
+ return -EINVAL;
+
+ hmp_up_threshold = value;
+
+ return 0;
+}
+
+static int hmp_semiboost_up_threshold_from_sysfs(int value)
+{
+ if ((value > 1024) || (value < 0))
+ return -EINVAL;
+
+ hmp_semiboost_up_threshold = value;
+
+ return 0;
+}
+
+static int hmp_down_threshold_from_sysfs(int value)
+{
+ unsigned long flags;
+ int ret = 0;
+ raw_spin_lock_irqsave(&hmp_sysfs_lock, flags);
+
+ if ((value > 1024) || (value < 0)) {
+ ret = -EINVAL;
+ } else {
+ hmp_down_threshold = value;
+#ifdef CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING
+ hmp_tbsoftlanding.threshold = hmp_down_threshold / 2;
+ hmp_tbsoftlanding_update_thr();
+#endif
+ }
+
+ raw_spin_unlock_irqrestore(&hmp_sysfs_lock, flags);
+
+ return ret;
+}
+
+static int hmp_semiboost_down_threshold_from_sysfs(int value)
+{
+ if ((value > 1024) || (value < 0))
+ return -EINVAL;
+
+ hmp_semiboost_down_threshold = value;
+
+ return 0;
+}
+
+static int hmp_boostpulse_from_sysfs(int value)
+{
+ unsigned long flags;
+ u64 boostpulse_endtime = ktime_to_us(ktime_get()) + hmp_boostpulse_duration;
+
+ raw_spin_lock_irqsave(&hmp_boost_lock, flags);
+ if (boostpulse_endtime > hmp_boostpulse_endtime)
+ hmp_boostpulse_endtime = boostpulse_endtime;
+ raw_spin_unlock_irqrestore(&hmp_boost_lock, flags);
+
+ return 0;
+}
+
+static int hmp_boostpulse_duration_from_sysfs(int duration)
+{
+ if (duration < 0)
+ return -EINVAL;
+
+ hmp_boostpulse_duration = duration;
+
+ return 0;
+}
+
+static int hmp_boost_from_sysfs(int value)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&hmp_boost_lock, flags);
+ if (value == 1)
+ hmp_boost_val++;
+ else if (value == 0)
+ if (hmp_boost_val >= 1)
+ hmp_boost_val--;
+ else
+ ret = -EINVAL;
+ else
+ ret = -EINVAL;
+ raw_spin_unlock_irqrestore(&hmp_boost_lock, flags);
+
+ return ret;
+}
+
+static int hmp_family_boost_from_sysfs(int value)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&hmp_family_boost_lock, flags);
+ if (value == 1 || value == 0)
+ hmp_family_boost_val = value;
+ else
+ ret = -EINVAL;
+ raw_spin_unlock_irqrestore(&hmp_family_boost_lock, flags);
+
+ return ret;
+}
+
+static int hmp_semiboost_from_sysfs(int value)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&hmp_semiboost_lock, flags);
+ if (value == 1)
+ hmp_semiboost_val++;
+ else if (value == 0)
+ if (hmp_semiboost_val >= 1)
+ hmp_semiboost_val--;
+ else
+ ret = -EINVAL;
+ else
+ ret = -EINVAL;
+ raw_spin_unlock_irqrestore(&hmp_semiboost_lock, flags);
+
+ return ret;
+}
+
+static int hmp_active_dm_from_sysfs(int value)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&hmp_sysfs_lock, flags);
+ if (value == 1)
+ hmp_active_down_migration++;
+ else if (value == 0)
+ if (hmp_active_down_migration >= 1)
+ hmp_active_down_migration--;
+ else
+ ret = -EINVAL;
+ else
+ ret = -EINVAL;
+ raw_spin_unlock_irqrestore(&hmp_sysfs_lock, flags);
+
+ return ret;
+}
+
+static int hmp_aggressive_up_migration_from_sysfs(int value)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&hmp_sysfs_lock, flags);
+ if (value == 1)
+ hmp_aggressive_up_migration++;
+ else if (value == 0)
+ if (hmp_aggressive_up_migration >= 1)
+ hmp_aggressive_up_migration--;
+ else
+ ret = -EINVAL;
+ else
+ ret = -EINVAL;
+ raw_spin_unlock_irqrestore(&hmp_sysfs_lock, flags);
+
+ return ret;
+}
+
+static int hmp_aggressive_yield_from_sysfs(int value)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&hmp_sysfs_lock, flags);
+ if (value == 1)
+ hmp_aggressive_yield++;
+ else if (value == 0)
+ if (hmp_aggressive_yield >= 1)
+ hmp_aggressive_yield--;
+ else
+ ret = -EINVAL;
+ else
+ ret = -EINVAL;
+ raw_spin_unlock_irqrestore(&hmp_sysfs_lock, flags);
+
+ return ret;
+}
+
+#ifdef CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING
+static int hmp_tbsoftlanding_enabled_sysfs(int value)
+{
+ int lv;
+
+ if (value > 0) {
+ hmp_tbsoftlanding.enabled = 1;
+ pr_info("hmp_tbsoftlanding is enabled\n");
+ } else {
+ hmp_tbsoftlanding.enabled = 0;
+ for (lv = 0; lv < TBSL_LV_END; lv++)
+ pm_qos_update_request(&hmp_tbsoftlanding.data[lv].pm_qos, 0);
+
+ pr_info("hmp_tbsoftlanding is disabled\n");
+ }
+
+ return 0;
+}
+
+static int hmp_tbsoftlanding_timeout_sysfs(int value)
+{
+ int lv;
+
+ if (value > 0) {
+ hmp_tbsoftlanding.timeout = value;
+ } else {
+ hmp_tbsoftlanding.timeout = 0;
+ hmp_tbsoftlanding.enabled = 0;
+
+ for (lv = 0; lv < TBSL_LV_END; lv++)
+ pm_qos_update_request(&hmp_tbsoftlanding.data[lv].pm_qos, 0);
+
+ pr_info("hmp_tbsoftlanding is disabled\n");
+ }
+
+ return 0;
+}
+
+static int hmp_tbsoftlanding_high_freq_sysfs(int value)
+{
+ int ret = 0;
+
+ if (value >= 0) {
+ hmp_tbsoftlanding.data[TBSL_LV_HIGH].freq = value;
+ } else {
+ pr_info("enter invalid value\n");
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int hmp_tbsoftlanding_mid_freq_sysfs(int value)
+{
+ int ret = 0;
+
+ if (value >= 0) {
+ hmp_tbsoftlanding.data[TBSL_LV_MID].freq = value;
+ } else {
+ pr_info("enter invalid value\n");
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int hmp_tbsoftlanding_low_freq_sysfs(int value)
+{
+ int ret = 0;
+
+ if (value >= 0) {
+ hmp_tbsoftlanding.data[TBSL_LV_LOW].freq = value;
+ } else {
+ pr_info("enter invalid value\n");
+ ret = -EINVAL;
+ }
+
+ return ret;
+}
+
+static int hmp_tbsoftlanding_threshold_sysfs(int value)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&hmp_sysfs_lock, flags);
+
+ if (value > 0 && value < 1024 && value < hmp_down_threshold) {
+ hmp_tbsoftlanding.threshold = value;
+ hmp_tbsoftlanding_update_thr();
+ } else {
+ pr_info("enter invalid value\n");
+ ret = -EINVAL;
+ }
+
+ raw_spin_unlock_irqrestore(&hmp_sysfs_lock, flags);
+
+ return ret;
+}
+#endif
+
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+static int hmp_selective_boost_from_sysfs(int value)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ raw_spin_lock_irqsave(&hmp_selective_boost_lock, flags);
+ if (value == 1)
+ hmp_selective_boost_val++;
+ else if (value == 0)
+ if (hmp_selective_boost_val >= 1)
+ hmp_selective_boost_val--;
+ else
+ ret = -EINVAL;
+ else
+ ret = -EINVAL;
+ raw_spin_unlock_irqrestore(&hmp_selective_boost_lock, flags);
+
+
+ return ret;
+}
+#endif
+
+int set_hmp_boost(int enable)
+{
+ return hmp_boost_from_sysfs(enable);
+}
+
+int set_hmp_family_boost(int enable)
+{
+ return hmp_family_boost_from_sysfs(enable);
+}
+
+int set_hmp_semiboost(int enable)
+{
+ return hmp_semiboost_from_sysfs(enable);
+}
+
+int set_hmp_boostpulse(int duration)
+{
+ unsigned long flags;
+ u64 boostpulse_endtime;
+
+ if (duration < 0)
+ return -EINVAL;
+
+ boostpulse_endtime = ktime_to_us(ktime_get()) + duration;
+
+ raw_spin_lock_irqsave(&hmp_boost_lock, flags);
+ if (boostpulse_endtime > hmp_boostpulse_endtime)
+ hmp_boostpulse_endtime = boostpulse_endtime;
+ raw_spin_unlock_irqrestore(&hmp_boost_lock, flags);
+
+ return 0;
+}
+
+int set_active_down_migration(int enable)
+{
+ return hmp_active_dm_from_sysfs(enable);
+}
+
+int set_hmp_aggressive_up_migration(int enable)
+{
+ return hmp_aggressive_up_migration_from_sysfs(enable);
+}
+
+int set_hmp_aggressive_yield(int enable)
+{
+ return hmp_aggressive_yield_from_sysfs(enable);
+}
+
+int get_hmp_boost(void)
+{
+ return hmp_boost();
+}
+
+int get_hmp_semiboost(void)
+{
+ return hmp_semiboost();
+}
+
+int set_hmp_up_threshold(int value)
+{
+ return hmp_up_threshold_from_sysfs(value);
+}
+
+int set_hmp_down_threshold(int value)
+{
+ return hmp_down_threshold_from_sysfs(value);
+}
+/* packing enable must be 0 or 1 */
+static int hmp_packing_enable_from_sysfs(int value)
+{
+ if (value < 0 || value > 1)
+ return -1;
+
+ hmp_packing_enabled = value;
+ return value;
+}
+
+static int hmp_packing_threshold_from_sysfs(int value)
+{
+ if (value < 0 || value > 1023)
+ return -1;
+
+ hmp_packing_threshold = value;
+ return value;
+}
+
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+int set_hmp_selective_boost(int enable)
+{
+ return hmp_selective_boost_from_sysfs(enable);
+}
+
+int get_hmp_selective_boost(void)
+{
+ return hmp_selective_boost();
+}
+#endif
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+/* freqinvar control is only 0,1 off/on */
+static int hmp_freqinvar_from_sysfs(int value)
+{
+ if (value < 0 || value > 1)
+ return -1;
+ return value;
+}
+#endif
+static void hmp_attr_add(
+ const char *name,
+ int *value,
+ int (*to_sysfs)(int),
+ int (*from_sysfs)(int))
+{
+ int i = 0;
+ while (hmp_data.attributes[i] != NULL) {
+ i++;
+ if (i >= HMP_DATA_SYSFS_MAX)
+ return;
+ }
+ hmp_data.attr[i].attr.mode = 0644;
+ hmp_data.attr[i].show = hmp_show;
+ hmp_data.attr[i].store = hmp_store;
+ hmp_data.attr[i].attr.name = name;
+ hmp_data.attr[i].value = value;
+ hmp_data.attr[i].to_sysfs = to_sysfs;
+ hmp_data.attr[i].from_sysfs = from_sysfs;
+ hmp_data.attributes[i] = &hmp_data.attr[i].attr;
+ hmp_data.attributes[i + 1] = NULL;
+}
+
+static int hmp_attr_init(void)
+{
+ int ret;
+
+ hmp_attr_add("load_avg_period_ms",
+ &hmp_data.multiplier,
+ hmp_period_to_sysfs,
+ hmp_period_from_sysfs);
+ hmp_attr_add("up_threshold",
+ &hmp_up_threshold,
+ NULL,
+ hmp_up_threshold_from_sysfs);
+ hmp_attr_add("down_threshold",
+ &hmp_down_threshold,
+ NULL,
+ hmp_down_threshold_from_sysfs);
+
+ hmp_attr_add("sb_load_avg_period_ms",
+ &hmp_data.semiboost_multiplier,
+ hmp_period_to_sysfs,
+ hmp_semiboost_period_from_sysfs);
+ hmp_attr_add("sb_up_threshold",
+ &hmp_semiboost_up_threshold,
+ NULL,
+ hmp_semiboost_up_threshold_from_sysfs);
+ hmp_attr_add("sb_down_threshold",
+ &hmp_semiboost_down_threshold,
+ NULL,
+ hmp_semiboost_down_threshold_from_sysfs);
+ hmp_attr_add("semiboost",
+ &hmp_semiboost_val,
+ NULL,
+ hmp_semiboost_from_sysfs);
+
+ hmp_attr_add("boostpulse",
+ &hmp_boostpulse,
+ NULL,
+ hmp_boostpulse_from_sysfs);
+ hmp_attr_add("boostpulse_duration",
+ &hmp_boostpulse_duration,
+ NULL,
+ hmp_boostpulse_duration_from_sysfs);
+ hmp_attr_add("boost",
+ &hmp_boost_val,
+ NULL,
+ hmp_boost_from_sysfs);
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+ hmp_attr_add("selective_boost",
+ &hmp_selective_boost_val,
+ NULL,
+ hmp_selective_boost_from_sysfs);
+#endif
+ hmp_attr_add("family_boost",
+ &hmp_family_boost_val,
+ NULL,
+ hmp_family_boost_from_sysfs);
+
+ hmp_attr_add("active_down_migration",
+ &hmp_active_down_migration,
+ NULL,
+ hmp_active_dm_from_sysfs);
+
+ hmp_attr_add("aggressive_up_migration",
+ &hmp_aggressive_up_migration,
+ NULL,
+ hmp_aggressive_up_migration_from_sysfs);
+
+ hmp_attr_add("aggressive_yield",
+ &hmp_aggressive_yield,
+ NULL,
+ hmp_aggressive_yield_from_sysfs);
+
+#ifdef CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING
+ hmp_attr_add("down_compensation_enabled",
+ &hmp_tbsoftlanding.enabled,
+ NULL,
+ hmp_tbsoftlanding_enabled_sysfs);
+
+ hmp_attr_add("down_compensation_timeout",
+ &hmp_tbsoftlanding.timeout,
+ NULL,
+ hmp_tbsoftlanding_timeout_sysfs);
+
+ hmp_attr_add("down_compensation_high_freq",
+ &hmp_tbsoftlanding.data[TBSL_LV_HIGH].freq,
+ NULL,
+ hmp_tbsoftlanding_high_freq_sysfs);
+
+ hmp_attr_add("down_compensation_mid_freq",
+ &hmp_tbsoftlanding.data[TBSL_LV_MID].freq,
+ NULL,
+ hmp_tbsoftlanding_mid_freq_sysfs);
+
+ hmp_attr_add("down_compensation_low_freq",
+ &hmp_tbsoftlanding.data[TBSL_LV_LOW].freq,
+ NULL,
+ hmp_tbsoftlanding_low_freq_sysfs);
+
+ hmp_attr_add("down_compensation_threshold",
+ &hmp_tbsoftlanding.threshold,
+ NULL,
+ hmp_tbsoftlanding_threshold_sysfs);
+#endif
+
+ hmp_attr_add("packing_enable",
+ &hmp_packing_enabled,
+ NULL,
+ hmp_packing_enable_from_sysfs);
+ hmp_attr_add("packing_threshold",
+ &hmp_packing_threshold,
+ NULL,
+ hmp_packing_threshold_from_sysfs);
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ /* default frequency-invariant scaling ON */
+ hmp_data.freqinvar_load_scale_enabled = 1;
+ hmp_attr_add("frequency_invariant_load_scale",
+ &hmp_data.freqinvar_load_scale_enabled,
+ NULL,
+ hmp_freqinvar_from_sysfs);
+#endif
+ hmp_data.attr_group.name = "hmp";
+ hmp_data.attr_group.attrs = hmp_data.attributes;
+ ret = sysfs_create_group(kernel_kobj,
+ &hmp_data.attr_group);
+ return ret;
+}
+late_initcall(hmp_attr_init);
+#endif /* CONFIG_HMP_VARIABLE_SCALE */
+
+
+/*
+ * return the load of the lowest-loaded CPU in a given HMP domain
+ * min_cpu optionally points to an int to receive the CPU.
+ * affinity optionally points to a cpumask containing the
+ * CPUs to be considered. note:
+ * + min_cpu = NR_CPUS only if no CPUs are in the set of
+ * affinity && hmp_domain cpus
+ * + min_cpu will always otherwise equal one of the CPUs in
+ * the hmp domain
+ * + when more than one CPU has the same load, the one which
+ * is least-recently-disturbed by an HMP migration will be
+ * selected
+ * + if all CPUs are equally loaded or idle and the times are
+ * all the same, the first in the set will be used
+ * + if affinity is not set, cpu_online_mask is used
+ */
+static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
+ int *min_cpu, struct cpumask *affinity)
+{
+ unsigned long load, min_load = ULONG_MAX;
+ unsigned int min_exit_latency = UINT_MAX;
+ int least_loaded_cpu = NR_CPUS;
+ int shallowest_idle_cpu = -1;
+ int i;
+ struct cpumask temp_cpumask;
+ u64 curr_last_migration;
+ u64 min_target_last_migration = ULLONG_MAX;
+
+ /*
+ * only look at CPUs allowed if specified,
+ * always consider online CPUs in the right HMP domain
+ */
+ cpumask_and(&temp_cpumask, &hmpd->cpus, cpu_online_mask);
+ if (affinity)
+ cpumask_and(&temp_cpumask, &temp_cpumask, affinity);
+
+
+ /* Traverse only the allowed CPUs */
+ rcu_read_lock();
+ for_each_cpu(i, &temp_cpumask) {
+ struct rq *rq = cpu_rq(i);
+ if (idle_cpu(i)) {
+ struct cpuidle_state *idle = idle_get_state(rq);
+ if (idle && idle->exit_latency < min_exit_latency) {
+ /*
+ * We give priority to a CPU whose idle state
+ * has the smallest exit latency irrespective
+ * of any idle timestamp.
+ */
+ min_exit_latency = idle->exit_latency;
+ shallowest_idle_cpu = i;
+ min_load = 0;
+ } else if (!idle || idle->exit_latency == min_exit_latency) {
+ /*
+ * With an equal or no active idle state, the CPU whose
+ * last HMP migration is oldest wins. This spreads
+ * migration load more evenly between members of a
+ * domain when the domain is fully loaded.
+ */
+ curr_last_migration = rq->hmp_last_down_migration ?
+ rq->hmp_last_down_migration : rq->hmp_last_up_migration;
+
+ if ((curr_last_migration < min_target_last_migration) ||
+ (shallowest_idle_cpu != -1)) {
+
+ shallowest_idle_cpu = i;
+ min_load = 0;
+ }
+ }
+ } else if (shallowest_idle_cpu == -1) {
+ load = weighted_cpuload(i);
+ curr_last_migration = rq->hmp_last_down_migration ?
+ rq->hmp_last_down_migration : rq->hmp_last_up_migration;
+
+ if ((load < min_load) ||
+ ((load == min_load) && (curr_last_migration < min_target_last_migration))) {
+ min_load = load;
+ least_loaded_cpu = i;
+ }
+ }
+ }
+ rcu_read_unlock();
+
+ if(min_cpu) {
+ if(shallowest_idle_cpu != -1)
+ *min_cpu = shallowest_idle_cpu;
+ else
+ *min_cpu = least_loaded_cpu;
+ }
+
+ return min_load;
+}
+
+#define RQ_IDLE_PERIOD (NSEC_PER_SEC / HZ)
+
+static unsigned long __maybe_unused hmp_domain_sum_load(struct hmp_domain *hmpd)
+{
+ int cpu;
+ unsigned long sum = 0;
+ unsigned long load;
+
+ for_each_cpu(cpu, &hmpd->cpus) {
+ load = cpu_rq(cpu)->cfs.sysload_avg_ratio;
+ sum += load;
+ }
+
+ return sum;
+}
+
+static inline unsigned int hmp_domain_nr_running(struct hmp_domain *hmpd)
+{
+ int cpu;
+ unsigned int nr = 0;
+
+ for_each_cpu(cpu, &hmpd->cpus) {
+ nr += cpu_rq(cpu)->cfs.h_nr_running;
+ }
+
+ return nr;
+}
+
+/*
+ * Calculate the task starvation
+ * This is the ratio of actually running time vs. runnable time.
+ * If the two are equal the task is getting the cpu time it needs or
+ * it is alone on the cpu and the cpu is fully utilized.
+ */
+/*
+ * If cpu capacity were realized appropriately for the arch,
+ * hmp_load_sum would have to be multiplied by cpu_capacity.
+ * Currently, util_sum is the decayed accumulation of running time
+ * multiplied by SCHED_CAPACITY_SCALE (same as NICE_0_LOAD).
+ */
+static inline unsigned int hmp_task_starvation(struct sched_entity *se)
+{
+ u32 starvation;
+
+ starvation = se->avg.util_sum;
+ starvation /= (se->avg.load_sum + 1);
+
+ return scale_load(starvation);
+}
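A sketch of hmp_task_starvation(): util_sum already carries the SCHED_CAPACITY_SCALE (1024) factor, so util_sum / (load_sum + 1) lands on a 0..1024 scale. scale_load() is treated as identity below, which is the usual case but an assumption for this configuration.

    #include <stdio.h>

    /* fraction of runnable time actually spent running, on a 0..1024 scale */
    static unsigned int starvation(unsigned int util_sum, unsigned int load_sum)
    {
        return util_sum / (load_sum + 1);
    }

    int main(void)
    {
        /* ran 9 of 10 runnable units: ~837 > 768, i.e. <25% waiting,
         * so hmp_offload_down() would not treat the task as starving */
        printf("%u\n", starvation(9 * 1024, 10));
        /* ran 5 of 10 runnable units: ~465, the task waits a lot */
        printf("%u\n", starvation(5 * 1024, 10));
        return 0;
    }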
+
+static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
+{
+ int min_usage;
+ int dest_cpu = NR_CPUS;
+
+ if (hmp_cpu_is_slowest(cpu) || hmp_aggressive_up_migration)
+ return NR_CPUS;
+
+ /* Is there an idle CPU in the current domain */
+ min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL);
+ if (min_usage == 0){
+ trace_sched_hmp_offload_abort(cpu,min_usage,"load");
+ return NR_CPUS;
+ }
+
+ /* Is the task alone on the cpu? */
+ if (cpu_rq(cpu)->cfs.h_nr_running < 2) {
+ trace_sched_hmp_offload_abort(cpu,cpu_rq(cpu)->cfs.h_nr_running,"nr_running");
+ return NR_CPUS;
+ }
+
+ /* Is the task actually starving? */
+ if (hmp_task_starvation(se) > 768) /* <25% waiting */ {
+ trace_sched_hmp_offload_abort(cpu,hmp_task_starvation(se),"starvation");
+ return NR_CPUS;
+ }
+
+ /* Does the slower domain have any idle CPUs? */
+ min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu,
+ tsk_cpus_allowed(task_of(se)));
+
+ if (min_usage == 0){
+ trace_sched_hmp_offload_succeed(cpu, dest_cpu);
+ return dest_cpu;
+ } else {
+ trace_sched_hmp_offload_abort(cpu, min_usage, "slowdomain");
+ }
+ return NR_CPUS;
+}
+
+static int hmp_is_family_in_fastest_domain(struct task_struct *p)
+{
+ struct task_struct *thread_p;
+
+ list_for_each_entry(thread_p, &p->thread_group, thread_group) {
+ struct sched_entity *thread_se = &thread_p->se;
+ if (thread_se->avg.hmp_load_avg >= hmp_down_threshold &&
+ hmp_cpu_is_fastest(task_cpu(thread_p))) {
+ return thread_p->pid;
+ }
+ }
+ return 0;
+}
+
+static inline unsigned int hmp_best_little_cpu(struct task_struct *tsk,
+ int cpu) {
+ int lowest_cpu;
+ int lowest_ratio;
+ struct hmp_domain *hmp;
+ struct cpumask allowed_hmp_cpus;
+
+ if (hmp_cpu_is_slowest(cpu))
+ hmp = hmp_cpu_domain(cpu);
+ else
+ hmp = hmp_slower_domain(cpu);
+
+#ifdef CONFIG_SCHED_SKIP_CORE_SELECTION_MASK
+ cpumask_xor(&allowed_hmp_cpus, &hmp->cpus,
+ &hmp->cpumask_skip);
+#else
+ cpumask_copy(&allowed_hmp_cpus, &hmp->cpus);
+#endif
+ cpumask_and(&allowed_hmp_cpus, &allowed_hmp_cpus,
+ tsk_cpus_allowed(tsk));
+
+ lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
+ &allowed_hmp_cpus);
+ if (lowest_ratio < hmp_packing_threshold)
+ return lowest_cpu;
+
+ return hmp_select_slower_cpu(tsk, cpu);
+}
+#else
+int set_hmp_boost(int enable)
+{
+ pr_err("It doesn't support the HMP boost\n");
+
+ return 0;
+}
+#endif /* CONFIG_SCHED_HMP */
+
+#ifndef CONFIG_SCHED_USE_FLUID_RT
/*
* cpu_util returns the amount of capacity of a CPU that is used by CFS
* tasks. The unit of the return value must be the one of capacity so we can
@@ -4981,7 +6846,7 @@
return (util >= capacity) ? capacity : util;
}
-
+#endif
/*
* select_task_rq_fair: Select target runqueue for the waking task in domains
* that have the 'sd_flag' flag set. In practice, this is SD_BALANCE_WAKE,
@@ -5002,9 +6867,17 @@
int new_cpu = prev_cpu;
int want_affine = 0;
int sync = wake_flags & WF_SYNC;
+#ifdef CONFIG_SCHED_HMP
+ int thread_pid;
+#endif
if (sd_flag & SD_BALANCE_WAKE)
+#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
+ want_affine = !wake_wide(p) && task_fits_max(p, cpu) &&
+ cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+#else
want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, tsk_cpus_allowed(p));
+#endif
rcu_read_lock();
for_each_domain(cpu, tmp) {
@@ -5073,6 +6946,115 @@
}
rcu_read_unlock();
+#ifdef CONFIG_SCHED_HMP
+ if (hmp_family_boost() && p->parent && p->parent->pid > 2) {
+ int lowest_ratio = 0;
+ thread_pid = hmp_is_family_in_fastest_domain(p->group_leader);
+ if (thread_pid) {
+ if (hmp_cpu_is_slowest(prev_cpu)) {
+ /* hmp_domain_min_load only returns 0 for an
+ * idle CPU or 1023 for any partly-busy one.
+ * Be explicit about requirement for an idle CPU.
+ */
+ trace_sched_hmp_migrate(p, thread_pid, HMP_MIGRATE_INFORM);
+ new_cpu = hmp_select_faster_cpu(p, prev_cpu, &lowest_ratio);
+ if (lowest_ratio == 0) {
+ hmp_next_up_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_FAMILY);
+ return new_cpu;
+ }
+ /* failed to perform HMP fork balance, use normal balance */
+ new_cpu = prev_cpu;
+ } else {
+ /* Make sure that the task stays in its previous hmp domain */
+ trace_sched_hmp_migrate(p, thread_pid, HMP_MIGRATE_INFORM);
+ new_cpu = hmp_select_faster_cpu(p, prev_cpu, &lowest_ratio);
+ trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_FAMILY);
+
+ return new_cpu;
+ }
+
+ }
+ }
+
+ prev_cpu = task_cpu(p);
+
+ if (hmp_up_migration(prev_cpu, &new_cpu, &p->se)) {
+ hmp_next_up_delay(&p->se, new_cpu);
+ trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
+ return new_cpu;
+ }
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+ if(!hmp_boost() && hmp_selective_boost()) {
+ int cpu;
+
+ cpu = hmp_selective_migration(prev_cpu, &p->se);
+ if (cpu < NR_CPUS) {
+ new_cpu = cpu;
+
+ /* Copied from hmp_down_migration() below to follow the
+ * original boost scheme without the cpu selection parts.
+ * The condition differs slightly from the original because
+ * we still need hmp_next_down_delay() and friends when a
+ * task whose prev_cpu is in the fastest domain is handed
+ * a cpu in the slowest domain.
+ */
+ if (new_cpu < NR_CPUS
+ && hmp_cpu_is_fastest(prev_cpu)
+ && hmp_cpu_is_slowest(new_cpu)) {
+ hmp_next_down_delay(&p->se, new_cpu);
+ #ifdef CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING
+ /*
+ * if load_avg_ratio is higher than hmp_tbsoftlanding.threshold,
+ * request pm_qos to slower domain for performance compensation
+ */
+ if (hmp_tbsoftlanding.enabled &&
+ p->se.avg.hmp_load_avg >= hmp_tbsoftlanding.threshold) {
+ hmp_do_tbsoftlanding(new_cpu, p->se.avg.hmp_load_avg);
+ trace_sched_hmp_task_based_softlanding(p, new_cpu,
+ HMP_MIGRATE_WAKEUP, p->se.avg.hmp_load_avg);
+ }
+ #endif
+ trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
+ }
+
+ return new_cpu;
+ }
+ }
+#endif
+ if (hmp_down_migration(prev_cpu, &p->se)) {
+ new_cpu = hmp_best_little_cpu(p, prev_cpu);
+ /*
+ * we might have no suitable CPU
+ * in which case new_cpu == NR_CPUS
+ */
+ if (new_cpu < NR_CPUS && new_cpu != prev_cpu) {
+ hmp_next_down_delay(&p->se, new_cpu);
+#ifdef CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING
+ if (hmp_cpu_is_slowest(prev_cpu) && hmp_cpu_is_slowest(new_cpu)) {
+ trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
+ return new_cpu;
+ }
+
+ /*
+ * if load_avg_ratio is higher than hmp_tbsoftlanding.threshold,
+ * request pm_qos to slower domain for performance compensation
+ */
+ if (hmp_tbsoftlanding.enabled &&
+ p->se.avg.hmp_load_avg >= hmp_tbsoftlanding.threshold) {
+ hmp_do_tbsoftlanding(new_cpu, p->se.avg.hmp_load_avg);
+ trace_sched_hmp_task_based_softlanding(p, new_cpu,
+ HMP_MIGRATE_WAKEUP, p->se.avg.hmp_load_avg);
+ }
+#endif
+ trace_sched_hmp_migrate(p, new_cpu, HMP_MIGRATE_WAKEUP);
+ return new_cpu;
+ }
+ }
+ /* Make sure that the task stays in its previous hmp domain */
+ if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(task_cpu(p))->cpus))
+ return task_cpu(p);
+#endif
+
return new_cpu;
}
@@ -5082,7 +7064,7 @@
* previous cpu. However, the caller only guarantees p->pi_lock is held; no
* other assumptions, including the state of rq->lock, should be made.
*/
-static void migrate_task_rq_fair(struct task_struct *p)
+static void migrate_task_rq_fair(struct task_struct *p, int next_cpu)
{
/*
* We are supposed to update the task to "current" time, then its up to date
@@ -5098,6 +7080,11 @@
/* We have migrated, no longer consider this task hot */
p->se.exec_start = 0;
+
+#if defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) && defined(CONFIG_SCHED_HMP)
+ /* To prevent performance drop, fit load of task to next_cpu */
+ hmp_load_migration(&p->se, task_cpu(p), next_cpu);
+#endif
}
static void task_dead_fair(struct task_struct *p)
@@ -5432,6 +7419,11 @@
if (curr->policy != SCHED_BATCH) {
update_rq_clock(rq);
+
+#ifdef CONFIG_SCHED_HMP
+ if (hmp_aggressive_yield && cfs_rq->curr)
+ cfs_rq->curr->exec_start -= YIELD_CORRECTION_TIME;
+#endif
/*
* Update run-time statistics of the 'current'.
*/
@@ -5534,7 +7526,7 @@
*
* The adjacency matrix of the resulting graph is given by:
*
- * log_2 n
+ * log_2 n
* A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
* k = 0
*
@@ -5580,7 +7572,7 @@
*
* [XXX write more on how we solve this.. _after_ merging pjt's patches that
* rewrite all of this once again.]
- */
+ */
static unsigned long __read_mostly max_load_balance_interval = HZ/10;
@@ -5617,6 +7609,22 @@
struct list_head tasks;
};
+#ifdef CONFIG_SCHED_HMP
+#ifdef MOVETASK_ONEPATH
+/*
+ * move_task - move a task from one runqueue to another runqueue.
+ * Both runqueues must be locked.
+ */
+static void move_task(struct task_struct *p, struct lb_env *env)
+{
+ deactivate_task(env->src_rq, p, 0);
+ set_task_cpu(p, env->dst_cpu);
+ activate_task(env->dst_rq, p, 0);
+ check_preempt_curr(env->dst_rq, p, 0);
+}
+#endif
+#endif
+
/*
* Is this task likely cache-hot:
*/
@@ -5828,6 +7836,47 @@
return NULL;
}
+#ifndef MOVETASK_ONEPATH
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env);
+/*
+ * detach_specific_task() -- tries to dequeue a specific task from env->src_rq, as
+ * part of active balancing operations within "domain".
+ *
+ * Returns 1 if successful and 0 otherwise.
+ */
+static int detach_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+ struct task_struct *p, *n;
+
+ lockdep_assert_held(&env->src_rq->lock);
+
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+ env->dst_cpu))
+ continue;
+
+ if (!hmp_can_migrate_task(p, env))
+ continue;
+
+ /* Check if we found the right task */
+ if (p != pm)
+ continue;
+
+ detach_task(p, env);
+
+ /*
+ * Right now, this is only the second place where
+ * lb_gained[env->idle] is updated (other is detach_tasks)
+ * so we can safely collect stats here rather than
+ * inside detach_tasks().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
+ }
+ return 0;
+}
+#endif
+
static const unsigned int sched_nr_migrate_break = 32;
/*
@@ -5983,9 +8032,10 @@
if (throttled_hierarchy(cfs_rq))
continue;
- if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq))
+ if (update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true))
update_tg_load_avg(cfs_rq, 0);
}
+
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6044,7 +8094,7 @@
raw_spin_lock_irqsave(&rq->lock, flags);
update_rq_clock(rq);
- update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+ update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq, true);
raw_spin_unlock_irqrestore(&rq->lock, flags);
}
@@ -6150,23 +8200,9 @@
static unsigned long scale_rt_capacity(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- u64 total, used, age_stamp, avg;
- s64 delta;
+ u64 used;
- /*
- * Since we're reading these variables without serialization make sure
- * we read them once before doing sanity checks on them.
- */
- age_stamp = READ_ONCE(rq->age_stamp);
- avg = READ_ONCE(rq->rt_avg);
- delta = __rq_clock_broken(rq) - age_stamp;
-
- if (unlikely(delta < 0))
- delta = 0;
-
- total = sched_avg_period() + delta;
-
- used = div_u64(avg, total);
+ used = rq->rt.avg.util_avg;
if (likely(used < SCHED_CAPACITY_SCALE))
return SCHED_CAPACITY_SCALE - used;
@@ -6242,7 +8278,7 @@
/*
* !SD_OVERLAP domains can assume that child groups
* span the current group.
- */
+ */
group = child->groups;
do {
@@ -7308,6 +9344,8 @@
*next_balance = next;
}
+static unsigned int hmp_idle_pull(int this_cpu);
+
/*
* idle_balance is called by schedule() if this_cpu is about to become
* idle. Attempts to pull tasks from other CPUs.
@@ -7380,6 +9418,11 @@
}
rcu_read_unlock();
+ // implement idle pull for HMP
+ if (!pulled_task && this_rq->nr_running == 0) {
+ pulled_task = hmp_idle_pull(this_cpu);
+ }
+
raw_spin_lock(&this_rq->lock);
if (curr_cost > this_rq->max_idle_balance_cost)
@@ -7500,13 +9543,39 @@
unsigned long next_balance; /* in jiffy units */
} nohz ____cacheline_aligned;
-static inline int find_new_ilb(void)
+static inline int find_new_ilb(int call_cpu)
{
+#ifdef CONFIG_SCHED_HMP
+ int lowest_cpu;
+ int lowest_ratio;
+ struct hmp_domain *hmp;
+ struct cpumask allowed_hmp_cpus;
int ilb = cpumask_first(nohz.idle_cpus_mask);
+ if (hmp_cpu_is_slowest(call_cpu))
+ hmp = hmp_slower_domain(call_cpu);
+ else
+ hmp = hmp_cpu_domain(call_cpu);
+
+ cpumask_copy(&allowed_hmp_cpus, &hmp->cpus);
+#ifdef CONFIG_SCHED_SKIP_CORE_SELECTION_MASK
+ cpumask_xor(&allowed_hmp_cpus, &hmp->cpus,
+ &hmp->cpumask_skip);
+
+#endif
+ lowest_ratio = hmp_domain_min_load(hmp, &lowest_cpu,
+ &allowed_hmp_cpus);
+ if (lowest_ratio < hmp_packing_threshold) {
+ ilb = cpumask_first_and(nohz.idle_cpus_mask, &allowed_hmp_cpus);
+ } else {
+ /* restrict nohz balancing to occur in the same hmp domain */
+ ilb = cpumask_first_and(nohz.idle_cpus_mask,
+ &((struct hmp_domain *)hmp_cpu_domain(call_cpu))->cpus);
+ }
+
if (ilb < nr_cpu_ids && idle_cpu(ilb))
return ilb;
-
+#endif
return nr_cpu_ids;
}
@@ -7515,13 +9584,13 @@
* nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
* CPU (if there is one).
*/
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(int cpu)
{
int ilb_cpu;
nohz.next_balance++;
- ilb_cpu = find_new_ilb();
+ ilb_cpu = find_new_ilb(cpu);
if (ilb_cpu >= nr_cpu_ids)
return;
@@ -7845,6 +9914,18 @@
if (time_before(now, nohz.next_balance))
return false;
+#ifdef CONFIG_SCHED_HMP
+ /*
+ * Bail out if there are no nohz CPUs in our
+ * HMP domain, since we will move tasks between
+ * domains through wakeup and force balancing
+ * as necessary based upon task load.
+ */
+ if (cpumask_first_and(nohz.idle_cpus_mask,
+ &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
+ return false;
+#endif
+
if (rq->nr_running >= 2)
return true;
@@ -7885,6 +9966,676 @@
static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
#endif
+
+#ifdef CONFIG_SCHED_HMP
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+static struct hmp_domain firstboost, secondboost;
+static struct hmp_domain logical_nonboost, logical_boost;
+
+static int hmp_selective_migration(int prev_cpu, struct sched_entity *se)
+{
+ int new_cpu = NR_CPUS;
+ int is_boosted_task;
+ int min_load, min_cpu;
+ struct task_struct *p;
+
+ p = container_of(se, struct task_struct, se);
+ is_boosted_task = cpuset_task_is_boosted(p);
+
+ /*
+ * NITP (non-important task packing)
+ * NITP is a boosting scheme that accelerates selected tasks by
+ * reducing their waiting time.
+ * NITP divides the CPUs into 2 groups (a logical boost CPU group and a
+ * logical non-boost CPU group). When a non-important task wakes up or
+ * is newly forked, NITP delivers it to a non-boost CPU unconditionally,
+ * ignoring any load imbalance. This deliberate imbalance gives the boost
+ * CPUs a higher chance of being idle, so when an important task is
+ * placed on such an idle CPU it can run without waiting.
+ */
+ if (is_boosted_task) {
+ min_load = hmp_domain_min_load(&firstboost,&min_cpu, tsk_cpus_allowed(p));
+ if(min_load) {
+ min_load = hmp_domain_min_load(&secondboost,&min_cpu, tsk_cpus_allowed(p));
+ if(min_load)
+ min_load = hmp_domain_min_load(&logical_nonboost,&min_cpu, tsk_cpus_allowed(p));
+ }
+ } else {
+ min_load = hmp_domain_min_load(&logical_nonboost,&min_cpu, tsk_cpus_allowed(p));
+ }
+ new_cpu = min_cpu;
+
+ return new_cpu;
+}
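In short, the NITP placement above cascades firstboost -> secondboost -> logical_nonboost for a boosted task, settling on whichever group first reports an idle CPU (min load == 0), while a non-boosted task goes straight to the non-boost group. A condensed standalone sketch of that ordering, with the per-group minimum loads stubbed as plain numbers:

#include <stdio.h>

enum hmp_group { FIRSTBOOST, SECONDBOOST, NONBOOST, NR_GROUPS };

/* min_load[g] == 0 means group g has an idle CPU available. */
static enum hmp_group pick_group(int boosted, const unsigned int min_load[NR_GROUPS])
{
	if (boosted) {
		if (min_load[FIRSTBOOST] == 0)
			return FIRSTBOOST;
		if (min_load[SECONDBOOST] == 0)
			return SECONDBOOST;
	}
	return NONBOOST;
}

int main(void)
{
	const unsigned int loads[NR_GROUPS] = { 512, 0, 0 };	/* firstboost busy */

	printf("boosted task -> group %d\n", pick_group(1, loads));	/* SECONDBOOST */
	printf("normal task  -> group %d\n", pick_group(0, loads));	/* NONBOOST */
	return 0;
}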
+#endif
+
+/* Check if task should migrate to a faster cpu */
+static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ int temp_target_cpu;
+ unsigned int up_threshold;
+ unsigned int min_load;
+ u64 now;
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+ int is_boosted_task =
+ cpuset_task_is_boosted(container_of(se, struct task_struct, se));
+#endif
+
+ if (hmp_cpu_is_fastest(cpu))
+ return 0;
+
+ if (!hmp_boost()) {
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+ if (!hmp_selective_boost() || !is_boosted_task) {
+#endif
+ if (hmp_semiboost())
+ up_threshold = hmp_semiboost_up_threshold;
+ else
+ up_threshold = hmp_up_threshold;
+
+ if (se->avg.hmp_load_avg < up_threshold)
+ return 0;
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+ }
+#endif
+ }
+
+ /* Let the task load settle before doing another up migration */
+ /* hack - always use clock from first online CPU */
+ now = cpu_rq(cpumask_first(cpu_online_mask))->clock_task;
+ if (((now - se->avg.hmp_last_up_migration) >> 10)
+ < hmp_next_up_threshold)
+ return 0;
+
+ if (se->avg.last_update_time)
+ hp_event_update(se);
+
+ /* hmp_domain_min_load only returns 0 for an
+ * idle CPU.
+ * Be explicit about the requirement for an idle CPU.
+ * The return value is weighted_cpuload (runnable_load_avg of the cfs_rq).
+ */
+ min_load = hmp_domain_min_load(hmp_faster_domain(cpu),
+ &temp_target_cpu, tsk_cpus_allowed(p));
+
+ if (temp_target_cpu != NR_CPUS) {
+ if (hmp_aggressive_up_migration) {
+ if (target_cpu)
+ *target_cpu = temp_target_cpu;
+ return 1;
+ } else {
+ if (min_load == 0) {
+ if (target_cpu)
+ *target_cpu = temp_target_cpu;
+ return 1;
+ }
+ }
+ }
+
+ return 0;
+}
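One detail worth calling out: the settle check above turns a nanosecond delta into rough microseconds with a cheap >> 10 (divide by 1024) instead of a real division by 1000, and hmp_down_migration() below uses the same trick. A small sketch of the test, with a hypothetical threshold value:

#include <stdio.h>
#include <stdint.h>

/* ns -> "approximate microseconds" via >>10 (divide by 1024, not 1000). */
static int load_has_settled(uint64_t now_ns, uint64_t last_migration_ns,
			    uint64_t threshold_us)
{
	return ((now_ns - last_migration_ns) >> 10) >= threshold_us;
}

int main(void)
{
	const uint64_t threshold_us = 2048;	/* hypothetical next-up threshold */

	printf("%d\n", load_has_settled(3000000, 1000000, threshold_us)); /* 2ms ago: 0 */
	printf("%d\n", load_has_settled(9000000, 1000000, threshold_us)); /* 8ms ago: 1 */
	return 0;
}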
+
+/* Check if task should migrate to a slower cpu */
+static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
+{
+ struct task_struct *p = task_of(se);
+ u64 now;
+
+ if (hmp_cpu_is_slowest(cpu)) {
+ if (hmp_packing_enabled)
+ return 1;
+ else
+ return 0;
+ }
+
+ /* Let the task load settle before doing another down migration */
+ now = cpu_rq(cpu)->clock_task;
+ if (((now - se->avg.hmp_last_down_migration) >> 10)
+ < hmp_next_down_threshold)
+ return 0;
+
+ if (hmp_aggressive_up_migration) {
+ if (hmp_boost())
+ return 0;
+ } else {
+ if (hmp_domain_min_load(hmp_cpu_domain(cpu), NULL, NULL)) {
+ if (hmp_active_down_migration)
+ return 1;
+ } else if (hmp_boost()) {
+ return 0;
+ }
+ }
+
+ if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
+ tsk_cpus_allowed(p))) {
+ unsigned int down_threshold;
+
+ if (hmp_semiboost())
+ down_threshold = hmp_semiboost_down_threshold;
+ else
+ down_threshold = hmp_down_threshold;
+
+ if (se->avg.hmp_load_avg < down_threshold)
+ return 1;
+ }
+ return 0;
+}
+
+/*
+ * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
+ * Ideally this function should be merged with can_migrate_task() to avoid
+ * redundant code.
+ */
+static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
+{
+ int tsk_cache_hot = 0;
+
+ /*
+ * We do not migrate tasks that are:
+ * 1) running (obviously), or
+ * 2) cannot be migrated to this CPU due to cpus_allowed
+ */
+ if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
+ return 0;
+ }
+ env->flags &= ~LBF_ALL_PINNED;
+
+ if (task_running(env->src_rq, p)) {
+ schedstat_inc(p, se.statistics.nr_failed_migrations_running);
+ return 0;
+ }
+
+ /*
+ * Aggressive migration if:
+ * 1) task is cache cold, or
+ * 2) too many balance attempts have failed.
+ */
+
+ tsk_cache_hot = task_hot(p, env);
+ if (!tsk_cache_hot ||
+ env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
+#ifdef CONFIG_SCHEDSTATS
+ if (tsk_cache_hot) {
+ schedstat_inc(env->sd, lb_hot_gained[env->idle]);
+ schedstat_inc(p, se.statistics.nr_forced_migrations);
+ }
+#endif
+ return 1;
+ }
+
+ return 1;
+}
+
+#ifdef MOVETASK_ONEPATH
+/*
+ * move_specific_task tries to move a specific task.
+ * Returns 1 if successful and 0 otherwise.
+ * Called with both runqueues locked.
+ */
+static int move_specific_task(struct lb_env *env, struct task_struct *pm)
+{
+ struct task_struct *p, *n;
+
+ list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
+ if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
+ env->dst_cpu))
+ continue;
+
+ if (!hmp_can_migrate_task(p, env))
+ continue;
+ /* Check if we found the right task */
+ if (p != pm)
+ continue;
+
+ move_task(p, env);
+ /*
+ * Right now, this is only the third place move_task()
+ * is called, so we can safely collect move_task()
+ * stats here rather than inside move_task().
+ */
+ schedstat_inc(env->sd, lb_gained[env->idle]);
+ return 1;
+ }
+ return 0;
+}
+#endif
+
+static ATOMIC_NOTIFIER_HEAD(hmp_task_migration_notifier);
+
+int register_hmp_task_migration_notifier(struct notifier_block *nb)
+{
+ return atomic_notifier_chain_register(&hmp_task_migration_notifier, nb);
+}
+
+static int hmp_up_migration_noti(void)
+{
+ return atomic_notifier_call_chain(&hmp_task_migration_notifier,
+ HMP_UP_MIGRATION, NULL);
+}
+
+static int hmp_down_migration_noti(void)
+{
+ return atomic_notifier_call_chain(&hmp_task_migration_notifier,
+ HMP_DOWN_MIGRATION, NULL);
+}
+
+/*
+ * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_force_up_migration uses this to push a currently running task
+ * off a runqueue.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_active_task_migration_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ struct task_struct *p = busiest_rq->migrate_task;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+#ifndef MOVETASK_ONEPATH
+ int detached = 0;
+#endif
+ raw_spin_lock_irq(&busiest_rq->lock);
+
+ if (p->exit_state)
+ goto out_unlock;
+
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance)) {
+ goto out_unlock;
+ }
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1)
+ goto out_unlock;
+ /* Task has migrated meanwhile, abort forced migration */
+ if (task_rq(p) != busiest_rq)
+ goto out_unlock;
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+#ifdef MOVETASK_ONEPATH
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+#endif
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+#ifdef MOVETASK_ONEPATH
+ if (move_specific_task(&env, p)) {
+ schedstat_inc(sd, alb_pushed);
+ if (hmp_cpu_is_fastest(target_cpu))
+ hmp_up_migration_noti();
+ else if (hmp_cpu_is_slowest(target_cpu))
+ hmp_down_migration_noti();
+ } else {
+ schedstat_inc(sd, alb_failed);
+ }
+#else
+ detached = detach_specific_task(&env, p);
+ if (detached)
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+#endif
+ }
+ rcu_read_unlock();
+#ifdef MOVETASK_ONEPATH
+ double_unlock_balance(busiest_rq, target_rq);
+#endif
+out_unlock:
+ busiest_rq->active_balance = 0;
+#ifdef MOVETASK_ONEPATH
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ put_task_struct(p);
+#else
+ raw_spin_unlock(&busiest_rq->lock);
+
+ if (detached)
+ attach_one_task(target_rq, p);
+
+ if (hmp_cpu_is_fastest(target_cpu))
+ hmp_up_migration_noti();
+ else if (hmp_cpu_is_slowest(target_cpu))
+ hmp_down_migration_noti();
+
+ local_irq_enable();
+#endif
+ return 0;
+}
+
+/*
+ * hmp_idle_pull_cpu_stop is run by cpu stopper and used to
+ * migrate a specific task from one runqueue to another.
+ * hmp_idle_pull uses this to push a currently running task
+ * off a runqueue to a faster CPU.
+ * Locking is slightly different than usual.
+ * Based on active_load_balance_stop_cpu and can potentially be merged.
+ */
+static int hmp_idle_pull_cpu_stop(void *data)
+{
+ struct rq *busiest_rq = data;
+ struct task_struct *p = busiest_rq->migrate_task;
+ int busiest_cpu = cpu_of(busiest_rq);
+ int target_cpu = busiest_rq->push_cpu;
+ struct rq *target_rq = cpu_rq(target_cpu);
+ struct sched_domain *sd;
+#ifndef MOVETASK_ONEPATH
+ int detached = 0;
+#endif
+
+ raw_spin_lock_irq(&busiest_rq->lock);
+
+ if (p->exit_state)
+ goto out_unlock;
+
+ /* make sure the requested cpu hasn't gone down in the meantime */
+ if (unlikely(busiest_cpu != smp_processor_id() ||
+ !busiest_rq->active_balance)) {
+ goto out_unlock;
+ }
+ /* Is there any task to move? */
+ if (busiest_rq->nr_running <= 1) {
+ goto out_unlock;
+ }
+ /* Task has migrated meanwhile, abort forced migration */
+ if (task_rq(p) != busiest_rq) {
+ goto out_unlock;
+ }
+ /*
+ * This condition is "impossible", if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
+ */
+ BUG_ON(busiest_rq == target_rq);
+
+#ifdef MOVETASK_ONEPATH
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
+#endif
+ /* Search for an sd spanning us and the target CPU. */
+ rcu_read_lock();
+ for_each_domain(target_cpu, sd) {
+ if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
+ break;
+ }
+ if (likely(sd)) {
+ struct lb_env env = {
+ .sd = sd,
+ .dst_cpu = target_cpu,
+ .dst_rq = target_rq,
+ .src_cpu = busiest_rq->cpu,
+ .src_rq = busiest_rq,
+ .idle = CPU_IDLE,
+ };
+
+ schedstat_inc(sd, alb_count);
+
+#ifdef MOVETASK_ONEPATH
+ if (move_specific_task(&env, p)) {
+ schedstat_inc(sd, alb_pushed);
+ hmp_up_migration_noti();
+ } else {
+ schedstat_inc(sd, alb_failed);
+ }
+#else
+ detached = detach_specific_task(&env, p);
+ if (detached)
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+#endif
+ }
+ rcu_read_unlock();
+#ifdef MOVETASK_ONEPATH
+ double_unlock_balance(busiest_rq, target_rq);
+#endif
+out_unlock:
+ busiest_rq->active_balance = 0;
+#ifdef MOVETASK_ONEPATH
+ raw_spin_unlock_irq(&busiest_rq->lock);
+ put_task_struct(p);
+#else
+ raw_spin_unlock(&busiest_rq->lock);
+
+ if (detached)
+ attach_one_task(target_rq, p);
+
+ if (hmp_cpu_is_fastest(target_cpu))
+ hmp_up_migration_noti();
+ else if (hmp_cpu_is_slowest(target_cpu))
+ hmp_down_migration_noti();
+
+ local_irq_enable();
+#endif
+ return 0;
+}
+
+static DEFINE_SPINLOCK(hmp_force_migration);
+
+/*
+ * hmp_force_up_migration checks runqueues for tasks that need to
+ * be actively migrated to a faster cpu.
+ */
+static void hmp_force_up_migration(int this_cpu)
+{
+ int cpu, target_cpu = NR_CPUS+1;
+ struct sched_entity *curr, *orig;
+ struct rq *target;
+ unsigned long flags;
+ unsigned int force;
+ struct task_struct *p;
+
+ if (!spin_trylock(&hmp_force_migration)) {
+ trace_printk("CPU%d FAILED TO GET MIGRATION SPINLOCK\n", this_cpu);
+ return;
+ }
+ trace_printk("hmp_force_up_migration spinlock TAKEN cpu=%d\n", this_cpu);
+
+ for_each_online_cpu(cpu) {
+ BUG_ON((target_cpu > NR_CPUS+1) || (target_cpu < 0));
+ force = 0;
+ target = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&target->lock, flags);
+ curr = target->cfs.curr;
+ if (!curr) {
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ continue;
+ }
+ trace_printk("examining CPU%d\n", cpu);
+ if (!entity_is_task(curr)) {
+ struct cfs_rq *cfs_rq;
+
+ cfs_rq = group_cfs_rq(curr);
+ while (cfs_rq) {
+ curr = cfs_rq->curr;
+ cfs_rq = group_cfs_rq(curr);
+ }
+ }
+ orig = curr;
+ curr = hmp_get_heaviest_task(curr, 1);
+ p = task_of(curr);
+ if (hmp_up_migration(cpu, &target_cpu, curr)) {
+ if (!target->active_balance) {
+ get_task_struct(p);
+ target->active_balance = 1;
+ target->push_cpu = target_cpu;
+ target->migrate_task = p;
+ force = 1;
+ trace_sched_hmp_migrate(p, target->push_cpu,
+ HMP_MIGRATE_FORCE);
+ hmp_next_up_delay(&p->se, target->push_cpu);
+ }
+ }
+
+ if (!force && !target->active_balance) {
+ /*
+ * For now we just check the currently running task.
+ * Selecting the lightest task for offloading will
+ * require extensive book keeping.
+ */
+ curr = hmp_get_lightest_task(orig, 1);
+ p = task_of(curr);
+ target->push_cpu = hmp_offload_down(cpu, curr);
+ if (target->push_cpu < NR_CPUS) {
+#ifdef MOVETASK_ONEPATH
+ get_task_struct(p);
+#endif
+ target->active_balance = 1;
+ target->migrate_task = p;
+ force = 1;
+ trace_sched_hmp_migrate(p, target->push_cpu,
+ HMP_MIGRATE_OFFLOAD);
+ hmp_next_down_delay(&p->se, target->push_cpu);
+ }
+ }
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ if (force)
+ stop_one_cpu_nowait(cpu_of(target),
+ hmp_active_task_migration_cpu_stop,
+ target, &target->active_balance_work);
+ }
+
+ spin_unlock(&hmp_force_migration);
+
+ trace_printk("spinlock RELEASE cpu %d\n", this_cpu);
+}
+
+/*
+ * hmp_idle_pull looks at little domain runqueues to see
+ * if a task should be pulled.
+ *
+ * Reuses hmp_force_migration spinlock.
+ *
+ */
+static unsigned int hmp_idle_pull(int this_cpu)
+{
+ int cpu;
+ struct sched_entity *curr, *orig;
+ struct hmp_domain *hmp_domain = NULL;
+ struct rq *target, *rq;
+ unsigned long flags,ratio = 0;
+ unsigned int force=0;
+ unsigned int up_threshold;
+ struct task_struct *p = NULL;
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+ int is_boosted_task = 0;
+#endif
+
+ if (!hmp_cpu_is_slowest(this_cpu))
+ hmp_domain = hmp_slower_domain(this_cpu);
+ if (!hmp_domain)
+ return 0;
+
+ if (!spin_trylock(&hmp_force_migration)) {
+ return 0;
+ }
+
+ /* first select a task */
+ for_each_cpu(cpu, &hmp_domain->cpus) {
+ rq = cpu_rq(cpu);
+ raw_spin_lock_irqsave(&rq->lock, flags);
+ curr = rq->cfs.curr;
+ if (!curr) {
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ continue;
+ }
+ if (!entity_is_task(curr)) {
+ struct cfs_rq *cfs_rq;
+
+ cfs_rq = group_cfs_rq(curr);
+ while (cfs_rq) {
+ curr = cfs_rq->curr;
+ if(!entity_is_task(curr))
+ cfs_rq = group_cfs_rq(curr);
+ else
+ cfs_rq = NULL;
+ }
+ }
+ orig = curr;
+ curr = hmp_get_heaviest_task(curr, 1);
+ if (hmp_semiboost())
+ up_threshold = hmp_semiboost_up_threshold;
+ else
+ up_threshold = hmp_up_threshold;
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+ if (p != NULL)
+ is_boosted_task = cpuset_task_is_boosted(p);
+#endif
+ if (hmp_boost() || curr->avg.hmp_load_avg > up_threshold
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+ || (hmp_selective_boost() && is_boosted_task)
+#endif
+ )
+ if (curr->avg.hmp_load_avg > ratio) {
+ if (p)
+ put_task_struct(p);
+ p = task_of(curr);
+ target = rq;
+ ratio = curr->avg.hmp_load_avg;
+ get_task_struct(p);
+ }
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ if ( !p )
+ goto done;
+
+ /* now we have a candidate */
+ raw_spin_lock_irqsave(&target->lock, flags);
+ if (!target->active_balance && task_rq(p) == target) {
+ target->active_balance = 1;
+ target->push_cpu = this_cpu;
+ target->migrate_task = p;
+ force = 1;
+ trace_sched_hmp_migrate(p, target->push_cpu,
+ HMP_MIGRATE_IDLE_PULL);
+ hmp_next_up_delay(&p->se, target->push_cpu);
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ } else {
+ raw_spin_unlock_irqrestore(&target->lock, flags);
+ put_task_struct(p);
+ }
+
+ if (force) {
+ stop_one_cpu_nowait(cpu_of(target),
+ hmp_idle_pull_cpu_stop,
+ target, &target->active_balance_work);
+ }
+done:
+ spin_unlock(&hmp_force_migration);
+ return force;
+}
+
+#else
+static void hmp_force_up_migration(int this_cpu) { }
+static unsigned int hmp_idle_pull(int this_cpu) { return 0; }
+#endif /* CONFIG_SCHED_HMP */
+
/*
* run_rebalance_domains is triggered when needed from the scheduler tick.
* Also triggered for nohz idle balancing (with nohz_balancing_kick set).
@@ -7905,12 +10656,14 @@
*/
nohz_idle_balance(this_rq, idle);
rebalance_domains(this_rq, idle);
+
+ hmp_force_up_migration(this_rq->cpu);
}
/*
* Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
*/
-void trigger_load_balance(struct rq *rq)
+void trigger_load_balance(struct rq *rq, int cpu)
{
/* Don't need to rebalance while attached to NULL domain */
if (unlikely(on_null_domain(rq)))
@@ -7920,12 +10673,15 @@
raise_softirq(SCHED_SOFTIRQ);
#ifdef CONFIG_NO_HZ_COMMON
if (nohz_kick_needed(rq))
- nohz_balancer_kick();
+ nohz_balancer_kick(cpu);
#endif
}
static void rq_online_fair(struct rq *rq)
{
+#ifdef CONFIG_SCHED_HMP
+ hmp_online_cpu(rq->cpu);
+#endif
update_sysctl();
update_runtime_enabled(rq);
@@ -7933,6 +10689,9 @@
static void rq_offline_fair(struct rq *rq)
{
+#ifdef CONFIG_SCHED_HMP
+ hmp_offline_cpu(rq->cpu);
+#endif
update_sysctl();
/* Ensure any throttled groups are reachable by pick_next_task */
@@ -7956,6 +10715,8 @@
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
+
+ hp_event_update(&curr->se);
}
/*
@@ -8095,13 +10856,29 @@
se->vruntime += cfs_rq->min_vruntime;
}
+#ifdef CONFIG_SMP
+void copy_sched_avg(struct sched_avg *from, struct sched_avg *to, unsigned int ratio)
+{
+ to->last_update_time = from->last_update_time;
+ to->util_avg = (from->util_avg * ratio) / 100;
+ to->util_sum = (from->util_sum * ratio) / 100;
+ to->load_avg = (from->load_avg * ratio) / 100;
+ to->load_sum = (from->load_sum * ratio) / 100;
+}
+#else
+void copy_sched_avg(struct sched_avg *from, struct sched_avg *to, unsigned int ratio) { }
+#endif
+
static void switched_from_fair(struct rq *rq, struct task_struct *p)
{
detach_task_cfs_rq(p);
}
+extern unsigned int sched_switch_to_fair_load_ratio;
+
static void switched_to_fair(struct rq *rq, struct task_struct *p)
{
+ copy_sched_avg(&p->rt.avg, &p->se.avg, sched_switch_to_fair_load_ratio);
attach_task_cfs_rq(p);
if (task_on_rq_queued(p)) {
@@ -8170,8 +10947,11 @@
for_each_possible_cpu(i) {
if (tg->cfs_rq)
kfree(tg->cfs_rq[i]);
- if (tg->se)
+ if (tg->se) {
+ if (tg->se[i])
+ remove_entity_load_avg(tg->se[i]);
kfree(tg->se[i]);
+ }
}
kfree(tg->cfs_rq);
@@ -8323,7 +11103,7 @@
return 1;
}
-void unregister_fair_sched_group(struct task_group *tg) { }
+void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -8429,6 +11209,467 @@
zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
cpu_notifier(sched_ilb_notifier, 0);
#endif
+
+#ifdef CONFIG_SCHED_HMP
+ hmp_cpu_mask_setup();
+#endif
#endif /* SMP */
}
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
+{
+ if ((curr >> SCHED_FREQSCALE_SHIFT) > max)
+ /*
+ * In an out-of-sync state, max has already been limited but
+ * curr has not yet been updated to the limited value.
+ * To avoid computing a scale in that state, return 1.0 as the
+ * equivalent factor (already shifted).
+ */
+ return 1 << SCHED_FREQSCALE_SHIFT;
+ else
+ return curr / max;
+}
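Because the extents store the policy limits pre-shifted (policy->max >> SCHED_FREQSCALE_SHIFT, see cpufreq_policy_callback() below), the division above lands on a scale where ~1024 means "running at max frequency". A standalone sketch of the arithmetic, assuming SCHED_FREQSCALE_SHIFT is 10 (an assumption made here so the factor comes out 1024-based):

#include <stdio.h>

#define FREQSCALE_SHIFT 10	/* assumed value of SCHED_FREQSCALE_SHIFT */

static unsigned int freq_scale_factor(unsigned int max_khz, unsigned int curr_khz)
{
	unsigned int stored_max = max_khz >> FREQSCALE_SHIFT;	/* as stored in extents */

	if ((curr_khz >> FREQSCALE_SHIFT) > stored_max)
		return 1 << FREQSCALE_SHIFT;	/* out-of-sync clamp, as above */
	return curr_khz / stored_max;
}

int main(void)
{
	printf("%u\n", freq_scale_factor(1794000, 1794000));	/* ~1024 (full speed) */
	printf("%u\n", freq_scale_factor(1794000,  897000));	/* ~512  (half speed) */
	return 0;
}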
+
+static void extents_update_max_min(struct cpufreq_extents *extents)
+{
+#ifdef CONFIG_CPU_THERMAL_IPA
+ extents->min = max(extents->cpufreq_min, extents->thermal_min);
+ extents->max = min(extents->cpufreq_max, extents->thermal_max);
+#else
+ extents->min = extents->cpufreq_min;
+ extents->max = extents->cpufreq_max;
+#endif
+}
+
+/* Called when the CPU Frequency is changed.
+ * Once for each CPU.
+ */
+static int cpufreq_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+ int cpu = freq->cpu;
+ struct cpufreq_extents *extents;
+
+ if (freq->flags & CPUFREQ_CONST_LOOPS)
+ return NOTIFY_OK;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return NOTIFY_OK;
+
+ /* if dynamic load scale is disabled, set the load scale to 1.0 */
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ freq_scale[cpu].curr_scale = 1024;
+ return NOTIFY_OK;
+ }
+
+ extents = &freq_scale[cpu];
+ if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
+ /* If our governor was recognised as a single-freq governor,
+ * use 1.0
+ */
+ extents->curr_scale = 1024;
+ } else {
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, freq->new);
+ }
+
+ return NOTIFY_OK;
+}
+
+/* Called when the CPUFreq governor is changed.
+ * Only called for the CPUs which are actually changed by the
+ * userspace.
+ */
+static int cpufreq_policy_callback(struct notifier_block *nb,
+ unsigned long event, void *data)
+{
+ struct cpufreq_policy *policy = data;
+ struct cpufreq_extents *extents;
+ int cpu, singleFreq = 0;
+ static const char performance_governor[] = "performance";
+ static const char powersave_governor[] = "powersave";
+
+ if (event == CPUFREQ_START)
+ return 0;
+
+ if (event != CPUFREQ_ADJUST)
+ return 0;
+
+ if (!policy->governor)
+ return 0;
+
+ /* CPUFreq governors do not accurately report the range of
+ * CPU Frequencies they will choose from.
+ * We recognise performance and powersave governors as
+ * single-frequency only.
+ */
+ if (!strncmp(policy->governor->name, performance_governor,
+ strlen(performance_governor)) ||
+ !strncmp(policy->governor->name, powersave_governor,
+ strlen(powersave_governor)))
+ singleFreq = 1;
+
+ /* Make sure that all CPUs impacted by this policy are
+ * updated since we will only get a notification when the
+ * user explicitly changes the policy on a CPU.
+ */
+ for_each_cpu(cpu, policy->cpus) {
+ extents = &freq_scale[cpu];
+ extents->cpufreq_max = policy->max >> SCHED_FREQSCALE_SHIFT;
+ extents->cpufreq_min = policy->min >> SCHED_FREQSCALE_SHIFT;
+ extents_update_max_min(extents);
+
+ if (!hmp_data.freqinvar_load_scale_enabled) {
+ extents->curr_scale = 1024;
+ } else if (singleFreq) {
+ extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = 1024;
+ } else {
+ extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
+ extents->curr_scale = cpufreq_calc_scale(extents->min,
+ extents->max, policy->cur);
+ }
+ }
+
+ return 0;
+}
+
+static struct notifier_block cpufreq_notifier = {
+ .notifier_call = cpufreq_callback,
+};
+static struct notifier_block cpufreq_policy_notifier = {
+ .notifier_call = cpufreq_policy_callback,
+};
+static int __init register_sched_cpufreq_notifier(void)
+{
+ int ret = 0;
+
+ /* init safe defaults since there are no policies at registration */
+ for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
+ /* safe defaults */
+ freq_scale[ret].cpufreq_max = 1024;
+ freq_scale[ret].cpufreq_min = 1024;
+ freq_scale[ret].thermal_max = UINT_MAX;
+ freq_scale[ret].thermal_min = 0;
+ freq_scale[ret].curr_scale = 1024;
+
+ extents_update_max_min(&freq_scale[ret]);
+ }
+
+ pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
+ ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
+ CPUFREQ_POLICY_NOTIFIER);
+
+ if (ret != -EINVAL)
+ ret = cpufreq_register_notifier(&cpufreq_notifier,
+ CPUFREQ_TRANSITION_NOTIFIER);
+ return ret;
+}
+
+core_initcall(register_sched_cpufreq_notifier);
+
+#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
+
+#if defined(CONFIG_SCHED_HMP)
+static int __init hmp_param_init(void)
+{
+#if defined(CONFIG_OF)
+ struct device_node *hmp_param_node;
+ unsigned int duration = 0;
+
+ hmp_param_node = of_find_node_by_path("/cpus/hmp");
+ if (!hmp_param_node) {
+ pr_warn("%s hmp node is not exist!\n",__func__);
+ return -ENOENT;
+ }
+
+ if (of_property_read_u32(hmp_param_node,
+ "up_threshold", &hmp_up_threshold))
+ pr_warn("%s missing up_threshold property\n",__func__);
+
+ if (of_property_read_u32(hmp_param_node,
+ "down_threshold", &hmp_down_threshold))
+ pr_warn("%s missing down_threshold property\n",__func__);
+
+ if (!of_property_read_u32(hmp_param_node,
+ "bootboost-duration-us", &duration)) {
+ hmp_boostpulse_endtime = ktime_to_us(ktime_get()) + duration;
+ pr_info("hmp_boostpulse_endtime is set(%llu)\n",hmp_boostpulse_endtime);
+ }
+
+ if (of_property_read_u32(hmp_param_node,
+ "semiboost_up_threshold",&hmp_semiboost_up_threshold))
+ pr_warn("%s missing semiboost_up_threshold property\n",__func__);
+
+ if (of_property_read_u32(hmp_param_node,
+ "semiboost_down_threshold", &hmp_semiboost_down_threshold))
+ pr_warn("%s missing semiboost_down_threshold property\n",__func__);
+#if defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL)
+ if (of_property_read_u64(hmp_param_node,
+ "up_compst_ratio", &hmp_up_compst_ratio))
+ pr_warn("%s missing up_compst_ratio property\n",__func__);
+
+ if (of_property_read_u64(hmp_param_node,
+ "down_compst_ratio", &hmp_down_compst_ratio))
+ pr_warn("%s missing down_compst_ratio property\n",__func__);
+#endif /* CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+#else
+ hmp_boostpulse_endtime = ktime_to_us(ktime_get()) + BOOT_BOOST_DURATION;
+#endif
+#ifdef CONFIG_SCHED_HMP_SELECTIVE_BOOST_WITH_NITP
+ cpumask_clear(&firstboost.cpus);
+ cpumask_set_cpu(6, &firstboost.cpus);
+ cpumask_set_cpu(7, &firstboost.cpus);
+
+ cpumask_clear(&secondboost.cpus);
+ cpumask_set_cpu(4, &secondboost.cpus);
+ cpumask_set_cpu(5, &secondboost.cpus);
+
+ cpumask_clear(&logical_nonboost.cpus);
+ cpumask_set_cpu(0, &logical_nonboost.cpus);
+ cpumask_set_cpu(1, &logical_nonboost.cpus);
+ cpumask_set_cpu(2, &logical_nonboost.cpus);
+ cpumask_set_cpu(3, &logical_nonboost.cpus);
+
+ cpumask_clear(&logical_boost.cpus);
+ cpumask_set_cpu(4, &logical_boost.cpus);
+ cpumask_set_cpu(5, &logical_boost.cpus);
+ cpumask_set_cpu(6, &logical_boost.cpus);
+ cpumask_set_cpu(7, &logical_boost.cpus);
+#endif
+ return 0;
+}
+pure_initcall(hmp_param_init);
+#endif
+
+#ifdef CONFIG_SCHED_HP_EVENT
+#include <soc/samsung/exynos-cpu_hotplug.h>
+enum {
+ BIG_NORMAL = 0,
+ BIG_BOOST = 1,
+};
+
+static int hmp_big_status = BIG_NORMAL;
+
+static unsigned int big_boost_threshold = 500;
+static unsigned int big_heavy_time = 100; /* ms */
+
+static struct task_struct *monitor_task;
+static u64 monitor_start_time;
+
+DEFINE_RAW_SPINLOCK(hp_event_lock);
+
+static unsigned int hmp_domain_sum_nr_running(struct hmp_domain *hmpd)
+{
+ int cpu;
+ int sum = 0;
+
+ for_each_cpu(cpu, &hmpd->cpus)
+ sum += cpu_rq(cpu)->nr_running;
+
+ return sum;
+}
+
+static bool need_big_boost(struct sched_entity *se)
+{
+ int cpu = raw_smp_processor_id();
+ int big_nr_running = hmp_domain_sum_nr_running(hmp_faster_domain(0));
+ u64 now;
+
+ if (hmp_cpu_is_slowest(cpu))
+ return false;
+
+ /* Too many tasks or low load */
+ if (big_nr_running > hpgov_default_level()
+ || se->avg.hmp_load_avg < big_boost_threshold) {
+ monitor_task = NULL;
+ monitor_start_time = 0;
+ trace_sched_hp_event(cpu, se->avg.hmp_load_avg, big_nr_running,
+ "Too many tasks or low load");
+ return false;
+ }
+
+ now = sched_clock_cpu(cpu);
+
+ /* New heavy task, start monitoring */
+ if (!monitor_task || monitor_task != task_of(se)) {
+ monitor_task = task_of(se);
+ monitor_start_time = now;
+ trace_sched_hp_event(cpu, se->avg.hmp_load_avg, big_nr_running,
+ "New heavy task");
+ return false;
+ }
+
+ /* The task has not been running heavily for long enough yet */
+ if (now - monitor_start_time < big_heavy_time * NSEC_PER_MSEC) {
+ trace_sched_hp_event(cpu, se->avg.hmp_load_avg,
+ (now - monitor_start_time) / NSEC_PER_MSEC,
+ "Not enough time");
+ return false;
+ }
+
+ /* Switch to boost mode, re-initializing */
+ monitor_task = NULL;
+ monitor_start_time = 0;
+ trace_sched_hp_event(cpu, se->avg.hmp_load_avg, big_nr_running,
+ "Switch to boost mode");
+
+ return true;
+}
+
+static bool need_big_normal(struct sched_entity *se)
+{
+ int big_nr_running;
+ int cpu = raw_smp_processor_id();
+
+ /* Do not consider big task */
+ if (!hmp_cpu_is_slowest(cpu))
+ return false;
+
+ big_nr_running = hmp_domain_sum_nr_running(hmp_faster_domain(0));
+
+ if (se->avg.hmp_load_avg > hmp_up_threshold && big_nr_running > 0) {
+ trace_sched_hp_event(cpu, se->avg.hmp_load_avg, big_nr_running,
+ "Switch to normal mode");
+ return true;
+ }
+
+ return false;
+}
+
+void hp_event_update(struct sched_entity *se)
+{
+ unsigned long flags;
+
+ if (!raw_spin_trylock_irqsave(&hp_event_lock, flags))
+ return;
+
+ switch (hmp_big_status) {
+ case BIG_NORMAL:
+ if (need_big_boost(se)) {
+ inc_boost_req_count();
+ hmp_big_status = BIG_BOOST;
+ }
+ break;
+ case BIG_BOOST:
+ if (need_big_normal(se)) {
+ dec_boost_req_count(true);
+ hmp_big_status = BIG_NORMAL;
+ }
+ break;
+ }
+
+ raw_spin_unlock_irqrestore(&hp_event_lock, flags);
+}
+#endif
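Condensed model of the monitoring done by need_big_boost() above: the big cluster is only flipped into boost mode after the same task has stayed above big_boost_threshold (500 by default here) for big_heavy_time (100 ms) in a row; a load dip or a change of the monitored task restarts the clock. The nr_running and hotplug-governor checks are omitted in this sketch.

#include <stdio.h>
#include <stdint.h>

#define BOOST_THRESHOLD  500U	/* default big_boost_threshold above */
#define HEAVY_TIME_MS    100ULL	/* default big_heavy_time above */

static int monitored_task = -1;
static uint64_t monitor_start_ms;

static int heavy_enough(int task, unsigned int load, uint64_t now_ms)
{
	if (load < BOOST_THRESHOLD) {		/* low load: reset monitoring */
		monitored_task = -1;
		return 0;
	}
	if (monitored_task != task) {		/* new heavy task: start the timer */
		monitored_task = task;
		monitor_start_ms = now_ms;
		return 0;
	}
	return now_ms - monitor_start_ms >= HEAVY_TIME_MS;
}

int main(void)
{
	printf("%d\n", heavy_enough(42, 700,   0));	/* 0: just started     */
	printf("%d\n", heavy_enough(42, 700,  50));	/* 0: not long enough  */
	printf("%d\n", heavy_enough(42, 700, 120));	/* 1: boost now        */
	return 0;
}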
+
+#ifdef CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING
+static void hmp_tbsoftlanding_irq_work(struct irq_work *irq_work)
+{
+ queue_work(hmp_tbsoftlanding.workqueue, &hmp_tbsoftlanding.work);
+}
+
+static void hmp_tbsoftlanding_work(struct work_struct *work)
+{
+ int lv = hmp_tbsoftlanding.last_lv;
+
+ hmp_tbsoftlanding.last_lv = TBSL_LV_LOW;
+
+ pm_qos_update_request_timeout(&hmp_tbsoftlanding.data[lv].pm_qos,
+ hmp_tbsoftlanding.data[lv].freq,
+ hmp_tbsoftlanding.timeout * USEC_PER_MSEC);
+}
+
+static void hmp_do_tbsoftlanding(int cpu, unsigned long load)
+{
+ int lv;
+
+ /* find proper compensation level */
+ for (lv = 0; lv < TBSL_LV_END; lv++)
+ if (load >= hmp_tbsoftlanding.data[lv].threshold)
+ break;
+
+ if (lv < hmp_tbsoftlanding.last_lv)
+ hmp_tbsoftlanding.last_lv = lv;
+
+ irq_work_queue_on(&hmp_tbsoftlanding.irq_work, cpu);
+}
+
+static void hmp_tbsoftlanding_update_thr(void)
+{
+ int temp, lv;
+
+ temp = (hmp_down_threshold - hmp_tbsoftlanding.threshold) / TBSL_LV_END;
+ for (lv = 0; lv < TBSL_LV_END; lv++)
+ /* compensation range is divided fairly by compensation level */
+ hmp_tbsoftlanding.data[lv].threshold =
+ hmp_tbsoftlanding.threshold + (temp * (TBSL_LV_END - lv - 1));
+}
+
+static int __init hmp_tbsoftlanding_init(void)
+{
+ struct device_node *hmp_param_node;
+ int lv;
+
+ hmp_tbsoftlanding.enabled = 0;
+
+ hmp_param_node = of_find_node_by_path("/cpus/hmp");
+ if (!hmp_param_node) {
+ pr_warn("%s hmp node is not exist!\n",__func__);
+ return -ENOENT;
+ }
+
+ if (of_property_read_u32(hmp_param_node, "down_compensation_timeout",
+ &hmp_tbsoftlanding.timeout))
+ pr_warn("%s missing hmp tbsoftlanding_timeout property\n",__func__);
+
+ if (of_property_read_u32(hmp_param_node, "down_compensation_high_freq",
+ &hmp_tbsoftlanding.data[TBSL_LV_HIGH].freq))
+ pr_warn("%s missing hmp tbsoftlanding_high_freq property\n",__func__);
+
+ if (of_property_read_u32(hmp_param_node, "down_compensation_mid_freq",
+ &hmp_tbsoftlanding.data[TBSL_LV_MID].freq))
+ pr_warn("%s missing hmp tbsoftlanding_mid_freq property\n",__func__);
+
+ if (of_property_read_u32(hmp_param_node, "down_compensation_low_freq",
+ &hmp_tbsoftlanding.data[TBSL_LV_LOW].freq))
+ pr_warn("%s missing hmp tbsoftlanding_low_freq property\n",__func__);
+
+ /*
+ * Calculate the thresholds. The base threshold is half of the down
+ * threshold because load decays by about 1/2 over 32ms.
+ */
+ hmp_tbsoftlanding.threshold = hmp_down_threshold / 2;
+ hmp_tbsoftlanding_update_thr();
+
+ for (lv = 0; lv < TBSL_LV_END; lv++)
+ pm_qos_add_request(&hmp_tbsoftlanding.data[lv].pm_qos,
+ PM_QOS_CLUSTER0_FREQ_MIN, 0);
+
+ init_irq_work(&hmp_tbsoftlanding.irq_work, hmp_tbsoftlanding_irq_work);
+ INIT_WORK(&hmp_tbsoftlanding.work, hmp_tbsoftlanding_work);
+ hmp_tbsoftlanding.workqueue = alloc_workqueue("%s", WQ_HIGHPRI | WQ_UNBOUND |\
+ WQ_MEM_RECLAIM | WQ_FREEZABLE,
+ 1, "hmp_down_compensation_workq");
+
+ /* print hmp down-migration compensation information */
+ pr_info("HMP: down-migration compensation initialized \n");
+ pr_info("HMP: tbsoftlanding: base threshold:%d, timeout: %d \n",
+ hmp_tbsoftlanding.threshold, hmp_tbsoftlanding.timeout);
+ for (lv = 0; lv < TBSL_LV_END; lv++)
+ pr_info("HMP: tbsoftlanding: lv:%d, freq: %d, threshold: %d \n",
+ lv, hmp_tbsoftlanding.data[lv].freq,
+ hmp_tbsoftlanding.data[lv].threshold);
+
+ hmp_tbsoftlanding.last_lv = TBSL_LV_LOW;
+ hmp_tbsoftlanding.enabled = 1;
+
+ return 0;
+}
+late_initcall(hmp_tbsoftlanding_init);
+#endif /* CONFIG_SCHED_HMP_TASK_BASED_SOFTLANDING */
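Worked example of the compensation levels set up above: hmp_tbsoftlanding_init() picks half of hmp_down_threshold as the base, hmp_tbsoftlanding_update_thr() splits the remaining range evenly across the levels, and hmp_do_tbsoftlanding() then arms the pm_qos minimum-frequency request of the first level whose threshold the load reaches. The numbers below assume three levels (TBSL_LV_END == 3) and a hypothetical hmp_down_threshold of 512.

#include <stdio.h>

#define TBSL_LEVELS 3	/* assumed value of TBSL_LV_END */

int main(void)
{
	unsigned int down_threshold = 512;	/* hypothetical hmp_down_threshold */
	unsigned int base = down_threshold / 2;
	unsigned int step = (down_threshold - base) / TBSL_LEVELS;
	unsigned int thr[TBSL_LEVELS];
	int lv;

	/* lv0 (HIGH) = 426, lv1 (MID) = 341, lv2 (LOW) = 256 */
	for (lv = 0; lv < TBSL_LEVELS; lv++) {
		thr[lv] = base + step * (TBSL_LEVELS - lv - 1);
		printf("lv%d threshold = %u\n", lv, thr[lv]);
	}
	return 0;
}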
+
+
diff --git a/kernel/sched/features.h b/kernel/sched/features.h
old mode 100644
new mode 100755
index 69631fa..55e4610
--- a/kernel/sched/features.h
+++ b/kernel/sched/features.h
@@ -69,3 +69,12 @@
SCHED_FEAT(LB_MIN, false)
SCHED_FEAT(ATTACH_AGE_LOAD, true)
+/*
+ * Energy aware scheduling. Use platform energy model to guide scheduling
+ * decisions optimizing for energy efficiency.
+ */
+#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE
+SCHED_FEAT(ENERGY_AWARE, true)
+#else
+SCHED_FEAT(ENERGY_AWARE, false)
+#endif
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
old mode 100644
new mode 100755
index bfd5731..306a859
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -219,6 +219,7 @@
*/
__current_set_polling();
+ quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c
old mode 100644
new mode 100755
diff --git a/kernel/sched/loadavg.c b/kernel/sched/loadavg.c
old mode 100644
new mode 100755
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
old mode 100644
new mode 100755
index 801b4ec..bf49e97
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -7,6 +7,7 @@
#include <linux/slab.h>
#include <linux/irq_work.h>
+#include <trace/events/sched.h>
int sched_rr_timeslice = RR_TIMESLICE;
@@ -14,6 +15,9 @@
struct rt_bandwidth def_rt_bandwidth;
+unsigned int sched_switch_to_rt_load_ratio;
+unsigned int sched_switch_to_fair_load_ratio;
+
static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
{
struct rt_bandwidth *rt_b =
@@ -64,6 +68,10 @@
raw_spin_unlock(&rt_b->rt_runtime_lock);
}
+#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI)
+static void push_irq_work_func(struct irq_work *work);
+#endif
+
void init_rt_rq(struct rt_rq *rt_rq)
{
struct rt_prio_array *array;
@@ -83,6 +91,15 @@
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
+ atomic_long_set(&rt_rq->removed_util_avg, 0);
+ atomic_long_set(&rt_rq->removed_load_avg, 0);
+
+#ifdef HAVE_RT_PUSH_IPI
+ rt_rq->push_flags = 0;
+ rt_rq->push_cpu = nr_cpu_ids;
+ raw_spin_lock_init(&rt_rq->push_lock);
+ init_irq_work(&rt_rq->push_work, push_irq_work_func);
+#endif
#endif /* CONFIG_SMP */
/* We start is dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
@@ -99,16 +116,6 @@
hrtimer_cancel(&rt_b->rt_period_timer);
}
-#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
-
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-#ifdef CONFIG_SCHED_DEBUG
- WARN_ON_ONCE(!rt_entity_is_task(rt_se));
-#endif
- return container_of(rt_se, struct task_struct, rt);
-}
-
static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
{
return rt_rq->rq;
@@ -249,6 +256,274 @@
#ifdef CONFIG_SMP
+/*
+ * Signed add and clamp on underflow.
+ *
+ * Explicitly do a load-store to ensure the intermediate value never hits
+ * memory. This allows lockless observations without ever seeing the negative
+ * values.
+ */
+#define add_positive(_ptr, _val) do { \
+ typeof(_ptr) ptr = (_ptr); \
+ typeof(_val) val = (_val); \
+ typeof(*ptr) res, var = READ_ONCE(*ptr); \
+ \
+ res = var + val; \
+ \
+ if (val < 0 && res > var) \
+ res = 0; \
+ \
+ WRITE_ONCE(*ptr, res); \
+} while (0)
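A minimal usage sketch of add_positive(): subtracting more than the current value clamps the result at zero instead of letting the unsigned arithmetic wrap, so a concurrent lockless reader never observes a huge bogus average. READ_ONCE/WRITE_ONCE are simplified volatile stand-ins here so the snippet builds outside the kernel (the gcc typeof extension is assumed).

#include <stdio.h>

/* Simplified stand-ins so the macro compiles standalone. */
#define READ_ONCE(x)		(*(volatile typeof(x) *)&(x))
#define WRITE_ONCE(x, val)	(*(volatile typeof(x) *)&(x) = (val))

#define add_positive(_ptr, _val) do {			\
	typeof(_ptr) ptr = (_ptr);			\
	typeof(_val) val = (_val);			\
	typeof(*ptr) res, var = READ_ONCE(*ptr);	\
							\
	res = var + val;				\
							\
	if (val < 0 && res > var)			\
		res = 0;				\
							\
	WRITE_ONCE(*ptr, res);				\
} while (0)

int main(void)
{
	unsigned long util_avg = 100;

	add_positive(&util_avg, -40L);	/* normal signed add: 60 */
	printf("after -40:  %lu\n", util_avg);
	add_positive(&util_avg, -300L);	/* would underflow: clamped to 0 */
	printf("after -300: %lu\n", util_avg);
	return 0;
}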
+
+#define entity_is_task(se) (!se->my_q)
+#define LOAD_AVG_MAX 47742
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+u64 decay_load(u64 val, u64 n);
+u32 __compute_runnable_contrib(u64 n);
+
+/*
+ * We can represent the historical contribution to runnable average as the
+ * coefficients of a geometric series. To do this we sub-divide our runnable
+ * history into segments of approximately 1ms (1024us); label the segment that
+ * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
+ *
+ * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
+ * p0 p1 p2
+ * (now) (~1ms ago) (~2ms ago)
+ *
+ * Let u_i denote the fraction of p_i that the entity was runnable.
+ *
+ * We then designate the fractions u_i as our co-efficients, yielding the
+ * following representation of historical load:
+ * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
+ *
+ * We choose y based on the width of a reasonable scheduling period, fixing:
+ * y^32 = 0.5
+ *
+ * This means that the contribution to load ~32ms ago (u_32) will be weighted
+ * approximately half as much as the contribution to load within the last ms
+ * (u_0).
+ *
+ * When a period "rolls over" and we have new u_0`, multiplying the previous
+ * sum again by y is sufficient to update:
+ * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
+ * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
+ */
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct rt_rq *rt_rq)
+{
+ u64 delta, scaled_delta, periods;
+ u32 contrib;
+ unsigned int delta_w, scaled_delta_w, decayed = 0;
+ unsigned long scale_freq, scale_cpu;
+
+ delta = now - sa->last_update_time;
+ /*
+ * This should only happen when time goes backwards, which it
+ * unfortunately does during sched clock init when we swap over to TSC.
+ */
+ if ((s64)delta < 0) {
+ sa->last_update_time = now;
+ return 0;
+ }
+
+ /*
+ * Use 1024ns as the unit of measurement since it's a reasonable
+ * approximation of 1us and fast to compute.
+ */
+ delta >>= 10;
+ if (!delta)
+ return 0;
+ sa->last_update_time = now;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ /* delta_w is the amount already accumulated against our next period */
+ delta_w = sa->period_contrib;
+ if (delta + delta_w >= 1024) {
+ decayed = 1;
+
+ /* how much left for next period will start over, we don't know yet */
+ sa->period_contrib = 0;
+
+ /*
+ * Now that we know we're crossing a period boundary, figure
+ * out how much from delta we need to complete the current
+ * period and accrue it.
+ */
+ delta_w = 1024 - delta_w;
+ scaled_delta_w = cap_scale(delta_w, scale_freq);
+ if (weight)
+ sa->load_sum += weight * scaled_delta_w;
+ if (running)
+ sa->util_sum += scaled_delta_w * scale_cpu;
+
+ delta -= delta_w;
+
+ /* Figure out how many additional periods this update spans */
+ periods = delta / 1024;
+ delta %= 1024;
+
+ sa->load_sum = decay_load(sa->load_sum, periods + 1);
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods + 1);
+
+ /* Efficiently calculate \sum (1..n_period) 1024*y^i */
+ contrib = __compute_runnable_contrib(periods);
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight)
+ sa->load_sum += weight * contrib;
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+ }
+
+ /* Remainder of delta accrued against u_0` */
+ scaled_delta = cap_scale(delta, scale_freq);
+ if (weight)
+ sa->load_sum += weight * scaled_delta;
+ if (running)
+ sa->util_sum += scaled_delta * scale_cpu;
+
+ sa->period_contrib += delta;
+
+ if (decayed) {
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX);
+ sa->util_avg = sa->util_sum / LOAD_AVG_MAX;
+ }
+
+ return decayed;
+}
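A toy model of the decay curve described in the comment block above: y is chosen so that y^32 == 0.5, i.e. load contributed ~32 periods (~32ms) ago counts half as much as load in the current period. The kernel computes this in fixed point via the precomputed tables behind decay_load() and __compute_runnable_contrib(); the float version below only illustrates the shape of the curve.

#include <stdio.h>
#include <math.h>

int main(void)	/* build with -lm */
{
	double y = pow(0.5, 1.0 / 32.0);	/* y^32 == 0.5 */
	unsigned int periods;

	for (periods = 0; periods <= 128; periods += 32)
		printf("after %3u periods: %5.1f%% of the original contribution\n",
		       periods, 100.0 * pow(y, periods));
	return 0;
}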
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_rt_util(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *grt_rq = rt_se->my_q;
+ long delta = grt_rq->avg.util_avg - rt_se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_rt_entity's utilization */
+ rt_se->avg.util_avg = grt_rq->avg.util_avg;
+ rt_se->avg.util_sum = rt_se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent rt_rq utilization */
+ add_positive(&rt_rq->avg.util_avg, delta);
+ rt_rq->avg.util_sum = rt_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+static inline void
+update_tg_rt_load(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *grt_rq = rt_se->my_q;
+ long delta = grt_rq->avg.load_avg - rt_se->avg.load_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_rt_entity's load */
+ rt_se->avg.load_avg = grt_rq->avg.load_avg;
+ rt_se->avg.load_sum = rt_se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent rt_rq load */
+ add_positive(&rt_rq->avg.load_avg, delta);
+ rt_rq->avg.load_sum = rt_rq->avg.load_avg * LOAD_AVG_MAX;
+}
+
+static inline int test_and_clear_tg_rt_propagate(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *grt_rq = rt_se->my_q;
+
+ if (!grt_rq->propagate_avg)
+ return 0;
+
+ grt_rq->propagate_avg = 0;
+
+ return 1;
+}
+
+static inline void propagate_rt_entity_load_avg(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+ if (entity_is_task(rt_se))
+ return;
+ if (!test_and_clear_tg_rt_propagate(rt_se))
+ return;
+
+ rt_rq->propagate_avg = 1;
+
+ update_tg_rt_util(rt_rq, rt_se);
+ update_tg_rt_load(rt_rq, rt_se);
+}
+#else /* CONFIG_RT_GROUP_SCHED */
+static inline void propagate_rt_entity_load_avg(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+#endif
+
+int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, bool update_freq)
+{
+ int decayed, removed_util = 0;
+ struct sched_avg *sa = &rt_rq->avg;
+ struct rq *rq = rt_rq->rq;
+
+ if (atomic_long_read(&rt_rq->removed_util_avg)) {
+ long r = atomic_long_xchg(&rt_rq->removed_util_avg, 0);
+ sa->util_avg = max_t(long, sa->util_avg - r, 0);
+ sa->util_sum = max_t(s32, sa->util_sum - r * LOAD_AVG_MAX, 0);
+ removed_util = 1;
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* Set propagate_avg for task group load propagate */
+ rt_rq->propagate_avg = 1;
+#endif
+ }
+
+ if (atomic_long_read(&rt_rq->removed_load_avg)) {
+ long r = atomic_long_xchg(&rt_rq->removed_load_avg, 0);
+ sa->load_avg = max_t(long, sa->load_avg - r, 0);
+ sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* Set propagate_avg for task group load propagate */
+ rt_rq->propagate_avg = 1;
+#endif
+ }
+
+ decayed = __update_load_avg(now, cpu, sa, scale_load_down(NICE_0_LOAD),
+ rt_rq->curr != NULL, NULL);
+
+#ifndef CONFIG_64BIT
+ smp_wmb();
+ rt_rq->load_last_update_time_copy = sa->last_update_time;
+#endif
+
+ if (rt_rq == &rq->rt)
+ trace_sched_rt_load_avg_cpu(cpu_of(rq), rt_rq);
+
+ return decayed;
+}
+
+void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se, struct rt_rq *rt_rq, int cpu)
+{
+ /*
+ * Track task load average for carrying it to new CPU after migrated.
+ */
+ if (rt_se->avg.last_update_time)
+ __update_load_avg(now, cpu, &rt_se->avg, scale_load_down(NICE_0_LOAD),
+ rt_rq->curr == rt_se, NULL);
+
+ update_rt_rq_load_avg(now, cpu, rt_rq, true);
+ propagate_rt_entity_load_avg(rt_se, rt_rq);
+
+ if (entity_is_task(rt_se))
+ trace_sched_rt_load_avg_task(rt_task_of(rt_se), &rt_se->avg);
+
+}
+
static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
@@ -387,6 +662,19 @@
}
#else
+static inline void rt_rq_util_change(struct rt_rq *rt_rq)
+{
+}
+
+static inline
+int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, bool update_freq)
+{
+ return 0;
+}
+static inline
+void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se, struct rt_rq *rt_rq, int cpu)
+{
+}
static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
{
@@ -936,6 +1224,10 @@
if (curr->sched_class != &rt_sched_class)
return;
+ /* Kick cpufreq (see the comment in linux/cpufreq.h). */
+ if (cpu_of(rq) == smp_processor_id())
+ cpufreq_trigger_update(rq_clock(rq));
+
delta_exec = rq_clock_task(rq) - curr->se.exec_start;
if (unlikely((s64)delta_exec <= 0))
return;
@@ -1157,12 +1449,67 @@
dec_rt_group(rt_se, rt_rq);
}
+#ifdef CONFIG_SMP
+/**
+ * attach_rt_entity_load_avg - attach this entity to its rt_rq load avg
+ * @rt_rq: rt_rq to attach to
+ * @rt_se: sched_rt_entity to attach
+ *
+ * Must call update_rt_rq_load_avg() before this, since we rely on
+ * rt_rq->avg.last_update_time being current.
+ *
+ * load_{avg,sum} are not used by RT
+ */
+static void
+attach_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ rt_se->avg.last_update_time = rt_rq->avg.last_update_time;
+ rt_rq->avg.util_avg += rt_se->avg.util_avg;
+ rt_rq->avg.util_sum += rt_se->avg.util_sum;
+ rt_rq->avg.load_avg += rt_se->avg.load_avg;
+ rt_rq->avg.load_sum += rt_se->avg.load_sum;
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* Set propagate_avg for task group load propagate */
+ rt_rq->propagate_avg = 1;
+#endif
+
+}
+
+/*
+ * detach_entity_load_avg - detach this entity from its rt_rq load avg
+ * @rt_rq: rt_rq to detach from
+ * @rt_se: sched_rt_entity to detach
+ *
+ * Must call update_rt_rq_load_avg() before this, since we rely on
+ * rt_rq->avg.last_update_time being current.
+ *
+ * load_{avg,sum} are not used by RT
+ */
+static void detach_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ rt_rq->avg.util_avg = max_t(long, rt_rq->avg.util_avg - rt_se->avg.util_avg, 0);
+ rt_rq->avg.util_sum = max_t(s32, rt_rq->avg.util_sum - rt_se->avg.util_sum, 0);
+ rt_rq->avg.load_avg = max_t(long, rt_rq->avg.load_avg - rt_se->avg.load_avg, 0);
+ rt_rq->avg.load_sum = max_t(long, rt_rq->avg.load_sum - rt_se->avg.load_sum, 0);
+#ifdef CONFIG_RT_GROUP_SCHED
+ /* Set propagate_avg for task group load propagate */
+ rt_rq->propagate_avg = 1;
+#endif
+}
+#else
+static inline void
+attach_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
+static inline void
+detach_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
+#endif
+
static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
struct rt_rq *group_rq = group_rt_rq(rt_se);
struct list_head *queue = array->queue + rt_se_prio(rt_se);
+ u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
/*
* Don't enqueue the group if its throttled, or when empty.
@@ -1179,6 +1526,11 @@
list_add_tail(&rt_se->run_list, queue);
__set_bit(rt_se_prio(rt_se), array->bitmap);
+ update_rt_load_avg(now, rt_se, rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
+ if (rt_entity_is_task(rt_se) && !rt_se->avg.last_update_time)
+ attach_entity_load_avg(&rq_of_rt_se(rt_se)->rt, rt_se);
+
inc_rt_tasks(rt_se, rt_rq);
}
@@ -1186,11 +1538,14 @@
{
struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
struct rt_prio_array *array = &rt_rq->active;
+ u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
list_del_init(&rt_se->run_list);
if (list_empty(array->queue + rt_se_prio(rt_se)))
__clear_bit(rt_se_prio(rt_se), array->bitmap);
+ update_rt_load_avg(now, rt_se, rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
dec_rt_tasks(rt_se, rt_rq);
}
@@ -1251,6 +1606,8 @@
if (flags & ENQUEUE_WAKEUP)
rt_se->timeout = 0;
+ update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 0);
+
enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
@@ -1304,6 +1661,40 @@
#ifdef CONFIG_SMP
static int find_lowest_rq(struct task_struct *task);
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+static int
+select_task_rq_rt_fluid(struct task_struct *p, int cpu, int sd_flag, int flags)
+{
+ int target;
+
+ rcu_read_lock();
+ target = find_lowest_rq(p);
+ if (target != -1)
+ cpu = target;
+ rcu_read_unlock();
+
+ return cpu;
+}
+
+static inline void set_victim_flag(struct task_struct *p)
+{
+ p->victim_flag = 1;
+}
+
+static inline void clear_victim_flag(struct task_struct *p)
+{
+ p->victim_flag = 0;
+}
+
+static inline bool test_victim_flag(struct task_struct *p)
+{
+ if (p->victim_flag)
+ return true;
+ else
+ return false;
+}
+
+#else
static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
{
@@ -1360,6 +1751,179 @@
return cpu;
}
+static inline bool test_victim_flag(struct task_struct *p)
+{
+ return false;
+}
+static inline void clear_victim_flag(struct task_struct *p)
+{
+}
+#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_rt(struct sched_rt_entity *rt_se,
+ struct rt_rq *prev, struct rt_rq *next)
+{
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+ /*
+ * We are supposed to update the task to "current" time, so it is up to
+ * date and ready to go to the new CPU/rt_rq. But we have difficulty in
+ * getting what the current time is, so simply throw away the out-of-date
+ * time. This will result in the wakee task being less decayed, but
+ * giving the wakee more load sounds acceptable.
+ */
+ if (rt_se->avg.last_update_time && prev) {
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+#ifndef CONFIG_64BIT
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of_rt_rq(prev)),
+ &rt_se->avg, 0, 0, NULL);
+
+ rt_se->avg.last_update_time = n_last_update_time;
+ }
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifndef CONFIG_64BIT
+static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
+{
+ u64 last_update_time_copy;
+ u64 last_update_time;
+
+ do {
+ last_update_time_copy = rt_rq->load_last_update_time_copy;
+ smp_rmb();
+ last_update_time = rt_rq->avg.last_update_time;
+ } while (last_update_time != last_update_time_copy);
+
+ return last_update_time;
+}
+#else
+static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
+{
+ return rt_rq->avg.last_update_time;
+}
+#endif
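A note on the !CONFIG_64BIT retry loop above: it pairs with the writer side added in update_rt_rq_load_avg() earlier in this patch (a 64-bit timestamp cannot be read atomically on 32-bit), mirroring the scheme the fair class already uses. A sketch of the pairing, for reference only:

/*
 * Writer side, see update_rt_rq_load_avg():
 *	__update_load_avg(...);			(updates sa->last_update_time)
 *	smp_wmb();
 *	rt_rq->load_last_update_time_copy = sa->last_update_time;
 *
 * Reader side, rt_rq_last_update_time() above, re-reads the copy and the
 * real value until they agree, so it never returns a torn 64-bit timestamp.
 */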
+
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+static void sync_entity_load_avg(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ u64 last_update_time;
+
+ last_update_time = rt_rq_last_update_time(rt_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of_rt_rq(rt_rq)),
+ &rt_se->avg, 0, 0, NULL);
+}
+
+/*
+ * Task first catches up with rt_rq, and then subtract
+ * itself from the rt_rq (task must be off the queue now).
+ */
+static void remove_entity_load_avg(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+ /*
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * rt_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_sched_fair_group()
+ * calls this.
+ */
+
+ sync_entity_load_avg(rt_se);
+ atomic_long_add(rt_se->avg.load_avg, &rt_rq->removed_load_avg);
+ atomic_long_add(rt_se->avg.util_avg, &rt_rq->removed_util_avg);
+}
+
+static void attach_task_rt_rq(struct task_struct *p)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
+
+ update_rt_load_avg(now, rt_se, rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
+ attach_entity_load_avg(rt_rq, rt_se);
+}
+
+static void detach_task_rt_rq(struct task_struct *p)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
+
+ update_rt_load_avg(now, rt_se, rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+ detach_entity_load_avg(rt_rq, rt_se);
+}
+
+static void migrate_task_rq_rt(struct task_struct *p, int next_cpu)
+{
+ /*
+ * As for fair, we are supposed to update the task to "current" time, so
+ * it is up to date and ready to go to the new CPU/rt_rq. But we have
+ * difficulty in getting what the current time is, so simply throw away
+ * the out-of-date time. This will result in the wakee task being less
+ * decayed, but giving the wakee more load sounds acceptable.
+ */
+ remove_entity_load_avg(&p->rt);
+
+ /* Tell new CPU we are migrated */
+ p->rt.avg.last_update_time = 0;
+
+ /* We have migrated, no longer consider this task hot */
+ p->se.exec_start = 0;
+}
+
+static void task_dead_rt(struct task_struct *p)
+{
+ remove_entity_load_avg(&p->rt);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void task_move_group_rt(struct task_struct *p)
+{
+ detach_task_rt_rq(p);
+ set_task_rq(p, task_cpu(p));
+
+#ifdef CONFIG_SMP
+ /* Tell se's rt_rq has been changed -- migrated */
+ p->rt.avg.last_update_time = 0;
+#endif
+ attach_task_rt_rq(p);
+}
+#endif
+
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
/*
@@ -1397,6 +1961,10 @@
if (p->prio < rq->curr->prio) {
resched_curr(rq);
return;
+ } else if (test_victim_flag(p)) {
+ requeue_task_rt(rq, p, 1);
+ resched_curr(rq);
+ return;
}
#ifdef CONFIG_SMP
@@ -1434,19 +2002,52 @@
return next;
}
+#ifdef CONFIG_SMP
+void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se)
+{
+ struct sched_avg *sa = &rt_se->avg;
+
+ sa->last_update_time = 0;
+ /*
+ * sched_avg's period_contrib should be strictly less than 1024, so
+ * we give it 1023 to make sure it is almost a period (1024us), and
+ * will definitely be updated (after enqueue).
+ */
+ sa->period_contrib = 1023;
+ /*
+ * Tasks are initialized with zero load.
+ * Load is not actually used by RT.
+ */
+ sa->load_avg = 0;
+ sa->load_sum = 0;
+ /*
+ * At this point, util_avg won't be used in select_task_rq_rt anyway
+ */
+ sa->util_avg = 0;
+ sa->util_sum = 0;
+ /* when this task enqueue'ed, it will contribute to its rt_rq's load_avg */
+}
+#else /* !CONFIG_SMP */
+void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se) { }
+#endif
+
static struct task_struct *_pick_next_task_rt(struct rq *rq)
{
struct sched_rt_entity *rt_se;
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
+ u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
do {
rt_se = pick_next_rt_entity(rq, rt_rq);
BUG_ON(!rt_se);
+ update_rt_load_avg(now, rt_se, rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+ rt_rq->curr = rt_se;
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
p = rt_task_of(rt_se);
+
p->se.exec_start = rq_clock_task(rq);
return p;
@@ -1497,11 +2098,20 @@
queue_push_tasks(rq);
+ update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), rt_rq,
+ rq->curr->sched_class == &rt_sched_class);
+
+ clear_victim_flag(p);
+
return p;
}
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
+
update_curr_rt(rq);
/*
@@ -1510,6 +2120,14 @@
*/
if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ if (on_rt_rq(rt_se))
+ update_rt_load_avg(now, rt_se, rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+
+ rt_rq->curr = NULL;
+ }
}
#ifdef CONFIG_SMP
@@ -1547,6 +2165,318 @@
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+unsigned int frt_boost_threshold;
+unsigned int sysctl_sched_restrict_cluster_spill = 0;
+
+static inline int weight_from_rtprio(int prio)
+{
+ int idx = (prio >> 1);
+
+ if ((idx << 1) == prio)
+ return rtprio_to_weight[idx];
+ else
+ return ((rtprio_to_weight[idx] + rtprio_to_weight[idx+1]) >> 1);
+}
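For reference, a worked example of the interpolation above, using two entries from the rtprio_to_weight[] table this patch adds to kernel/sched/sched.h (illustrative only, not part of the patch):

/*
 * prio 50: idx = 25 and (25 << 1) == 50
 *	-> rtprio_to_weight[25] = 1236401
 * prio 51: idx = 25 and (25 << 1) != 51
 *	-> (rtprio_to_weight[25] + rtprio_to_weight[26]) >> 1
 *	 = (1236401 + 1112761) >> 1 = 1174581
 */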
+
+/* Check if cpu is in fastest hmp_domain */
+static inline unsigned int hmp_cpu_is_fastest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return pos == hmp_domains.next;
+}
+
+/* Check if cpu is in slowest hmp_domain */
+static inline unsigned int hmp_cpu_is_slowest(int cpu)
+{
+ struct list_head *pos;
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_is_last(pos, &hmp_domains);
+}
+
+
+/* Next (slower) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_slower_domain(int cpu)
+{
+ struct list_head *pos;
+
+ if (hmp_cpu_is_slowest(cpu))
+ return hmp_cpu_domain(cpu);
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+ return list_entry(pos->next, struct hmp_domain, hmp_domains);
+}
+
+/* Previous (faster) hmp_domain relative to cpu */
+static inline struct hmp_domain *hmp_faster_domain(int cpu)
+{
+ struct list_head *pos;
+
+ if (hmp_cpu_is_fastest(cpu))
+ return hmp_cpu_domain(cpu);
+
+ pos = &hmp_cpu_domain(cpu)->hmp_domains;
+
+ return list_entry(pos->prev, struct hmp_domain, hmp_domains);
+}
+
+static int find_victim_rt_rq(struct task_struct *task, struct cpumask *domain_cpu_mask, int *best_cpu)
+{
+ int i;
+ unsigned long victim_rtweight, target_rtweight, min_rtweight;
+ unsigned int victim_cpu_cap, min_cpu_cap = arch_scale_cpu_capacity(NULL, task_cpu(task));
+ bool victim_rt = true;
+
+ target_rtweight = task->rt.avg.util_avg * weight_from_rtprio(task->prio);
+ min_rtweight = target_rtweight;
+
+ for_each_cpu(i, domain_cpu_mask) {
+ struct task_struct *victim = cpu_rq(i)->curr;
+
+ if (victim->nr_cpus_allowed < 2)
+ continue;
+
+ if (!cpumask_test_cpu(i, tsk_cpus_allowed(task)))
+ continue;
+
+ if (rt_task(victim)) {
+ victim_cpu_cap = arch_scale_cpu_capacity(NULL, i);
+ victim_rtweight = victim->rt.avg.util_avg * weight_from_rtprio(victim->prio);
+
+ if (min_cpu_cap == victim_cpu_cap) {
+ if (victim_rtweight < min_rtweight) {
+ min_rtweight = victim_rtweight;
+ *best_cpu = i;
+ min_cpu_cap = victim_cpu_cap;
+ }
+ } else {
+ /*
+ * The comparison must be normalized by each CPU's capacity,
+ * since the same utilization is a lighter relative load on a
+ * bigger CPU. This is how Fluid RT tends to hand the big CPUs
+ * to long-running RT tasks, in accordance with their priority.
+ */
+ if (victim_rtweight * min_cpu_cap < min_rtweight * victim_cpu_cap) {
+ min_rtweight = victim_rtweight;
+ *best_cpu = i;
+ min_cpu_cap = victim_cpu_cap;
+ }
+ }
+ } else {
+ /* If a non-RT CPU exists, select it first */
+ *best_cpu = i;
+ victim_rt = false;
+ trace_sched_fluid_victim_rt_cpu(task, victim, *best_cpu, "Victim Normal");
+ break;
+ }
+ }
+
+ if (*best_cpu >= 0 && victim_rt) {
+ set_victim_flag(cpu_rq(*best_cpu)->curr);
+ trace_sched_fluid_victim_rt_cpu(task, cpu_rq(*best_cpu)->curr, *best_cpu, "Victim RT Task");
+ }
+
+ return *best_cpu;
+}
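The comparison above cross-multiplies instead of dividing, so capacity-relative loads can be compared with integer arithmetic only. A worked example with made-up weights and capacities:

/*
 * min:    rtweight 2000000 on a CPU of capacity 1024 (current best)
 * victim: rtweight 1000000 on a CPU of capacity  448
 *
 *	victim_rtweight * min_cpu_cap = 1000000 * 1024 = 1024000000
 *	min_rtweight * victim_cpu_cap = 2000000 *  448 =  896000000
 *
 * 1024000000 < 896000000 is false: although the victim's raw weight is
 * smaller, relative to its capacity it is the busier CPU, so best_cpu is
 * left unchanged.
 */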
+
+static int find_lowest_rq_fluid(struct task_struct *task)
+{
+ struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
+ struct cpumask candidate_mask = CPU_MASK_NONE;
+ struct hmp_domain *hmpd = NULL;
+ struct cpumask *hmp_cpu_mask;
+ int best_cpu = -1;
+ int prev_cpu = task_cpu(task);
+ int this_cpu = smp_processor_id();
+ int boost = false;
+ u64 cpu_load, min_load = ULLONG_MAX;
+ int i;
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!lowest_mask))
+ return best_cpu;
+
+ if (task->nr_cpus_allowed == 1)
+ return best_cpu; /* No other targets possible */
+
+ if (task->rt.avg.util_avg > frt_boost_threshold)
+ boost = true;
+
+ cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask);
+
+ rcu_read_lock();
+
+ if (sysctl_sched_restrict_cluster_spill) {
+ hmpd = hmp_cpu_domain(task_cpu(task));
+ } else {
+ hmpd = boost ? \
+ hmp_faster_domain(task_cpu(task)) : \
+ hmp_slower_domain(task_cpu(task));
+ }
+
+
+ do {
+ hmp_cpu_mask = &hmpd->cpus;
+ min_load = ULLONG_MAX;
+
+ for_each_cpu_and(i, hmp_cpu_mask, lowest_mask) {
+ struct task_struct * curr_task = cpu_rq(i)->curr;
+ struct sched_domain *sd;
+
+ sd = rcu_dereference_check_sched_domain(cpu_rq(i)->sd);
+
+ if (sd->flags & SD_WAKE_AFFINE) {
+ if(cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
+ rcu_read_unlock();
+ return this_cpu;
+ }
+ }
+
+ /* Non-RT runqueue has priority for placement */
+ if (rt_task(curr_task)) {
+ if (curr_task->nr_cpus_allowed > 1)
+ cpumask_set_cpu(i, &candidate_mask);
+ continue;
+ }
+
+ if (!cpumask_test_cpu(i, tsk_cpus_allowed(task)))
+ continue;
+
+ cpu_load = cpu_util(i);
+
+ if (cpu_load < min_load ||
+ (cpu_load == min_load && i == prev_cpu)) {
+ min_load = cpu_load;
+ best_cpu = i;
+ }
+ }
+
+ /* Does a suitable non-RT CPU exist? */
+ if (best_cpu != -1) {
+ trace_sched_fluid_select_norm_cpu(task, i, cpu_load, min_load, best_cpu);
+ break;
+ }
+
+ /* Does any lower-priority RT CPU exist? */
+ if (!cpumask_empty(&candidate_mask)) {
+ best_cpu = cpumask_any(&candidate_mask);
+ trace_sched_fluid_victim_rt_cpu(task, cpu_rq(best_cpu)->curr, best_cpu, "Victim Candidate RT CPU");
+ break;
+ }
+
+ /* Is any victim CPU available? */
+ if (find_victim_rt_rq(task, hmp_cpu_mask, &best_cpu) != -1)
+ break;
+
+ /*
+ * If cluster restriction or boost is enabled,
+ * we must select a CPU from the specific cluster.
+ */
+ if (sysctl_sched_restrict_cluster_spill) {
+ best_cpu = cpumask_any_and(hmp_cpu_mask,
+ tsk_cpus_allowed(task));
+ trace_sched_fluid_victim_rt_cpu(task, cpu_rq(best_cpu)->curr, best_cpu, "Any CPU on Restrict Cluster");
+ break;
+ }
+
+ if (((hmp_cpu_is_fastest(cpumask_any(hmp_cpu_mask))) && !boost) ||
+ (boost && (hmp_cpu_is_slowest(cpumask_any(hmp_cpu_mask)))))
+ break;
+
+ hmpd = boost ? \
+ hmp_slower_domain(cpumask_any(hmp_cpu_mask)) : \
+ hmp_faster_domain(cpumask_any(hmp_cpu_mask));
+
+ } while (!cpumask_empty(&hmpd->cpus));
+
+ rcu_read_unlock();
+
+ return best_cpu;
+}
+
+static ssize_t show_frt_boost_threshold(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, 10, "%u\n", frt_boost_threshold);
+}
+
+static ssize_t store_frt_boost_threshold(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ int input;
+
+ if (!sscanf(buf, "%d", &input))
+ return -EINVAL;
+
+ input = input < 0 ? 0 : input;
+ input = input > 1024 ? 1024 : input;
+
+ frt_boost_threshold = input;
+
+ return count;
+}
+
+static struct kobj_attribute frt_boost_threshold_attr =
+__ATTR(boost_frt_threshold, 0644, show_frt_boost_threshold,
+ store_frt_boost_threshold);
+#endif
+
+static ssize_t show_switch_rt_load_ratio(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, 10, "%u\n", sched_switch_to_rt_load_ratio);
+}
+
+static ssize_t store_switch_rt_load_ratio(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ int input;
+
+ if (!sscanf(buf, "%d", &input))
+ return -EINVAL;
+
+ input = input < 0 ? 0 : input;
+
+ sched_switch_to_rt_load_ratio = input;
+
+ return count;
+}
+
+static ssize_t show_switch_fair_load_ratio(struct kobject *kobj,
+ struct kobj_attribute *attr, char *buf)
+{
+ return snprintf(buf, 10, "%u\n", sched_switch_to_fair_load_ratio);
+}
+
+static ssize_t store_switch_fair_load_ratio(struct kobject *kobj,
+ struct kobj_attribute *attr, const char *buf,
+ size_t count)
+{
+ int input;
+
+ if (!sscanf(buf, "%d", &input))
+ return -EINVAL;
+
+ input = input < 0 ? 0 : input;
+
+ sched_switch_to_fair_load_ratio = input;
+
+ return count;
+}
+
+static struct kobj_attribute switch_fair_load_ratio_attr =
+__ATTR(switch_fair_load_ratio, 0644, show_switch_fair_load_ratio,
+ store_switch_fair_load_ratio);
+
+static struct kobj_attribute switch_rt_load_ratio_attr =
+__ATTR(switch_rt_load_ratio, 0644, show_switch_rt_load_ratio,
+ store_switch_rt_load_ratio);
+
static int find_lowest_rq(struct task_struct *task)
{
struct sched_domain *sd;
@@ -1554,6 +2484,10 @@
int this_cpu = smp_processor_id();
int cpu = task_cpu(task);
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ return find_lowest_rq_fluid(task);
+#endif
+
/* Make sure the mask is initialized first */
if (unlikely(!lowest_mask))
return -1;
@@ -1658,6 +2592,7 @@
!cpumask_test_cpu(lowest_rq->cpu,
tsk_cpus_allowed(task)) ||
task_running(rq, task) ||
+ !rt_task(task) ||
!task_on_rq_queued(task))) {
double_unlock_balance(rq, lowest_rq);
@@ -1793,172 +2728,160 @@
}
#ifdef HAVE_RT_PUSH_IPI
-
/*
- * When a high priority task schedules out from a CPU and a lower priority
- * task is scheduled in, a check is made to see if there's any RT tasks
- * on other CPUs that are waiting to run because a higher priority RT task
- * is currently running on its CPU. In this case, the CPU with multiple RT
- * tasks queued on it (overloaded) needs to be notified that a CPU has opened
- * up that may be able to run one of its non-running queued RT tasks.
+ * The search for the next cpu always starts at rq->cpu and ends
+ * when we reach rq->cpu again. It will never return rq->cpu.
+ * This returns the next cpu to check, or nr_cpu_ids if the loop
+ * is complete.
*
- * All CPUs with overloaded RT tasks need to be notified as there is currently
- * no way to know which of these CPUs have the highest priority task waiting
- * to run. Instead of trying to take a spinlock on each of these CPUs,
- * which has shown to cause large latency when done on machines with many
- * CPUs, sending an IPI to the CPUs to have them push off the overloaded
- * RT tasks waiting to run.
- *
- * Just sending an IPI to each of the CPUs is also an issue, as on large
- * count CPU machines, this can cause an IPI storm on a CPU, especially
- * if its the only CPU with multiple RT tasks queued, and a large number
- * of CPUs scheduling a lower priority task at the same time.
- *
- * Each root domain has its own irq work function that can iterate over
- * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
- * tassk must be checked if there's one or many CPUs that are lowering
- * their priority, there's a single irq work iterator that will try to
- * push off RT tasks that are waiting to run.
- *
- * When a CPU schedules a lower priority task, it will kick off the
- * irq work iterator that will jump to each CPU with overloaded RT tasks.
- * As it only takes the first CPU that schedules a lower priority task
- * to start the process, the rto_start variable is incremented and if
- * the atomic result is one, then that CPU will try to take the rto_lock.
- * This prevents high contention on the lock as the process handles all
- * CPUs scheduling lower priority tasks.
- *
- * All CPUs that are scheduling a lower priority task will increment the
- * rt_loop_next variable. This will make sure that the irq work iterator
- * checks all RT overloaded CPUs whenever a CPU schedules a new lower
- * priority task, even if the iterator is in the middle of a scan. Incrementing
- * the rt_loop_next will cause the iterator to perform another scan.
- *
+ * rq->rt.push_cpu holds the last cpu returned by this function,
+ * or if this is the first instance, it must hold rq->cpu.
*/
-static int rto_next_cpu(struct root_domain *rd)
+static int rto_next_cpu(struct rq *rq)
{
- int next;
+ int prev_cpu = rq->rt.push_cpu;
int cpu;
+ cpu = cpumask_next(prev_cpu, rq->rd->rto_mask);
+
/*
- * When starting the IPI RT pushing, the rto_cpu is set to -1,
- * rt_next_cpu() will simply return the first CPU found in
- * the rto_mask.
- *
- * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
- * will return the next CPU found in the rto_mask.
- *
- * If there are no more CPUs left in the rto_mask, then a check is made
- * against rto_loop and rto_loop_next. rto_loop is only updated with
- * the rto_lock held, but any CPU may increment the rto_loop_next
- * without any locking.
+ * If the previous cpu is less than the rq's CPU, then it already
+ * passed the end of the mask, and has started from the beginning.
+ * We end if the next CPU is greater or equal to rq's CPU.
*/
- for (;;) {
+ if (prev_cpu < rq->cpu) {
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
- /* When rto_cpu is -1 this acts like cpumask_first() */
- cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
-
- rd->rto_cpu = cpu;
-
- if (cpu < nr_cpu_ids)
- return cpu;
-
- rd->rto_cpu = -1;
-
+ } else if (cpu >= nr_cpu_ids) {
/*
- * ACQUIRE ensures we see the @rto_mask changes
- * made prior to the @next value observed.
- *
- * Matches WMB in rt_set_overload().
+ * We passed the end of the mask, start at the beginning.
+ * If the result is greater or equal to the rq's CPU, then
+ * the loop is finished.
*/
- next = atomic_read_acquire(&rd->rto_loop_next);
-
- if (rd->rto_loop == next)
- break;
-
- rd->rto_loop = next;
+ cpu = cpumask_first(rq->rd->rto_mask);
+ if (cpu >= rq->cpu)
+ return nr_cpu_ids;
}
+ rq->rt.push_cpu = cpu;
- return -1;
+ /* Return cpu to let the caller know if the loop is finished or not */
+ return cpu;
}
-static inline bool rto_start_trylock(atomic_t *v)
+static int find_next_push_cpu(struct rq *rq)
{
- return !atomic_cmpxchg_acquire(v, 0, 1);
+ struct rq *next_rq;
+ int cpu;
+
+ while (1) {
+ cpu = rto_next_cpu(rq);
+ if (cpu >= nr_cpu_ids)
+ break;
+ next_rq = cpu_rq(cpu);
+
+ /* Make sure the next rq can push to this rq */
+ if (next_rq->rt.highest_prio.next < rq->rt.highest_prio.curr)
+ break;
}
-static inline void rto_start_unlock(atomic_t *v)
-{
- atomic_set_release(v, 0);
+ return cpu;
}
+#define RT_PUSH_IPI_EXECUTING 1
+#define RT_PUSH_IPI_RESTART 2
+
static void tell_cpu_to_push(struct rq *rq)
{
- int cpu = -1;
+ int cpu;
- /* Keep the loop going if the IPI is currently active */
- atomic_inc(&rq->rd->rto_loop_next);
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ raw_spin_lock(&rq->rt.push_lock);
+ /* Make sure it's still executing */
+ if (rq->rt.push_flags & RT_PUSH_IPI_EXECUTING) {
+ /*
+ * Tell the IPI to restart the loop as things have
+ * changed since it started.
+ */
+ rq->rt.push_flags |= RT_PUSH_IPI_RESTART;
+ raw_spin_unlock(&rq->rt.push_lock);
+ return;
+ }
+ raw_spin_unlock(&rq->rt.push_lock);
+ }
- /* Only one CPU can initiate a loop at a time */
- if (!rto_start_trylock(&rq->rd->rto_loop_start))
+ /* When here, there's no IPI going around */
+
+ rq->rt.push_cpu = rq->cpu;
+ cpu = find_next_push_cpu(rq);
+ if (cpu >= nr_cpu_ids)
return;
- raw_spin_lock(&rq->rd->rto_lock);
+ rq->rt.push_flags = RT_PUSH_IPI_EXECUTING;
- /*
- * The rto_cpu is updated under the lock, if it has a valid cpu
- * then the IPI is still running and will continue due to the
- * update to loop_next, and nothing needs to be done here.
- * Otherwise it is finishing up and an ipi needs to be sent.
- */
- if (rq->rd->rto_cpu < 0)
- cpu = rto_next_cpu(rq->rd);
-
- raw_spin_unlock(&rq->rd->rto_lock);
-
- rto_start_unlock(&rq->rd->rto_loop_start);
-
- if (cpu >= 0) {
- /* Make sure the rd does not get freed while pushing */
- sched_get_rd(rq->rd);
- irq_work_queue_on(&rq->rd->rto_push_work, cpu);
- }
+ irq_work_queue_on(&rq->rt.push_work, cpu);
}
/* Called from hardirq context */
-void rto_push_irq_work_func(struct irq_work *work)
+static void try_to_push_tasks(void *arg)
{
- struct root_domain *rd =
- container_of(work, struct root_domain, rto_push_work);
- struct rq *rq;
+ struct rt_rq *rt_rq = arg;
+ struct rq *rq, *src_rq;
+ int this_cpu;
int cpu;
- rq = this_rq();
+ this_cpu = rt_rq->push_cpu;
- /*
- * We do not need to grab the lock to check for has_pushable_tasks.
- * When it gets updated, a check is made if a push is possible.
- */
+ /* Paranoid check */
+ BUG_ON(this_cpu != smp_processor_id());
+
+ rq = cpu_rq(this_cpu);
+ src_rq = rq_of_rt_rq(rt_rq);
+
+again:
if (has_pushable_tasks(rq)) {
raw_spin_lock(&rq->lock);
- push_rt_tasks(rq);
+ push_rt_task(rq);
raw_spin_unlock(&rq->lock);
}
- raw_spin_lock(&rd->rto_lock);
-
/* Pass the IPI to the next rt overloaded queue */
- cpu = rto_next_cpu(rd);
-
- raw_spin_unlock(&rd->rto_lock);
-
- if (cpu < 0) {
- sched_put_rd(rd);
- return;
+ raw_spin_lock(&rt_rq->push_lock);
+ /*
+ * If the source queue changed since the IPI went out,
+ * we need to restart the search from that CPU again.
+ */
+ if (rt_rq->push_flags & RT_PUSH_IPI_RESTART) {
+ rt_rq->push_flags &= ~RT_PUSH_IPI_RESTART;
+ rt_rq->push_cpu = src_rq->cpu;
}
+ cpu = find_next_push_cpu(src_rq);
+
+ if (cpu >= nr_cpu_ids)
+ rt_rq->push_flags &= ~RT_PUSH_IPI_EXECUTING;
+ raw_spin_unlock(&rt_rq->push_lock);
+
+ if (cpu >= nr_cpu_ids)
+ return;
+
+ /*
+ * It is possible that a restart caused this CPU to be
+ * chosen again. Don't bother with an IPI, just see if we
+ * have more to push.
+ */
+ if (unlikely(cpu == rq->cpu))
+ goto again;
+
/* Try the next RT overloaded CPU */
- irq_work_queue_on(&rd->rto_push_work, cpu);
+ irq_work_queue_on(&rt_rq->push_work, cpu);
+}
+
+static void push_irq_work_func(struct irq_work *work)
+{
+ struct rt_rq *rt_rq = container_of(work, struct rt_rq, push_work);
+
+ try_to_push_tasks(rt_rq);
}
#endif /* HAVE_RT_PUSH_IPI */
@@ -1968,9 +2891,8 @@
bool resched = false;
struct task_struct *p;
struct rq *src_rq;
- int rt_overload_count = rt_overloaded(this_rq);
- if (likely(!rt_overload_count))
+ if (likely(!rt_overloaded(this_rq)))
return;
/*
@@ -1979,11 +2901,6 @@
*/
smp_rmb();
- /* If we are the only overloaded CPU do nothing */
- if (rt_overload_count == 1 &&
- cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
- return;
-
#ifdef HAVE_RT_PUSH_IPI
if (sched_feat(RT_PUSH_IPI)) {
tell_cpu_to_push(this_rq);
@@ -2103,6 +3020,7 @@
*/
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
+ detach_task_rt_rq(p);
/*
* If there are other RT tasks then we will reschedule
* and the scheduling of the other RT tasks will handle
@@ -2124,9 +3042,17 @@
zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
GFP_KERNEL, cpu_to_node(i));
}
+
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ frt_boost_threshold = 0;
+#endif
+ sched_switch_to_rt_load_ratio = 0;
+ sched_switch_to_fair_load_ratio = 0;
}
#endif /* CONFIG_SMP */
+extern
+void copy_sched_avg(struct sched_avg *from, struct sched_avg *to, unsigned int ratio);
/*
* When switching a task to RT, we may overload the runqueue
* with RT tasks. In this case we try to push them off to
@@ -2134,6 +3060,7 @@
*/
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
+ copy_sched_avg(&p->se.avg, &p->rt.avg, sched_switch_to_rt_load_ratio);
/*
* If we are already running, then there's nothing
* that needs to be done. But if we are not running
@@ -2145,9 +3072,10 @@
#ifdef CONFIG_SMP
if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
queue_push_tasks(rq);
-#endif /* CONFIG_SMP */
+#else
if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
resched_curr(rq);
+#endif /* CONFIG_SMP */
}
}
@@ -2217,9 +3145,16 @@
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
struct sched_rt_entity *rt_se = &p->rt;
+ u64 now = rq_clock_task(rq);
update_curr_rt(rq);
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+ update_rt_load_avg(now, rt_se, rt_rq, cpu_of(rq_of_rt_rq(rt_rq)));
+ }
+
watchdog(rq, p);
/*
@@ -2250,9 +3185,16 @@
static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
+ struct sched_rt_entity *rt_se = &p->rt;
p->se.exec_start = rq_clock_task(rq);
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+ rt_rq->curr = rt_se;
+ }
+
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
}
@@ -2280,13 +3222,19 @@
.put_prev_task = put_prev_task_rt,
#ifdef CONFIG_SMP
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ .select_task_rq = select_task_rq_rt_fluid,
+#else
.select_task_rq = select_task_rq_rt,
+#endif
+ .migrate_task_rq = migrate_task_rq_rt,
.set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.task_woken = task_woken_rt,
.switched_from = switched_from_rt,
+ .task_dead = task_dead_rt,
#endif
.set_curr_task = set_curr_task_rt,
@@ -2298,6 +3246,9 @@
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
+#ifdef CONFIG_RT_GROUP_SCHED
+ .task_move_group = task_move_group_rt,
+#endif
};
#ifdef CONFIG_SCHED_DEBUG
@@ -2314,3 +3265,32 @@
rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */
+
+/**********************************************************************
+ * Sysfs *
+ **********************************************************************/
+static struct attribute *ert_attrs[] = {
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ &frt_boost_threshold_attr.attr,
+#endif
+ &switch_fair_load_ratio_attr.attr,
+ &switch_rt_load_ratio_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group ert_group = {
+ .attrs = ert_attrs,
+};
+
+static struct kobject *ert_kobj;
+
+static int init_sysfs(void)
+{
+ int ret;
+
+ ert_kobj = kobject_create_and_add("ert", kernel_kobj);
+ if (!ert_kobj)
+ return -ENOMEM;
+
+ ret = sysfs_create_group(ert_kobj, &ert_group);
+
+ return ret;
+}
+late_initcall(init_sysfs);
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
old mode 100644
new mode 100755
index 6893ee3..1f63cb0
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -14,6 +14,8 @@
#include "cpudeadline.h"
#include "cpuacct.h"
+#define MOVETASK_ONEPATH
+
struct rq;
struct cpuidle_state;
@@ -337,7 +339,25 @@
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
-#endif
+
+#ifdef CONFIG_RT_GROUP_SCHED
+#ifdef CONFIG_SMP
+extern void set_task_rq_rt(struct sched_rt_entity *rt_se,
+ struct rt_rq *prev, struct rt_rq *next);
+#else /* !CONFIG_SMP */
+static inline void set_task_rq_rt(struct sched_rt_entity *rt_se,
+ struct rt_rq *prev, struct rt_rq *next) { }
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifdef CONFIG_SMP
+extern void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next);
+#else /* !CONFIG_SMP */
+static inline void set_task_rq_fair(struct sched_entity *se,
+ struct cfs_rq *prev, struct cfs_rq *next) { }
+#endif /* CONFIG_SMP */
+#endif /* CONFIG_FAIR_GROUP_SCHED */
#else /* CONFIG_CGROUP_SCHED */
@@ -380,6 +400,11 @@
unsigned long tg_load_avg_contrib;
#endif
atomic_long_t removed_load_avg, removed_util_avg;
+
+#ifdef CONFIG_SCHED_HMP
+ unsigned long sysload_avg_ratio;
+#endif
+
#ifndef CONFIG_64BIT
u64 load_last_update_time_copy;
#endif
@@ -419,7 +444,7 @@
u64 throttled_clock, throttled_clock_task;
u64 throttled_clock_task_time;
- int throttled, throttle_count, throttle_uptodate;
+ int throttled, throttle_count;
struct list_head throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
@@ -452,6 +477,17 @@
unsigned long rt_nr_total;
int overloaded;
struct plist_head pushable_tasks;
+ struct sched_avg avg;
+ struct sched_rt_entity *curr;
+ atomic_long_t removed_util_avg;
+ atomic_long_t removed_load_avg;
+
+#ifdef HAVE_RT_PUSH_IPI
+ int push_flags;
+ int push_cpu;
+ struct irq_work push_work;
+ raw_spinlock_t push_lock;
+#endif
#endif /* CONFIG_SMP */
int rt_queued;
@@ -466,6 +502,10 @@
struct rq *rq;
struct task_group *tg;
+ unsigned long propagate_avg;
+#ifndef CONFIG_64BIT
+ u64 load_last_update_time_copy;
+#endif
#endif
};
@@ -502,6 +542,8 @@
#else
struct dl_bw dl_bw;
#endif
+ /* This is the "average utilization" for this runqueue */
+ s64 avg_bw;
};
#ifdef CONFIG_SMP
@@ -533,19 +575,6 @@
struct dl_bw dl_bw;
struct cpudl cpudl;
-#ifdef HAVE_RT_PUSH_IPI
- /*
- * For IPI pull requests, loop across the rto_mask.
- */
- struct irq_work rto_push_work;
- raw_spinlock_t rto_lock;
- /* These are only updated and read within rto_lock */
- int rto_loop;
- int rto_cpu;
- /* These atomics are updated outside of a lock */
- atomic_t rto_loop_next;
- atomic_t rto_loop_start;
-#endif
/*
* The "RT overload" flag: it gets set if a CPU has more than
* one runnable RT task.
@@ -558,9 +587,6 @@
extern void sched_get_rd(struct root_domain *rd);
extern void sched_put_rd(struct root_domain *rd);
-#ifdef HAVE_RT_PUSH_IPI
-extern void rto_push_irq_work_func(struct irq_work *work);
-#endif
#endif /* CONFIG_SMP */
/*
@@ -639,6 +665,11 @@
int active_balance;
int push_cpu;
struct cpu_stop_work active_balance_work;
+#ifdef CONFIG_SCHED_HMP
+ struct task_struct *migrate_task;
+ u64 hmp_last_up_migration;
+ u64 hmp_last_down_migration;
+#endif
/* cpu of this runqueue: */
int cpu;
int online;
@@ -910,6 +941,11 @@
extern int group_balance_cpu(struct sched_group *sg);
+#ifdef CONFIG_SCHED_HMP
+extern struct list_head hmp_domains;
+DECLARE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
+#define hmp_cpu_domain(cpu) (per_cpu(hmp_cpu_domain, (cpu)))
+#endif /* CONFIG_SCHED_HMP */
#else
static inline void sched_ttwu_pending(void) { }
@@ -947,11 +983,13 @@
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
+ set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]);
p->se.cfs_rq = tg->cfs_rq[cpu];
p->se.parent = tg->se[cpu];
#endif
#ifdef CONFIG_RT_GROUP_SCHED
+ set_task_rq_rt(&p->rt, p->rt.rt_rq, tg->rt_rq[cpu]);
p->rt.rt_rq = tg->rt_rq[cpu];
p->rt.parent = tg->rt_se[cpu];
#endif
@@ -977,7 +1015,11 @@
* per-task data have been completed by this moment.
*/
smp_wmb();
+#ifdef CONFIG_THREAD_INFO_IN_TASK
+ p->cpu = cpu;
+#else
task_thread_info(p)->cpu = cpu;
+#endif
p->wake_cpu = cpu;
#endif
}
@@ -1036,6 +1078,16 @@
return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
}
+#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
+
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+#ifdef CONFIG_SCHED_DEBUG
+ WARN_ON_ONCE(!rt_entity_is_task(rt_se));
+#endif
+ return container_of(rt_se, struct task_struct, rt);
+}
+
static inline int task_current(struct rq *rq, struct task_struct *p)
{
return rq->curr == p;
@@ -1150,6 +1202,24 @@
/* 15 */ 36, 29, 23, 18, 15,
};
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+/*
+ * RT Extension for 'prio_to_weight'
+ */
+static const int rtprio_to_weight[51] = {
+ /* 0 */ 17222521, 15500269, 13950242, 12555218, 11299696,
+ /* 10 */ 10169726, 9152754, 8237478, 7413730, 6672357,
+ /* 20 */ 6005122, 5404609, 4864149, 4377734, 3939960,
+ /* 30 */ 3545964, 3191368, 2872231, 2585008, 2326507,
+ /* 40 */ 2093856, 1884471, 1696024, 1526421, 1373779,
+ /* 50 */ 1236401, 1112761, 1001485, 901337, 811203,
+ /* 60 */ 730083, 657074, 591367, 532230, 479007,
+ /* 70 */ 431106, 387996, 349196, 314277, 282849,
+ /* 80 */ 254564, 229108, 206197, 185577, 167019,
+ /* 90 */ 150318, 135286, 121757, 109581, 98623,
+ /* 100 for Fair class */ 88761,
+};
+#endif
/*
* Inverse (2^32/x) values of the prio_to_weight[] array, precalculated.
*
@@ -1207,7 +1277,7 @@
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
- void (*migrate_task_rq)(struct task_struct *p);
+ void (*migrate_task_rq)(struct task_struct *p, int next_cpu);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1264,7 +1334,7 @@
extern void update_group_capacity(struct sched_domain *sd, int cpu);
-extern void trigger_load_balance(struct rq *rq);
+extern void trigger_load_balance(struct rq *rq, int cpu);
extern void idle_enter_fair(struct rq *this_rq);
extern void idle_exit_fair(struct rq *this_rq);
@@ -1315,6 +1385,7 @@
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern void init_rt_schedtune_timer(struct sched_rt_entity *rt_se);
extern struct dl_bandwidth def_dl_bandwidth;
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
@@ -1323,6 +1394,7 @@
unsigned long to_ratio(u64 period, u64 runtime);
extern void init_entity_runnable_average(struct sched_entity *se);
+extern void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se);
static inline void add_nr_running(struct rq *rq, unsigned count)
{
@@ -1411,11 +1483,20 @@
extern void sched_avg_update(struct rq *rq);
#ifndef arch_scale_freq_capacity
+
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+unsigned long exynos_scale_freq_capacity(struct sched_domain *sd, int cpu);
+#endif
static __always_inline
unsigned long arch_scale_freq_capacity(struct sched_domain *sd, int cpu)
{
+#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
+ return exynos_scale_freq_capacity(sd, cpu);
+#else
return SCHED_CAPACITY_SCALE;
+#endif
}
+#define arch_scale_freq_invariant() (true)
#endif
#ifndef arch_scale_cpu_capacity
@@ -1593,7 +1674,7 @@
static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
__releases(busiest->lock)
{
- raw_spin_unlock(&busiest->lock);
+ raw_spin_unlock(&busiest->lock);
lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
}
@@ -1784,3 +1865,117 @@
}
#endif /* CONFIG_64BIT */
#endif /* CONFIG_IRQ_TIME_ACCOUNTING */
+
+#ifdef CONFIG_CPU_FREQ
+
+/**
+ * Default limit transition rate.
+ */
+#define DEFAULT_LATENCY_MULTIPLIER 50
+
+DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data);
+
+/**
+ * cpufreq_update_util - Take a note about CPU utilization changes.
+ * @time: Current time.
+ * @util: Current utilization.
+ * @max: Utilization ceiling.
+ *
+ * This function is called by the scheduler on every invocation of
+ * update_load_avg() on the CPU whose utilization is being updated.
+ *
+ * It can only be called from RCU-sched read-side critical sections.
+ */
+static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max)
+{
+ struct update_util_data *data;
+
+ data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data));
+ if (data)
+ data->func(data, time, util, max);
+}
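A hedged sketch of the consumer side, assuming only what is visible in this hunk (the per-cpu cpufreq_update_util_data pointer and the ->func(data, time, util, max) callback shape); the real registration helper lives in the cpufreq core and is not shown here:

struct my_gov_data {
	struct update_util_data update_util;
	/* governor-private state would live here */
};

static void my_gov_update(struct update_util_data *data, u64 time,
			  unsigned long util, unsigned long max)
{
	struct my_gov_data *gd = container_of(data, struct my_gov_data,
					      update_util);

	/* evaluate a new frequency from util/max, rate-limited as needed */
	(void)gd;
}

static void my_gov_attach(int cpu, struct my_gov_data *gd)
{
	gd->update_util.func = my_gov_update;
	rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu),
			   &gd->update_util);
}

Detaching would clear the per-cpu pointer and wait for an RCU-sched grace period (synchronize_sched()) before freeing the data, matching the rcu_dereference_sched() used by cpufreq_update_util().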
+
+#ifdef CONFIG_CPU_FREQ_SCHEDUTIL_PERFSTAT_TRIGGER
+/**
+ * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed.
+ * @time: Current time.
+ *
+ * The way cpufreq is currently arranged requires it to evaluate the CPU
+ * performance state (frequency/voltage) on a regular basis to prevent it from
+ * being stuck in a completely inadequate performance level for too long.
+ * That is not guaranteed to happen if the updates are only triggered from CFS,
+ * though, because they may not be coming in if RT or deadline tasks are active
+ * all the time (or there are RT and DL tasks only).
+ *
+ * As a workaround for that issue, this function is called by the RT and DL
+ * sched classes to trigger extra cpufreq updates to prevent it from stalling,
+ * but that really is a band-aid. Going forward it should be replaced with
+ * solutions targeted more specifically at RT and DL tasks.
+ */
+static inline void cpufreq_trigger_update(u64 time)
+{
+ cpufreq_update_util(time, ULONG_MAX, 0);
+}
+#else
+static inline void cpufreq_trigger_update(u64 time) {}
+#endif
+#else
+static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {}
+static inline void cpufreq_trigger_update(u64 time) {}
+#endif /* CONFIG_CPU_FREQ */
+
+static inline void account_reset_rq(struct rq *rq)
+{
+#ifdef CONFIG_IRQ_TIME_ACCOUNTING
+ rq->prev_irq_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT
+ rq->prev_steal_time = 0;
+#endif
+#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
+ rq->prev_steal_time_rq = 0;
+#endif
+}
+
+#ifdef CONFIG_SMP
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+static unsigned long capacity_orig_of(int cpu)
+{
+ return cpu_rq(cpu)->cpu_capacity_orig;
+}
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it is useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+static inline unsigned long cpu_util(int cpu)
+{
+ unsigned long util;
+ unsigned long capacity = capacity_orig_of(cpu);
+
+ util = cpu_rq(cpu)->cfs.avg.util_avg + cpu_rq(cpu)->rt.avg.util_avg;
+ return (util >= capacity) ? capacity : util;
+}
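A quick numeric illustration of the capping, with hypothetical values:

/*
 * capacity_orig_of(cpu) == 400
 * cfs.avg.util_avg == 300, rt.avg.util_avg == 200  ->  util == 500
 *
 * 500 >= 400, so cpu_util() returns 400: the CPU is reported as fully
 * utilized rather than at more than 100% of its original capacity.
 */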
+#endif
+#endif
diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c
old mode 100644
new mode 100755
index 87e2c9f..ad3c985
--- a/kernel/sched/stats.c
+++ b/kernel/sched/stats.c
@@ -12,6 +12,30 @@
*/
#define SCHEDSTAT_VERSION 15
+#ifdef DEFAULT_USE_ENERGY_AWARE
+static inline void show_easstat(struct seq_file *seq, struct eas_stats *stats)
+{
+ /* eas-specific runqueue stats */
+ seq_printf(seq, "eas %llu %llu %llu %llu %llu %llu ",
+ stats->sis_attempts, stats->sis_idle, stats->sis_cache_affine,
+ stats->sis_suff_cap, stats->sis_idle_cpu, stats->sis_count);
+
+ seq_printf(seq, "%llu %llu %llu %llu %llu %llu %llu ",
+ stats->secb_attempts, stats->secb_sync, stats->secb_idle_bt,
+ stats->secb_insuff_cap, stats->secb_no_nrg_sav,
+ stats->secb_nrg_sav, stats->secb_count);
+
+ seq_printf(seq, "%llu %llu %llu %llu %llu ",
+ stats->fbt_attempts, stats->fbt_no_cpu, stats->fbt_no_sd,
+ stats->fbt_pref_idle, stats->fbt_count);
+
+ seq_printf(seq, "%llu %llu\n",
+ stats->cas_attempts, stats->cas_count);
+}
+#else
+static void show_easstat(struct seq_file *seq, struct eas_stats *stats) { }
+#endif
+
static int show_schedstat(struct seq_file *seq, void *v)
{
int cpu;
@@ -40,6 +64,9 @@
seq_printf(seq, "\n");
#ifdef CONFIG_SMP
+#ifdef DEFAULT_USE_ENERGY_AWARE
+ show_easstat(seq, &rq->eas_stats);
+#endif
/* domain-specific stats */
rcu_read_lock();
for_each_domain(cpu, sd) {
@@ -66,6 +93,8 @@
sd->sbf_count, sd->sbf_balanced, sd->sbf_pushed,
sd->ttwu_wake_remote, sd->ttwu_move_affine,
sd->ttwu_move_balance);
+
+ show_easstat(seq, &sd->eas_stats);
}
rcu_read_unlock();
#endif
diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h
old mode 100644
new mode 100755
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
old mode 100644
new mode 100755
index cbc67da..61f852d
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -1,4 +1,5 @@
#include "sched.h"
+#include "walt.h"
/*
* stop-task scheduling class.
@@ -42,12 +43,14 @@
enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
add_nr_running(rq, 1);
+ walt_inc_cumulative_runnable_avg(rq, p);
}
static void
dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
{
sub_nr_running(rq, 1);
+ walt_dec_cumulative_runnable_avg(rq, p);
}
static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100755
index 0000000..5267c5a
--- /dev/null
+++ b/kernel/sched/tune.c
@@ -0,0 +1,687 @@
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+#include "sched.h"
+#include "tune.h"
+
+unsigned int sysctl_sched_cfs_boost __read_mostly;
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+/*
+ * EAS scheduler tunables for task groups.
+ */
+
+/* SchedTune tunables for a group of tasks */
+struct schedtune {
+ /* SchedTune CGroup subsystem */
+ struct cgroup_subsys_state css;
+
+ /* Boost group allocated ID */
+ int idx;
+
+ /* Boost value for tasks on that SchedTune CGroup */
+ int boost;
+
+};
+
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct schedtune, css) : NULL;
+}
+
+static inline struct schedtune *task_schedtune(struct task_struct *tsk)
+{
+ return css_st(task_css(tsk, schedtune_cgrp_id));
+}
+
+static inline struct schedtune *parent_st(struct schedtune *st)
+{
+ return css_st(st->css.parent);
+}
+
+/*
+ * SchedTune root control group
+ * The root control group is used to defined a system-wide boosting tuning,
+ * which is applied to all tasks in the system.
+ * Task specific boost tuning could be specified by creating and
+ * configuring a child control group under the root one.
+ * By default, system-wide boosting is disabled, i.e. no boosting is applied
+ * to tasks which are not into a child control group.
+ */
+static struct schedtune
+root_schedtune = {
+ .boost = 0,
+};
+
+/*
+ * Maximum number of boost groups to support
+ * When per-task boosting is used we still allow only a limited number of
+ * boost groups for two main reasons:
+ * 1. on a real system we usually have only a few classes of workloads which
+ * make sense to boost with different values (e.g. background vs foreground
+ * tasks, interactive vs low-priority tasks)
+ * 2. a limited number allows for a simpler and more memory/time efficient
+ * implementation especially for the computation of the per-CPU boost
+ * value
+ */
+#define BOOSTGROUPS_COUNT 4
+
+/* Array of configured boostgroups */
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+ &root_schedtune,
+ NULL,
+};
+
+/* SchedTune boost groups
+ * Keep track of all the boost groups which impact a CPU, for example when a
+ * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
+ * likely with different boost values.
+ * Since on each system we expect only a limited number of boost groups, here
+ * we use a simple array to keep track of the metrics required to compute the
+ * maximum per-CPU boosting value.
+ */
+struct boost_groups {
+ /* Maximum boost value for all RUNNABLE tasks on a CPU */
+ unsigned boost_max;
+ struct {
+ /* The boost for tasks on that boost group */
+ unsigned boost;
+ /* Count of RUNNABLE tasks on that boost group */
+ unsigned tasks;
+ } group[BOOSTGROUPS_COUNT];
+};
+
+/* Boost groups affecting each CPU in the system */
+DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+
+static void
+schedtune_cpu_update(int cpu)
+{
+ struct boost_groups *bg;
+ unsigned boost_max;
+ int idx;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /* The root boost group is always active */
+ boost_max = bg->group[0].boost;
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+ /*
+ * A boost group affects a CPU only if it has
+ * RUNNABLE tasks on that CPU
+ */
+ if (bg->group[idx].tasks == 0)
+ continue;
+ boost_max = max(boost_max, bg->group[idx].boost);
+ }
+
+ bg->boost_max = boost_max;
+}
+
+static int
+schedtune_boostgroup_update(int idx, int boost)
+{
+ struct boost_groups *bg;
+ int cur_boost_max;
+ int old_boost;
+ int cpu;
+
+ /* Update per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /*
+ * Keep track of current boost values to compute the per CPU
+ * maximum only when it has been affected by the new value of
+ * the updated boost group
+ */
+ cur_boost_max = bg->boost_max;
+ old_boost = bg->group[idx].boost;
+
+ /* Update the boost value of this boost group */
+ bg->group[idx].boost = boost;
+
+ /* Check if this update increases the current max */
+ if (boost > cur_boost_max && bg->group[idx].tasks) {
+ bg->boost_max = boost;
+ continue;
+ }
+
+ /* Check if this update has decreased the current max */
+ if (cur_boost_max == old_boost && old_boost > boost)
+ schedtune_cpu_update(cpu);
+ }
+
+ return 0;
+}
+
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+ struct boost_groups *bg;
+ int tasks;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+
+ /* Update the boosted task count while avoiding making it negative */
+ if (task_count < 0 && bg->group[idx].tasks <= -task_count)
+ bg->group[idx].tasks = 0;
+ else
+ bg->group[idx].tasks += task_count;
+
+ /* Boost group activation or deactivation on that RQ */
+ tasks = bg->group[idx].tasks;
+ if (tasks == 1 || tasks == 0)
+ schedtune_cpu_update(cpu);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+ struct schedtune *st;
+ int idx;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /* Get task boost group */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ idx = st->idx;
+ rcu_read_unlock();
+
+ schedtune_tasks_update(p, cpu, idx, 1);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+ struct schedtune *st;
+ int idx;
+
+ /*
+ * When a task is marked PF_EXITING by do_exit() it's going to be
+ * dequeued and enqueued multiple times in the exit path.
+ * Thus we avoid any further update, since we do not want to change
+ * CPU boosting while the task is exiting.
+ * The last dequeue will be done by cgroup exit() callback.
+ */
+ if (p->flags & PF_EXITING)
+ return;
+
+ /* Get task boost group */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ idx = st->idx;
+ rcu_read_unlock();
+
+ schedtune_tasks_update(p, cpu, idx, -1);
+}
+
+void schedtune_exit_task(struct task_struct *tsk)
+{
+ struct schedtune *st;
+ unsigned long irq_flags;
+ unsigned int cpu;
+ struct rq *rq;
+ int idx;
+
+ if (!unlikely(schedtune_initialized))
+ return;
+
+ rq = lock_rq_of(tsk, &irq_flags);
+ rcu_read_lock();
+
+ cpu = cpu_of(rq);
+ st = task_schedtune(tsk);
+ idx = st->idx;
+ schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+
+ rcu_read_unlock();
+ unlock_rq_of(rq, tsk, &irq_flags);
+}
+
+int schedtune_cpu_boost(int cpu)
+{
+ struct boost_groups *bg;
+
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ return bg->boost_max;
+}
+
+int schedtune_task_boost(struct task_struct *p)
+{
+ struct schedtune *st;
+ int task_boost;
+
+ /* Get task boost value */
+ rcu_read_lock();
+ st = task_schedtune(p);
+ task_boost = st->boost;
+ rcu_read_unlock();
+
+ return task_boost;
+}
+
+static u64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ struct schedtune *st = css_st(css);
+
+ return st->boost;
+}
+
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 boost)
+{
+ struct schedtune *st = css_st(css);
+
+ if (boost < 0 || boost > 100)
+ return -EINVAL;
+
+ st->boost = boost;
+ if (css == &root_schedtune.css)
+ sysctl_sched_cfs_boost = boost;
+
+ /* Update CPU boost */
+ schedtune_boostgroup_update(st->idx, st->boost);
+
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "boost",
+ .read_u64 = boost_read,
+ .write_u64 = boost_write,
+ },
+ { } /* terminate */
+};
+
+static int
+schedtune_boostgroup_init(struct schedtune *st)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Keep track of allocated boost groups */
+ allocated_group[st->idx] = st;
+
+ /* Initialize the per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ bg->group[st->idx].boost = 0;
+ bg->group[st->idx].tasks = 0;
+ }
+
+ return 0;
+}
+
+static int
+schedtune_init(void)
+{
+ struct boost_groups *bg;
+ int cpu;
+
+ /* Initialize the per CPU boost groups */
+ for_each_possible_cpu(cpu) {
+ bg = &per_cpu(cpu_boost_groups, cpu);
+ memset(bg, 0, sizeof(struct boost_groups));
+ }
+
+ pr_info(" schedtune configured to support %d boost groups\n",
+ BOOSTGROUPS_COUNT);
+ return 0;
+}
+
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct schedtune *st;
+ int idx;
+
+ if (!parent_css) {
+ schedtune_init();
+ return &root_schedtune.css;
+ }
+
+ /* Allow only single-level hierarchies */
+ if (parent_css != &root_schedtune.css) {
+ pr_err("Nested SchedTune boosting groups not allowed\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Allow only a limited number of boosting groups */
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+ if (!allocated_group[idx])
+ break;
+ if (idx == BOOSTGROUPS_COUNT) {
+ pr_err("Trying to create more than %d SchedTune boosting groups\n",
+ BOOSTGROUPS_COUNT);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ st = kzalloc(sizeof(*st), GFP_KERNEL);
+ if (!st)
+ goto out;
+
+ /* Initialize per CPUs boost group support */
+ st->idx = idx;
+ if (schedtune_boostgroup_init(st))
+ goto release;
+
+ return &st->css;
+
+release:
+ kfree(st);
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void
+schedtune_boostgroup_release(struct schedtune *st)
+{
+ /* Reset this boost group */
+ schedtune_boostgroup_update(st->idx, 0);
+
+ /* Keep track of allocated boost groups */
+ allocated_group[st->idx] = NULL;
+}
+
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+ struct schedtune *st = css_st(css);
+
+ schedtune_boostgroup_release(st);
+ kfree(st);
+}
+
+struct cgroup_subsys schedtune_cgrp_subsys = {
+ .css_alloc = schedtune_css_alloc,
+ .css_free = schedtune_css_free,
+ .legacy_cftypes = files,
+ .early_init = 1,
+};
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+#ifdef CONFIG_FREQVAR_SCHEDTUNE
+static struct freqvar_boost_state freqvar_boost_state[CONFIG_NR_CPUS];
+
+int schedtune_freqvar_boost(int cpu)
+{
+ if (!freqvar_boost_state[cpu].enabled)
+ return 0;
+
+ return freqvar_boost_state[cpu].ratio;
+}
+
+/* update the freqvar_boost ratio to match the current frequency */
+static void schedtune_freqvar_update_boost_ratio(int cpu, int new_freq)
+{
+ struct freqvar_boost_table *pos = freqvar_boost_state[cpu].table;
+
+ for (; pos->frequency != CPUFREQ_TABLE_END; pos++)
+ if (new_freq == pos->frequency) {
+ freqvar_boost_state[cpu].ratio = pos->boost;
+ break;
+ }
+
+ return;
+}
+
+/* called on each CPU when its frequency is scaled */
+static int schedtune_freqvar_cpufreq_callback(struct notifier_block *nb,
+ unsigned long val, void *data)
+{
+ struct cpufreq_freqs *freq = data;
+
+ if (freq->flags & CPUFREQ_CONST_LOOPS)
+ return NOTIFY_OK;
+
+ if (val != CPUFREQ_POSTCHANGE)
+ return NOTIFY_OK;
+
+ if (freqvar_boost_state[freq->cpu].enabled)
+ schedtune_freqvar_update_boost_ratio(freq->cpu, freq->new);
+
+ return 0;
+}
+
+static int schedtune_freqvar_find_node(struct device_node **dn,
+ struct cpufreq_policy *policy)
+{
+ const char *buf;
+ cpumask_t shared_mask;
+ int ret;
+
+ while ((*dn = of_find_node_by_type(*dn, "schedtune-freqvar"))) {
+ /*
+ * shared-cpus lists the CPUs that scale at the same time.
+ * CPUFreq calls them "sibling CPUs" and exposes them in the
+ * policy's related_cpus mask.
+ */
+ ret = of_property_read_string(*dn, "shared-cpus", &buf);
+ if (ret)
+ return ret;
+ cpumask_clear(&shared_mask);
+ cpulist_parse(buf, &shared_mask);
+ cpumask_and(&shared_mask, &shared_mask, cpu_possible_mask);
+ if (cpumask_weight(&shared_mask) == 0)
+ return -ENODEV;
+ if (cpumask_equal(&shared_mask, policy->related_cpus)) {
+ return 0;
+ }
+ }
+
+ return -EINVAL;
+}
+
+/*
+ * Update the freqvar_boost table from the src table.
+ * The src table interleaves boost ratios and frequencies and must be in
+ * ascending frequency order.
+ * src table example: 12 546000 10 650000 8 858000 4 1274000 0
+ * dst table example:
+ * Freq Ratio
+ * 1274000 0
+ * 1170000 4
+ * 1066000 4
+ * 962000 4
+ * 858000 4
+ * 754000 8
+ * 650000 8
+ * 546000 10
+ * 442000 12
+ * ratio unit is 1%.
+ */
+int schedtune_freqvar_update_table(unsigned int *src, int src_size,
+ struct freqvar_boost_table *dst)
+{
+ struct freqvar_boost_table *pos, *last_pos = dst;
+ unsigned int ratio = 0, freq = 0;
+ int i;
+
+ for (i = src_size - 1; i >= 0; i--) {
+ ratio = src[i] * SCHEDTUNE_LOAD_BOOST_UTIT;
+ freq = (i - 1 < 0) ? 0 : src[i - 1];
+
+ for (pos = last_pos; pos->frequency != CPUFREQ_TABLE_END; pos++)
+ if (pos->frequency >= freq) {
+ pos->boost = ratio;
+ } else {
+ last_pos = pos;
+ break;
+ }
+ }
+
+ return 0;
+}
+
+static int schedtune_freqvar_parse_dt(struct device_node *dn,
+ struct freqvar_boost_data *data)
+{
+ int size;
+ unsigned int *table;
+
+ /* get the boost table from the device tree */
+ size = of_property_count_u32_elems(dn, "table");
+ if (size <= 0)
+ return -ENOENT;
+
+ table = kzalloc(sizeof(unsigned int) * size, GFP_KERNEL);
+ if (!table)
+ return -ENOMEM;
+
+ of_property_read_u32_array(dn, "table", table, size);
+
+ /* update freqvar_boost table from dt */
+ schedtune_freqvar_update_table(table, size, data->table);
+
+ kfree(table);
+
+ return 0;
+}
+
+static int schedtune_freqvar_init_table(struct cpufreq_policy *policy,
+ struct freqvar_boost_data *data)
+{
+ struct cpufreq_frequency_table *cpufreq_table, *pos;
+ struct freqvar_boost_table *boost_table;
+ int size = 0, index;
+
+ cpufreq_table = cpufreq_frequency_get_table(policy->cpu);
+ if (unlikely(!cpufreq_table)) {
+ pr_debug("%s: Unable to find frequency table\n", __func__);
+ return -ENOENT;
+ }
+
+ /*
+ * HACK: count the number of entries in the cpufreq table.
+ * This needs a smarter and simpler way.
+ */
+ cpufreq_for_each_valid_entry(pos, cpufreq_table)
+ size++;
+
+ /*
+ * The freqvar_boost table is allocated at CPUFREQ policy init
+ * and deallocated at policy exit.
+ */
+ boost_table = kzalloc(sizeof(struct freqvar_boost_table)
+ * (size + 1), GFP_KERNEL);
+ if (boost_table == NULL) {
+ pr_err("%s: failed to allocate memory\n", __func__);
+ return -ENOMEM;
+ }
+
+ /* copy cpu frequency table */
+ index = 0;
+ cpufreq_for_each_valid_entry(pos, cpufreq_table) {
+ boost_table[index].frequency = pos->frequency;
+ boost_table[index].boost = 0; /* default is 0, i.e. no boost */
+ index++;
+ }
+ boost_table[index].frequency = CPUFREQ_TABLE_END;
+ boost_table[index].boost = 0; /* default is 0, i.e. no boost */
+
+ /* freqvar_boost data is initialized */
+ data->table = boost_table;
+
+ return 0;
+}
+
+void schedtune_freqvar_boost_enable(int cpu, int index,
+ struct freqvar_boost_data *data, bool enabled)
+{
+ if (enabled) {
+ freqvar_boost_state[cpu].ratio = data->table[index].boost;
+ freqvar_boost_state[cpu].table = data->table;
+ freqvar_boost_state[cpu].enabled = true;
+ } else {
+ freqvar_boost_state[cpu].enabled = false;
+ freqvar_boost_state[cpu].ratio = 0;
+ }
+ return;
+}
+
+int schedtune_freqvar_boost_init(struct cpufreq_policy *policy,
+ struct freqvar_boost_data *data)
+{
+ struct device_node *dn = NULL;
+ int cur_index = cpufreq_frequency_table_get_index(policy, policy->cur);
+ int cpu;
+
+ if (!freqvar_boost_state[policy->cpu].table) {
+ /* find device node */
+ if (schedtune_freqvar_find_node(&dn, policy))
+ return 0;
+ /* copy cpu frequency table */
+ if (schedtune_freqvar_init_table(policy, data))
+ return 0;
+ /* update boost value from dt */
+ if (schedtune_freqvar_parse_dt(dn, data))
+ goto free;
+ } else {
+ data->table = freqvar_boost_state[policy->cpu].table;
+ }
+ /* enable freqvar boost */
+ for_each_cpu(cpu, policy->related_cpus)
+ schedtune_freqvar_boost_enable(cpu, cur_index, data, true);
+
+ return 0;
+
+free:
+ pr_err("SchedTune: faile to initialize\n");
+ kfree(data->table);
+
+ return 0;
+}
+
+int schedtune_freqvar_boost_exit(struct cpufreq_policy *policy,
+ struct freqvar_boost_data *data)
+{
+ int cpu;
+
+ for_each_cpu(cpu, policy->related_cpus)
+ schedtune_freqvar_boost_enable(cpu, 0, data, false);
+
+ return 0;
+}
+
+static struct notifier_block schedtune_freqvar_cpufreq_notifier = {
+ .notifier_call = schedtune_freqvar_cpufreq_callback,
+};
+
+static int __init schedtune_freqvar_register_cpufreq_noti(void)
+{
+ return cpufreq_register_notifier(&schedtune_freqvar_cpufreq_notifier,
+ CPUFREQ_TRANSITION_NOTIFIER);
+}
+core_initcall(schedtune_freqvar_register_cpufreq_noti);
+#endif /* CONFIG_FREQVAR_SCHEDTUNE */
+
+int
+sysctl_sched_cfs_boost_handler(struct ctl_table *table, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *ppos)
+{
+ int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+
+ if (ret || !write)
+ return ret;
+
+ return 0;
+}
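
The "boost" cftype above is surfaced through the cgroup filesystem, so the knob is driven by writing a value into the group's attribute file. Below is a minimal userspace sketch; it assumes the controller is mounted at /dev/stune with a top-app group (the usual Android layout) and that the cgroup core exposes the legacy cftype as "schedtune.boost". Both the mount point and the group name live outside this hunk, so treat the path as hypothetical.

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* assumed path: schedtune controller mounted at /dev/stune (Android convention) */
	const char *path = "/dev/stune/top-app/schedtune.boost";
	const char *val = "10";              /* boost_write() accepts 0..100 */
	int fd = open(path, O_WRONLY);

	if (fd < 0) {
		perror(path);
		return 1;
	}
	/* values outside 0..100 make boost_write() fail with EINVAL */
	if (write(fd, val, strlen(val)) < 0)
		perror("write");
	close(fd);
	return 0;
}

Per boost_write() above, a write to the root group also propagates the value into sysctl_sched_cfs_boost, and every successful write ends with schedtune_boostgroup_update() refreshing the per-CPU boost groups.
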
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
new file mode 100755
index 0000000..9ef9416
--- /dev/null
+++ b/kernel/sched/tune.h
@@ -0,0 +1,90 @@
+
+#ifdef CONFIG_SCHED_TUNE
+
+#ifdef CONFIG_FREQVAR_SCHEDTUNE
+#include <linux/cpufreq.h>
+#include <linux/cpumask.h>
+#include <linux/of.h>
+
+#define SCHEDTUNE_LOAD_BOOST_UTIT 10
+
+struct freqvar_boost_state {
+ struct freqvar_boost_table *table;
+ bool enabled; /* boost enabled */
+ int ratio; /* current boost ratio */
+};
+
+struct freqvar_boost_table {
+ int frequency;
+ int boost;
+};
+
+struct freqvar_boost_data {
+ struct freqvar_boost_table *table;
+};
+
+int schedtune_freqvar_boost(int cpu);
+int schedtune_freqvar_boost_init(struct cpufreq_policy *policy, struct freqvar_boost_data *data);
+int schedtune_freqvar_boost_exit(struct cpufreq_policy *policy, struct freqvar_boost_data *data);
+int schedtune_freqvar_update_table(unsigned int *src, int src_size,
+ struct freqvar_boost_table *dst);
+#else
+static inline int schedtune_freqvar_boost(int cpu) { return 0; }
+static inline int schedtune_freqvar_boost_init(struct cpufreq_policy *policy,
+ struct freqvar_boost_data *data) { return 0; }
+static inline int schedtune_freqvar_boost_exit(struct cpufreq_policy *policy,
+ struct freqvar_boost_data *data) { return 0; }
+#endif
+
+#include <linux/reciprocal_div.h>
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+ unsigned long min_power;
+ unsigned long max_power;
+ struct reciprocal_value rdiv;
+};
+
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+
+int schedtune_cpu_boost(int cpu);
+int schedtune_task_boost(struct task_struct *tsk);
+
+int schedtune_prefer_idle(struct task_struct *tsk);
+
+void schedtune_exit_task(struct task_struct *tsk);
+
+void schedtune_enqueue_task(struct task_struct *p, int cpu);
+void schedtune_dequeue_task(struct task_struct *p, int cpu);
+
+#else /* CONFIG_CGROUP_SCHEDTUNE */
+
+#define schedtune_cpu_boost(cpu) get_sysctl_sched_cfs_boost()
+#define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#endif /* CONFIG_CGROUP_SCHEDTUNE */
+
+int schedtune_normalize_energy(int energy);
+int schedtune_accept_deltas(int nrg_delta, int cap_delta,
+ struct task_struct *task);
+
+#else /* CONFIG_SCHED_TUNE */
+
+#define schedtune_cpu_boost(cpu) 0
+#define schedtune_task_boost(tsk) 0
+
+#define schedtune_exit_task(task) do { } while (0)
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#define schedtune_accept_deltas(nrg_delta, cap_delta, task) nrg_delta
+
+#endif /* CONFIG_SCHED_TUNE */
diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c
old mode 100644
new mode 100755
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100755
index 0000000..8d25ffb
--- /dev/null
+++ b/kernel/sched/walt.c
@@ -0,0 +1,900 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ * and Todd Kjos
+ */
+
+#include <linux/syscore_ops.h>
+#include <trace/events/sched.h>
+#include "sched.h"
+#include "walt.h"
+
+#define WINDOW_STATS_RECENT 0
+#define WINDOW_STATS_MAX 1
+#define WINDOW_STATS_MAX_RECENT_AVG 2
+#define WINDOW_STATS_AVG 3
+#define WINDOW_STATS_INVALID_POLICY 4
+
+#define EXITING_TASK_MARKER 0xdeaddead
+
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+ WINDOW_STATS_MAX_RECENT_AVG;
+static __read_mostly unsigned int walt_account_wait_time = 1;
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* true -> use PELT based load stats, false -> use window-based load stats */
+bool __read_mostly walt_disabled = false;
+
+/*
+ * Window size (in ns). Adjust for the tick size so that the window
+ * rollover occurs just before the tick boundary.
+ */
+__read_mostly unsigned int walt_ravg_window =
+ (20000000 / TICK_NSEC) * TICK_NSEC;
+#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
+#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static bool walt_ktime_suspended;
+
+static unsigned int task_load(struct task_struct *p)
+{
+ return p->ravg.demand;
+}
+
+static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
+{
+ rq->cum_window_demand += delta;
+ if (unlikely((s64)rq->cum_window_demand < 0))
+ rq->cum_window_demand = 0;
+}
+
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg += p->ravg.demand;
+
+ /*
+ * Add a task's contribution to the cumulative window demand when
+ *
+ * (1) the task is enqueued with on_rq = 1, i.e. migration,
+ * prio/cgroup/class change.
+ * (2) task is waking for the first time in this window.
+ */
+ if (p->on_rq || (p->last_sleep_ts < rq->window_start))
+ fixup_cum_window_demand(rq, p->ravg.demand);
+}
+
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p)
+{
+ rq->cumulative_runnable_avg -= p->ravg.demand;
+ BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+
+ /*
+ * on_rq will be 1 for sleeping tasks. So check if the task
+ * is migrating or dequeuing in RUNNING state to change the
+ * prio/cgroup/class.
+ */
+ if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
+ fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
+}
+
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+ struct task_struct *p, u64 new_task_load)
+{
+ s64 task_load_delta = (s64)new_task_load - task_load(p);
+
+ rq->cumulative_runnable_avg += task_load_delta;
+ if ((s64)rq->cumulative_runnable_avg < 0)
+ panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+ task_load_delta, task_load(p));
+
+ fixup_cum_window_demand(rq, task_load_delta);
+}
+
+u64 walt_ktime_clock(void)
+{
+ if (unlikely(walt_ktime_suspended))
+ return ktime_to_ns(ktime_last);
+ return ktime_get_ns();
+}
+
+static void walt_resume(void)
+{
+ walt_ktime_suspended = false;
+}
+
+static int walt_suspend(void)
+{
+ ktime_last = ktime_get();
+ walt_ktime_suspended = true;
+ return 0;
+}
+
+static struct syscore_ops walt_syscore_ops = {
+ .resume = walt_resume,
+ .suspend = walt_suspend
+};
+
+static int __init walt_init_ops(void)
+{
+ register_syscore_ops(&walt_syscore_ops);
+ return 0;
+}
+late_initcall(walt_init_ops);
+
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
+}
+
+static int exiting_task(struct task_struct *p)
+{
+ if (p->flags & PF_EXITING) {
+ if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
+ p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+ }
+ return 1;
+ }
+ return 0;
+}
+
+static int __init set_walt_ravg_window(char *str)
+{
+ unsigned int adj_window;
+ bool no_walt = walt_disabled;
+
+ get_option(&str, &walt_ravg_window);
+
+ /* Adjust for CONFIG_HZ */
+ adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
+
+ /* Warn if we're a bit too far away from the expected window size */
+ WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
+ "tick-adjusted window size %u, original was %u\n", adj_window,
+ walt_ravg_window);
+
+ walt_ravg_window = adj_window;
+
+ walt_disabled = walt_disabled ||
+ (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+ walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+
+ WARN(!no_walt && walt_disabled,
+ "invalid window size, disabling WALT\n");
+
+ return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+ s64 delta;
+ int nr_windows;
+
+ delta = wallclock - rq->window_start;
+ /* If the MPM global timer was cleared, clamp delta to 0 to avoid triggering a kernel BUG */
+ if (delta < 0) {
+ delta = 0;
+ WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
+ }
+
+ if (delta < walt_ravg_window)
+ return;
+
+ nr_windows = div64_u64(delta, walt_ravg_window);
+ rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+
+ rq->cum_window_demand = rq->cumulative_runnable_avg;
+}
+
+/*
+ * Translate absolute delta time accounted on a CPU
+ * to a scale where 1024 is the capacity of the most
+ * capable CPU running at FMAX
+ */
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+ unsigned long capcurr = capacity_curr_of(cpu_of(rq));
+
+ return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
+}
+
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+ if (!walt_io_is_busy)
+ return 0;
+
+ return atomic_read(&rq->nr_iowait);
+}
+
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+ u64 delta, u64 wallclock)
+{
+ struct rq *rq = cpu_rq(cpu);
+ unsigned long flags, nr_windows;
+ u64 cur_jiffies_ts;
+
+ raw_spin_lock_irqsave(&rq->lock, flags);
+
+ /*
+ * cputime (wallclock) uses sched_clock so use the same here for
+ * consistency.
+ */
+ delta += sched_clock() - wallclock;
+ cur_jiffies_ts = get_jiffies_64();
+
+ if (is_idle_task(curr))
+ walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+ delta);
+
+ nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+ if (nr_windows) {
+ if (nr_windows < 10) {
+ /* Decay CPU's irqload by 3/4 for each window. */
+ rq->avg_irqload *= (3 * nr_windows);
+ rq->avg_irqload = div64_u64(rq->avg_irqload,
+ 4 * nr_windows);
+ } else {
+ rq->avg_irqload = 0;
+ }
+ rq->avg_irqload += rq->cur_irqload;
+ rq->cur_irqload = 0;
+ }
+
+ rq->cur_irqload += delta;
+ rq->irqload_ts = cur_jiffies_ts;
+ raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+u64 walt_irqload(int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+ s64 delta;
+
+ delta = get_jiffies_64() - rq->irqload_ts;
+
+ /*
+ * Current context can be preempted by irq and rq->irqload_ts can be
+ * updated by irq context so that delta can be negative.
+ * But this is okay and we can safely return as this means there
+ * was recent irq occurrence.
+ */
+
+ if (delta < WALT_HIGH_IRQ_TIMEOUT)
+ return rq->avg_irqload;
+ else
+ return 0;
+}
+
+int walt_cpu_high_irqload(int cpu)
+{
+ return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+ u64 irqtime, int event)
+{
+ if (is_idle_task(p)) {
+ /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+ if (event == PICK_NEXT_TASK)
+ return 0;
+
+ /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+ return irqtime || cpu_is_waiting_on_io(rq);
+ }
+
+ if (event == TASK_WAKE)
+ return 0;
+
+ if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+ event == TASK_UPDATE)
+ return 1;
+
+ /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+ return walt_freq_account_wait_time;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ int new_window, nr_full_windows = 0;
+ int p_is_curr_task = (p == rq->curr);
+ u64 mark_start = p->ravg.mark_start;
+ u64 window_start = rq->window_start;
+ u32 window_size = walt_ravg_window;
+ u64 delta;
+
+ new_window = mark_start < window_start;
+ if (new_window) {
+ nr_full_windows = div64_u64((window_start - mark_start),
+ window_size);
+ if (p->ravg.active_windows < USHRT_MAX)
+ p->ravg.active_windows++;
+ }
+
+ /* Handle per-task window rollover. We don't care about the idle
+ * task or exiting tasks. */
+ if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+ u32 curr_window = 0;
+
+ if (!nr_full_windows)
+ curr_window = p->ravg.curr_window;
+
+ p->ravg.prev_window = curr_window;
+ p->ravg.curr_window = 0;
+ }
+
+ if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+ /* account_busy_for_cpu_time() = 0, so no update to the
+ * task's current window needs to be made. This could be
+ * for example
+ *
+ * - a wakeup event on a task within the current
+ * window (!new_window below, no action required),
+ * - switching to a new task from idle (PICK_NEXT_TASK)
+ * in a new window where irqtime is 0 and we aren't
+ * waiting on IO */
+
+ if (!new_window)
+ return;
+
+ /* A new window has started. The RQ demand must be rolled
+ * over if p is the current task. */
+ if (p_is_curr_task) {
+ u64 prev_sum = 0;
+
+ /* p is either idle task or an exiting task */
+ if (!nr_full_windows) {
+ prev_sum = rq->curr_runnable_sum;
+ }
+
+ rq->prev_runnable_sum = prev_sum;
+ rq->curr_runnable_sum = 0;
+ }
+
+ return;
+ }
+
+ if (!new_window) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. No rollover
+ * since we didn't start a new window. An example of this is
+ * when a task starts execution and then sleeps within the
+ * same window. */
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+ delta = wallclock - mark_start;
+ else
+ delta = irqtime;
+ delta = scale_exec_time(delta, rq);
+ rq->curr_runnable_sum += delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window += delta;
+
+ return;
+ }
+
+ if (!p_is_curr_task) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has also started, but p is not the current task, so the
+ * window is not rolled over - just split up and account
+ * as necessary into curr and prev. The window is only
+ * rolled over when a new window is processed for the current
+ * task.
+ *
+ * Irqtime can't be accounted by a task that isn't the
+ * currently running task. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window += delta;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!exiting_task(p))
+ p->ravg.prev_window = delta;
+ }
+ rq->prev_runnable_sum += delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum += delta;
+ if (!exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. If any of these three above conditions are true
+ * then this busy time can't be accounted as irqtime.
+ *
+ * Busy time for the idle task or exiting tasks need not
+ * be accounted.
+ *
+ * An example of this would be a task that starts execution
+ * and then sleeps once a new window has begun. */
+
+ if (!nr_full_windows) {
+ /* A full window hasn't elapsed, account partial
+ * contribution to previous completed window. */
+ delta = scale_exec_time(window_start - mark_start, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window += delta;
+
+ delta += rq->curr_runnable_sum;
+ } else {
+ /* Since at least one full window has elapsed,
+ * the contribution to the previous window is the
+ * full window (window_size). */
+ delta = scale_exec_time(window_size, rq);
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.prev_window = delta;
+
+ }
+ /*
+ * Rollover for normal runnable sum is done here by overwriting
+ * the values in prev_runnable_sum and curr_runnable_sum.
+ * Rollover for new task runnable sum has completed by previous
+ * if-else statement.
+ */
+ rq->prev_runnable_sum = delta;
+
+ /* Account piece of busy time in the current window. */
+ delta = scale_exec_time(wallclock - window_start, rq);
+ rq->curr_runnable_sum = delta;
+ if (!is_idle_task(p) && !exiting_task(p))
+ p->ravg.curr_window = delta;
+
+ return;
+ }
+
+ if (irqtime) {
+ /* account_busy_for_cpu_time() = 1 so busy time needs
+ * to be accounted to the current window. A new window
+ * has started and p is the current task so rollover is
+ * needed. The current task must be the idle task because
+ * irqtime is not accounted for any other task.
+ *
+ * Irqtime will be accounted each time we process IRQ activity
+ * after a period of idleness, so we know the IRQ busy time
+ * started at wallclock - irqtime. */
+
+ BUG_ON(!is_idle_task(p));
+ mark_start = wallclock - irqtime;
+
+ /* Roll window over. If IRQ busy time was just in the current
+ * window then that is all that need be accounted. */
+ rq->prev_runnable_sum = rq->curr_runnable_sum;
+ if (mark_start > window_start) {
+ rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+ return;
+ }
+
+ /* The IRQ busy time spanned multiple windows. Process the
+ * busy time preceding the current window start first. */
+ delta = window_start - mark_start;
+ if (delta > window_size)
+ delta = window_size;
+ delta = scale_exec_time(delta, rq);
+ rq->prev_runnable_sum += delta;
+
+ /* Process the remaining IRQ busy time in the current window. */
+ delta = wallclock - window_start;
+ rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+ return;
+ }
+
+ BUG();
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+ /* No need to bother updating task demand for exiting tasks
+ * or the idle task. */
+ if (exiting_task(p) || is_idle_task(p))
+ return 0;
+
+ /* When a task is waking up it is completing a segment of non-busy
+ * time. Likewise, if wait time is not treated as busy time, then
+ * when a task begins to run or is migrated, it is not running and
+ * is completing a segment of non-busy time. */
+ if (event == TASK_WAKE || (!walt_account_wait_time &&
+ (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+ u32 runtime, int samples, int event)
+{
+ u32 *hist = &p->ravg.sum_history[0];
+ int ridx, widx;
+ u32 max = 0, avg, demand;
+ u64 sum = 0;
+
+ /* Ignore windows where task had no activity */
+ if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+ goto done;
+
+ /* Push new 'runtime' value onto stack */
+ widx = walt_ravg_hist_size - 1;
+ ridx = widx - samples;
+ for (; ridx >= 0; --widx, --ridx) {
+ hist[widx] = hist[ridx];
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+ hist[widx] = runtime;
+ sum += hist[widx];
+ if (hist[widx] > max)
+ max = hist[widx];
+ }
+
+ p->ravg.sum = 0;
+
+ if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+ demand = runtime;
+ } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+ demand = max;
+ } else {
+ avg = div64_u64(sum, walt_ravg_hist_size);
+ if (walt_window_stats_policy == WINDOW_STATS_AVG)
+ demand = avg;
+ else
+ demand = max(avg, runtime);
+ }
+
+ /*
+ * A throttled deadline sched class task gets dequeued without
+ * changing p->on_rq. Since the dequeue decrements hmp stats
+ * avoid decrementing it here again.
+ *
+ * When window is rolled over, the cumulative window demand
+ * is reset to the cumulative runnable average (contribution from
+ * the tasks on the runqueue). If the current task is dequeued
+ * already, its demand is not included in the cumulative runnable
+ * average. So add the task demand separately to cumulative window
+ * demand.
+ */
+ if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+ if (task_on_rq_queued(p))
+ fixup_cumulative_runnable_avg(rq, p, demand);
+ else if (rq->curr == p)
+ fixup_cum_window_demand(rq, demand);
+ }
+
+ p->ravg.demand = demand;
+
+done:
+ trace_walt_update_history(rq, p, runtime, samples, event);
+ return;
+}
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+ u64 delta)
+{
+ delta = scale_exec_time(delta, rq);
+ p->ravg.sum += delta;
+ if (unlikely(p->ravg.sum > walt_ravg_window))
+ p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ * a) Task event is contained within one window.
+ * window_start < mark_start < wallclock
+ *
+ * ws ms wc
+ * | | |
+ * V V V
+ * |---------------|
+ *
+ * In this case, p->ravg.sum is updated *iff* event is appropriate
+ * (ex: event == PUT_PREV_TASK)
+ *
+ * b) Task event spans two windows.
+ * mark_start < window_start < wallclock
+ *
+ * ms ws wc
+ * | | |
+ * V V V
+ * -----|-------------------
+ *
+ * In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ * is appropriate, then a new window sample is recorded followed
+ * by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ * c) Task event spans more than two windows.
+ *
+ * ms ws_tmp ws wc
+ * | | | |
+ * V V V V
+ * ---|-------|-------|-------|-------|------
+ * | |
+ * |<------ nr_full_windows ------>|
+ *
+ * In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ * event is appropriate, window sample of p->ravg.sum is recorded,
+ * 'nr_full_windows' samples of window_size are also recorded *iff*
+ * event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ * *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock)
+{
+ u64 mark_start = p->ravg.mark_start;
+ u64 delta, window_start = rq->window_start;
+ int new_window, nr_full_windows;
+ u32 window_size = walt_ravg_window;
+
+ new_window = mark_start < window_start;
+ if (!account_busy_for_task_demand(p, event)) {
+ if (new_window)
+ /* If the time accounted isn't being accounted as
+ * busy time, and a new window started, only the
+ * previous window need be closed out with the
+ * pre-existing demand. Multiple windows may have
+ * elapsed, but since empty windows are dropped,
+ * it is not necessary to account those. */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ return;
+ }
+
+ if (!new_window) {
+ /* The simple case - busy time contained within the existing
+ * window. */
+ add_to_task_demand(rq, p, wallclock - mark_start);
+ return;
+ }
+
+ /* Busy time spans at least two windows. Temporarily rewind
+ * window_start to first window boundary after mark_start. */
+ delta = window_start - mark_start;
+ nr_full_windows = div64_u64(delta, window_size);
+ window_start -= (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (window_start - mark_start) first */
+ add_to_task_demand(rq, p, window_start - mark_start);
+
+ /* Push new sample(s) into task's demand history */
+ update_history(rq, p, p->ravg.sum, 1, event);
+ if (nr_full_windows)
+ update_history(rq, p, scale_exec_time(window_size, rq),
+ nr_full_windows, event);
+
+ /* Roll window_start back to current to process any remainder
+ * in current window. */
+ window_start += (u64)nr_full_windows * (u64)window_size;
+
+ /* Process (wallclock - window_start) next */
+ mark_start = window_start;
+ add_to_task_demand(rq, p, wallclock - mark_start);
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime)
+{
+ if (walt_disabled || !rq->window_start)
+ return;
+
+ lockdep_assert_held(&rq->lock);
+
+ update_window_start(rq, wallclock);
+
+ if (!p->ravg.mark_start)
+ goto done;
+
+ update_task_demand(p, rq, event, wallclock);
+ update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+
+done:
+ trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+ p->ravg.mark_start = wallclock;
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+ u32 sum = 0;
+
+ if (exiting_task(p))
+ sum = EXITING_TASK_MARKER;
+
+ memset(&p->ravg, 0, sizeof(struct ravg));
+ /* Retain EXITING_TASK marker */
+ p->ravg.sum_history[0] = sum;
+}
+
+void walt_mark_task_starting(struct task_struct *p)
+{
+ u64 wallclock;
+ struct rq *rq = task_rq(p);
+
+ if (!rq->window_start) {
+ reset_task_stats(p);
+ return;
+ }
+
+ wallclock = walt_ktime_clock();
+ p->ravg.mark_start = wallclock;
+}
+
+void walt_set_window_start(struct rq *rq)
+{
+ int cpu = cpu_of(rq);
+ struct rq *sync_rq = cpu_rq(sync_cpu);
+
+ if (likely(rq->window_start))
+ return;
+
+ if (cpu == sync_cpu) {
+ rq->window_start = 1;
+ } else {
+ raw_spin_unlock(&rq->lock);
+ double_rq_lock(rq, sync_rq);
+ rq->window_start = cpu_rq(sync_cpu)->window_start;
+ rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+ raw_spin_unlock(&sync_rq->lock);
+ }
+
+ rq->curr->ravg.mark_start = rq->window_start;
+}
+
+void walt_migrate_sync_cpu(int cpu)
+{
+ if (cpu == sync_cpu)
+ sync_cpu = smp_processor_id();
+}
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+ struct rq *src_rq = task_rq(p);
+ struct rq *dest_rq = cpu_rq(new_cpu);
+ u64 wallclock;
+
+ if (!p->on_rq && p->state != TASK_WAKING)
+ return;
+
+ if (exiting_task(p))
+ return;
+
+ if (p->state == TASK_WAKING)
+ double_rq_lock(src_rq, dest_rq);
+
+ wallclock = walt_ktime_clock();
+
+ walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+ TASK_UPDATE, wallclock, 0);
+ walt_update_task_ravg(dest_rq->curr, dest_rq,
+ TASK_UPDATE, wallclock, 0);
+
+ walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+ /*
+ * When a task is migrating during the wakeup, adjust
+ * the task's contribution towards cumulative window
+ * demand.
+ */
+ if (p->state == TASK_WAKING &&
+ p->last_sleep_ts >= src_rq->window_start) {
+ fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
+ fixup_cum_window_demand(dest_rq, p->ravg.demand);
+ }
+
+ if (p->ravg.curr_window) {
+ src_rq->curr_runnable_sum -= p->ravg.curr_window;
+ dest_rq->curr_runnable_sum += p->ravg.curr_window;
+ }
+
+ if (p->ravg.prev_window) {
+ src_rq->prev_runnable_sum -= p->ravg.prev_window;
+ dest_rq->prev_runnable_sum += p->ravg.prev_window;
+ }
+
+ if ((s64)src_rq->prev_runnable_sum < 0) {
+ src_rq->prev_runnable_sum = 0;
+ WARN_ON(1);
+ }
+ if ((s64)src_rq->curr_runnable_sum < 0) {
+ src_rq->curr_runnable_sum = 0;
+ WARN_ON(1);
+ }
+
+ trace_walt_migration_update_sum(src_rq, p);
+ trace_walt_migration_update_sum(dest_rq, p);
+
+ if (p->state == TASK_WAKING)
+ double_rq_unlock(src_rq, dest_rq);
+}
+
+void walt_init_new_task_load(struct task_struct *p)
+{
+ int i;
+ u32 init_load_windows =
+ div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
+ (u64)walt_ravg_window, 100);
+ u32 init_load_pct = current->init_load_pct;
+
+ p->init_load_pct = 0;
+ memset(&p->ravg, 0, sizeof(struct ravg));
+
+ if (init_load_pct) {
+ init_load_windows = div64_u64((u64)init_load_pct *
+ (u64)walt_ravg_window, 100);
+ }
+
+ p->ravg.demand = init_load_windows;
+ for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+ p->ravg.sum_history[i] = init_load_windows;
+}
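
The window accounting above comes down to plain integer arithmetic, which can be sanity-checked outside the kernel. The sketch below is a hypothetical userspace harness, not part of this patch: it mirrors scale_exec_time() (runtime scaled by current capacity over 2^SCHED_CAPACITY_SHIFT) and the WINDOW_STATS_MAX_RECENT_AVG branch of update_history() (demand is the larger of the history average and the most recent window).

#include <stdio.h>
#include <stdint.h>

#define HIST_SIZE		5	/* walt_ravg_hist_size */
#define SCHED_CAPACITY_SHIFT	10

/* mirrors scale_exec_time(): scale raw ns by the CPU's current capacity */
static uint64_t scale_exec_time(uint64_t delta, unsigned long capcurr)
{
	return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
}

/* mirrors the WINDOW_STATS_MAX_RECENT_AVG policy of update_history() */
static uint32_t pick_demand(const uint32_t *hist, uint32_t runtime)
{
	uint64_t sum = 0;
	uint32_t avg;
	int i;

	for (i = 0; i < HIST_SIZE; i++)
		sum += hist[i];
	avg = sum / HIST_SIZE;

	return avg > runtime ? avg : runtime;
}

int main(void)
{
	/* hist[0] is the window that update_history() just pushed */
	uint32_t hist[HIST_SIZE] = { 5000000, 12000000, 3000000, 4000000, 6000000 };

	/* 1 ms of wallclock runtime on a CPU running at half of max capacity */
	printf("scaled = %llu ns\n",
	       (unsigned long long)scale_exec_time(1000000, 512));	/* 500000 */

	printf("demand = %u ns\n", pick_demand(hist, hist[0]));		/* 6000000 */
	return 0;
}

With that history the average is 6,000,000 scaled ns and the most recent window is 5,000,000, so the reported demand is 6,000,000; under WINDOW_STATS_MAX the demand would instead be the 12,000,000 peak.
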
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100755
index 0000000..de7edac
--- /dev/null
+++ b/kernel/sched/walt.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+ u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p);
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq);
+void walt_migrate_sync_cpu(int cpu);
+void walt_init_cpu_efficiency(void);
+u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+ u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
+
+#else /* CONFIG_SCHED_WALT */
+
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+ int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+ struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline void walt_init_cpu_efficiency(void) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+#define walt_cpu_high_irqload(cpu) false
+
+#endif /* CONFIG_SCHED_WALT */
+
+extern bool walt_disabled;
+
+#endif
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
old mode 100644
new mode 100755
index 9a9203b..a39f81c
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -175,7 +175,7 @@
*
* Returns valid seccomp BPF response codes.
*/
-static u32 seccomp_run_filters(struct seccomp_data *sd)
+static u32 seccomp_run_filters(const struct seccomp_data *sd)
{
struct seccomp_data sd_local;
u32 ret = SECCOMP_RET_ALLOW;
@@ -579,20 +579,10 @@
BUG();
}
#else
-int __secure_computing(void)
-{
- u32 phase1_result = seccomp_phase1(NULL);
-
- if (likely(phase1_result == SECCOMP_PHASE1_OK))
- return 0;
- else if (likely(phase1_result == SECCOMP_PHASE1_SKIP))
- return -1;
- else
- return seccomp_phase2(phase1_result);
-}
#ifdef CONFIG_SECCOMP_FILTER
-static u32 __seccomp_phase1_filter(int this_syscall, struct seccomp_data *sd)
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+ const bool recheck_after_trace)
{
u32 filter_ret, action;
int data;
@@ -624,10 +614,50 @@
goto skip;
case SECCOMP_RET_TRACE:
- return filter_ret; /* Save the rest for phase 2. */
+ /* We've been put in this state by the ptracer already. */
+ if (recheck_after_trace)
+ return 0;
+
+ /* ENOSYS these calls if there is no tracer attached. */
+ if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
+ syscall_set_return_value(current,
+ task_pt_regs(current),
+ -ENOSYS, 0);
+ goto skip;
+ }
+
+ /* Allow the BPF to provide the event message */
+ ptrace_event(PTRACE_EVENT_SECCOMP, data);
+ /*
+ * The delivery of a fatal signal during event
+ * notification may silently skip tracer notification,
+ * which could leave us with a potentially unmodified
+ * syscall that the tracer would have liked to have
+ * changed. Since the process is about to die, we just
+ * force the syscall to be skipped and let the signal
+ * kill the process and correctly handle any tracer exit
+ * notifications.
+ */
+ if (fatal_signal_pending(current))
+ goto skip;
+ /* Check if the tracer forced the syscall to be skipped. */
+ this_syscall = syscall_get_nr(current, task_pt_regs(current));
+ if (this_syscall < 0)
+ goto skip;
+
+ /*
+ * Recheck the syscall, since it may have changed. This
+ * intentionally uses a NULL struct seccomp_data to force
+ * a reload of all registers. This does not goto skip since
+ * a skip would have already been reported.
+ */
+ if (__seccomp_filter(this_syscall, NULL, true))
+ return -1;
+
+ return 0;
case SECCOMP_RET_ALLOW:
- return SECCOMP_PHASE1_OK;
+ return 0;
case SECCOMP_RET_KILL:
default:
@@ -639,96 +669,38 @@
skip:
audit_seccomp(this_syscall, 0, action);
- return SECCOMP_PHASE1_SKIP;
+ return -1;
+}
+#else
+static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
+ const bool recheck_after_trace)
+{
+ BUG();
}
#endif
-/**
- * seccomp_phase1() - run fast path seccomp checks on the current syscall
- * @arg sd: The seccomp_data or NULL
- *
- * This only reads pt_regs via the syscall_xyz helpers. The only change
- * it will make to pt_regs is via syscall_set_return_value, and it will
- * only do that if it returns SECCOMP_PHASE1_SKIP.
- *
- * If sd is provided, it will not read pt_regs at all.
- *
- * It may also call do_exit or force a signal; these actions must be
- * safe.
- *
- * If it returns SECCOMP_PHASE1_OK, the syscall passes checks and should
- * be processed normally.
- *
- * If it returns SECCOMP_PHASE1_SKIP, then the syscall should not be
- * invoked. In this case, seccomp_phase1 will have set the return value
- * using syscall_set_return_value.
- *
- * If it returns anything else, then the return value should be passed
- * to seccomp_phase2 from a context in which ptrace hooks are safe.
- */
-u32 seccomp_phase1(struct seccomp_data *sd)
+int __secure_computing(const struct seccomp_data *sd)
{
int mode = current->seccomp.mode;
- int this_syscall = sd ? sd->nr :
- syscall_get_nr(current, task_pt_regs(current));
+ int this_syscall;
if (config_enabled(CONFIG_CHECKPOINT_RESTORE) &&
unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
- return SECCOMP_PHASE1_OK;
+ return 0;
+
+ this_syscall = sd ? sd->nr :
+ syscall_get_nr(current, task_pt_regs(current));
switch (mode) {
case SECCOMP_MODE_STRICT:
__secure_computing_strict(this_syscall); /* may call do_exit */
- return SECCOMP_PHASE1_OK;
-#ifdef CONFIG_SECCOMP_FILTER
+ return 0;
case SECCOMP_MODE_FILTER:
- return __seccomp_phase1_filter(this_syscall, sd);
-#endif
+ return __seccomp_filter(this_syscall, sd, false);
default:
BUG();
}
}
-
-/**
- * seccomp_phase2() - finish slow path seccomp work for the current syscall
- * @phase1_result: The return value from seccomp_phase1()
- *
- * This must be called from a context in which ptrace hooks can be used.
- *
- * Returns 0 if the syscall should be processed or -1 to skip the syscall.
- */
-int seccomp_phase2(u32 phase1_result)
-{
- struct pt_regs *regs = task_pt_regs(current);
- u32 action = phase1_result & SECCOMP_RET_ACTION;
- int data = phase1_result & SECCOMP_RET_DATA;
-
- BUG_ON(action != SECCOMP_RET_TRACE);
-
- audit_seccomp(syscall_get_nr(current, regs), 0, action);
-
- /* Skip these calls if there is no tracer. */
- if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
- syscall_set_return_value(current, regs,
- -ENOSYS, 0);
- return -1;
- }
-
- /* Allow the BPF to provide the event message */
- ptrace_event(PTRACE_EVENT_SECCOMP, data);
- /*
- * The delivery of a fatal signal during event
- * notification may silently skip tracer notification.
- * Terminating the task now avoids executing a system
- * call that may not be intended.
- */
- if (fatal_signal_pending(current))
- do_exit(SIGSYS);
- if (syscall_get_nr(current, regs) < 0)
- return -1; /* Explicit request to skip. */
-
- return 0;
-}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */
long prctl_get_seccomp(void)
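
The hunk above folds the old two-phase seccomp_phase1()/seccomp_phase2() flow into a single __secure_computing(sd)/__seccomp_filter() pass, with the whole SECCOMP_RET_TRACE handling (ENOSYS when no tracer is attached, re-checking the possibly rewritten syscall after the tracer runs) living in __seccomp_filter(). A minimal userspace sketch that exercises that path is shown below; it uses only the standard prctl(2) seccomp interface, and a production filter would additionally validate seccomp_data->arch.

#include <stddef.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <linux/filter.h>
#include <linux/seccomp.h>

int main(void)
{
	struct sock_filter filter[] = {
		/* load the syscall number from seccomp_data */
		BPF_STMT(BPF_LD | BPF_W | BPF_ABS, offsetof(struct seccomp_data, nr)),
		/* trace getpid(), allow everything else */
		BPF_JUMP(BPF_JMP | BPF_JEQ | BPF_K, __NR_getpid, 0, 1),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_TRACE),
		BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
	};
	struct sock_fprog prog = {
		.len = sizeof(filter) / sizeof(filter[0]),
		.filter = filter,
	};
	long ret;

	/* required so an unprivileged task may install a filter */
	if (prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0))
		return 1;
	if (prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &prog))
		return 1;

	ret = syscall(SYS_getpid);
	if (ret < 0)
		perror("getpid");	/* expected ENOSYS: RET_TRACE with no tracer */
	else
		printf("getpid() = %ld\n", ret);
	return 0;
}

Run without a ptracer, the getpid() call fails with ENOSYS, which is exactly the "ENOSYS these calls if there is no tracer attached" branch added above; under a ptracer that enables PTRACE_EVENT_SECCOMP, the tracer gets the event and the (possibly modified) syscall is re-evaluated by the recursive __seccomp_filter() call.
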
diff --git a/kernel/signal.c b/kernel/signal.c
old mode 100644
new mode 100755
index 96e8c3c..93b7ad3
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -45,6 +45,10 @@
#include <asm/cacheflush.h>
#include "audit.h" /* audit_signal_info() */
+#ifdef CONFIG_SAMSUNG_FREECESS
+#include <linux/freecess.h>
+#endif
+
/*
* SLAB caches for signal bits.
*/
@@ -1029,6 +1033,13 @@
int override_rlimit;
int ret = 0, result;
+ /* debugging PF_UR case */
+ if (sig == SIGSEGV && (info == SEND_SIG_NOINFO || info == SEND_SIG_PRIV || info == SEND_SIG_FORCED)) {
+ pr_info("Trying to send signal(11) to %s(%d).\n",
+ t->comm, t->pid);
+ dump_stack();
+ }
+
assert_spin_locked(&t->sighand->siglock);
result = TRACE_SIGNAL_IGNORED;
@@ -1188,6 +1199,24 @@
{
unsigned long flags;
int ret = -ESRCH;
+#ifdef CONFIG_OLAF_SUPPORT
+ struct task_struct *t;
+
+ if (sig == SIGKILL || sig == SIGTERM || sig == SIGABRT || sig == SIGQUIT) {
+ for_each_thread(p, t) {
+ if (!strncmp(t->comm, "Jit ", strlen("Jit "))) {
+ if (t->flags & PF_FROZEN) {
+ t->flags |= PF_NOFREEZE;
+ __thaw_task(t);
+ }
+ }
+ }
+ }
+#endif
+
+#ifdef CONFIG_SAMSUNG_FREECESS
+ if (sig == SIGKILL || sig == SIGTERM || sig == SIGABRT || sig == SIGQUIT)
+ sig_report(current, p);
+#endif
if (lock_task_sighand(p, &flags)) {
ret = send_signal(sig, info, p, group);
diff --git a/kernel/smp.c b/kernel/smp.c
old mode 100644
new mode 100755
index d903c02..d131bbb
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -495,10 +495,29 @@
}
EXPORT_SYMBOL(smp_call_function);
+/* control which cores are brought online via an early param */
+unsigned long sec_cpumask;
+static int __init sec_core_masking(char *s)
+{
+ long mask;
+ int ret;
+
+ ret = kstrtol(s, 16, &mask);
+ if (ret)
+ return -1;
+
+ sec_cpumask = (unsigned long)mask;
+
+ return 0;
+}
+early_param("sec_coremask", sec_core_masking);
+
/* Setup configured maximum number of CPUs to activate */
unsigned int setup_max_cpus = NR_CPUS;
EXPORT_SYMBOL(setup_max_cpus);
+struct cpumask early_cpu_mask;
+EXPORT_SYMBOL(early_cpu_mask);
/*
* Setup routine for controlling SMP activation
@@ -570,12 +589,25 @@
idle_threads_init();
+ cpumask_clear(&early_cpu_mask);
+ for_each_cpu(cpu, topology_idle_cpumask(0))
+ clear_bit(cpu, &sec_cpumask);
+
+ cpumask_set_cpu(0, &early_cpu_mask);
/* FIXME: This should be done in userspace --RR */
for_each_present_cpu(cpu) {
if (num_online_cpus() >= setup_max_cpus)
break;
- if (!cpu_online(cpu))
+
+ if (test_bit(cpu, &sec_cpumask)) {
+ pr_err("%s: CPU%d OFF by sec coremask\n", __func__, cpu);
+ continue;
+ }
+
+ if (!cpu_online(cpu)) {
+ cpumask_set_cpu(cpu, &early_cpu_mask);
cpu_up(cpu);
+ }
}
/* Any cleanup work */
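
The sec_coremask early param above is a hex bitmap of CPUs to keep offline during smp_init(): a set bit makes the loop skip that CPU's cpu_up() call, bits for CPUs in topology_idle_cpumask(0) are cleared beforehand so those CPUs always come up, and every CPU that is actually brought online is recorded in early_cpu_mask. A small standalone sketch of the mask semantics (a hypothetical harness, parsing the value with the same base-16 rule kstrtol() uses):

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	/* e.g. the boot command line carries sec_coremask=0x6 */
	const char *arg = "0x6";
	unsigned long mask = strtoul(arg, NULL, 16);
	int cpu;

	for (cpu = 0; cpu < 8; cpu++)
		printf("CPU%d: %s\n", cpu,
		       (mask >> cpu) & 1 ? "kept offline by sec_coremask"
					 : "brought up by smp_init()");
	return 0;
}

So sec_coremask=0x6 keeps CPU1 and CPU2 offline while the remaining CPUs are brought up as usual, provided those bits are not cleared by the topology_idle_cpumask(0) pass.
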
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
old mode 100644
new mode 100755
diff --git a/kernel/smpboot.h b/kernel/smpboot.h
old mode 100644
new mode 100755
diff --git a/kernel/softirq.c b/kernel/softirq.c
old mode 100644
new mode 100755
index 479e443..f28b40a
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -26,10 +26,16 @@
#include <linux/smpboot.h>
#include <linux/tick.h>
#include <linux/irq.h>
+#include <linux/exynos-ss.h>
+#ifdef CONFIG_SEC_DUMP_SUMMARY
+#include <linux/sec_debug.h>
+#endif
#define CREATE_TRACE_POINTS
#include <trace/events/irq.h>
+#include <linux/nmi.h>
+
/*
- No shared variables, all the data are CPU local.
- If a softirq needs serialization, let it serialize itself
@@ -227,7 +233,7 @@
static inline void lockdep_softirq_end(bool in_hardirq) { }
#endif
-asmlinkage __visible void __do_softirq(void)
+asmlinkage __visible void __softirq_entry __do_softirq(void)
{
unsigned long end = jiffies + MAX_SOFTIRQ_TIME;
unsigned long old_flags = current->flags;
@@ -270,7 +276,11 @@
kstat_incr_softirqs_this_cpu(vec_nr);
trace_softirq_entry(vec_nr);
+ exynos_ss_irq(ESS_FLAG_SOFTIRQ, h->action, irqs_disabled(), ESS_FLAG_IN);
+ sl_softirq_entry(softirq_to_name[vec_nr], h->action);
h->action(h);
+ sl_softirq_exit();
+ exynos_ss_irq(ESS_FLAG_SOFTIRQ, h->action, irqs_disabled(), ESS_FLAG_OUT);
trace_softirq_exit(vec_nr);
if (unlikely(prev_count != preempt_count())) {
pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n",
@@ -502,7 +512,19 @@
if (!test_and_clear_bit(TASKLET_STATE_SCHED,
&t->state))
BUG();
+#ifdef CONFIG_SEC_DUMP_SUMMARY
+ sec_debug_irq_sched_log(-1, t->func, 3);
+#endif
+ exynos_ss_irq(ESS_FLAG_SOFTIRQ_TASKLET,
+ t->func, irqs_disabled(), ESS_FLAG_IN);
+ sl_softirq_entry(softirq_to_name[TASKLET_SOFTIRQ], t->func);
t->func(t->data);
+ sl_softirq_exit();
+ exynos_ss_irq(ESS_FLAG_SOFTIRQ_TASKLET,
+ t->func, irqs_disabled(), ESS_FLAG_OUT);
+#ifdef CONFIG_SEC_DUMP_SUMMARY
+ sec_debug_irq_sched_log(-1, t->func, 4);
+#endif
tasklet_unlock(t);
continue;
}
@@ -538,7 +560,13 @@
if (!test_and_clear_bit(TASKLET_STATE_SCHED,
&t->state))
BUG();
+ exynos_ss_irq(ESS_FLAG_SOFTIRQ_HI_TASKLET,
+ t->func, irqs_disabled(), ESS_FLAG_IN);
+ sl_softirq_entry(softirq_to_name[HI_SOFTIRQ], t->func);
t->func(t->data);
+ sl_softirq_exit();
+ exynos_ss_irq(ESS_FLAG_SOFTIRQ_HI_TASKLET,
+ t->func, irqs_disabled(), ESS_FLAG_OUT);
tasklet_unlock(t);
continue;
}
diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
old mode 100644
new mode 100755
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
old mode 100644
new mode 100755
index a3bbaee..ef18196
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -210,6 +210,15 @@
}
ack_state(msdata);
}
+
+#ifdef CONFIG_ARM64
+ if (msdata->state == curstate)
+ wfe();
+ else {
+ dsb(sy);
+ sev();
+ }
+#endif
} while (curstate != MULTI_STOP_EXIT);
local_irq_restore(flags);
@@ -256,10 +265,7 @@
{
struct cpu_stop_done done;
struct cpu_stop_work work1, work2;
- struct multi_stop_data msdata;
-
- preempt_disable();
- msdata = (struct multi_stop_data){
+ struct multi_stop_data msdata = {
.fn = fn,
.data = arg,
.num_threads = 2,
@@ -282,10 +288,7 @@
return -ENOENT;
}
- preempt_enable();
-
wait_for_completion(&done.completion);
-
return done.executed ? done.ret : -ENOENT;
}
diff --git a/kernel/sys.c b/kernel/sys.c
old mode 100644
new mode 100755
index e2446ad..d017476
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -41,6 +41,8 @@
#include <linux/syscore_ops.h>
#include <linux/version.h>
#include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
#include <linux/compat.h>
#include <linux/syscalls.h>
@@ -63,6 +65,10 @@
#include <asm/io.h>
#include <asm/unistd.h>
+#ifdef CONFIG_SECURITY_DEFEX
+#include <linux/defex.h>
+#endif
+
#ifndef SET_UNALIGN_CTL
# define SET_UNALIGN_CTL(a, b) (-EINVAL)
#endif
@@ -128,6 +134,54 @@
EXPORT_SYMBOL(fs_overflowuid);
EXPORT_SYMBOL(fs_overflowgid);
+#if defined CONFIG_SEC_RESTRICT_SETUID
+int sec_check_execpath(struct mm_struct *mm, char *denypath);
+#if defined CONFIG_SEC_RESTRICT_ROOTING_LOG
+#define PRINT_LOG(...) printk(KERN_ERR __VA_ARGS__)
+#else
+#define PRINT_LOG(...)
+#endif // End of CONFIG_SEC_RESTRICT_ROOTING_LOG
+
+static int sec_restrict_uid(void)
+{
+ int ret = 0;
+ struct task_struct *parent_tsk;
+ const struct cred *parent_cred;
+
+ read_lock(&tasklist_lock);
+ parent_tsk = current->parent;
+ if (!parent_tsk) {
+ read_unlock(&tasklist_lock);
+ return 0;
+ }
+
+ get_task_struct(parent_tsk);
+ /* holding on to the task struct is enough so just release
+ * the tasklist lock here */
+ read_unlock(&tasklist_lock);
+
+ parent_cred = get_task_cred(parent_tsk);
+ if (!parent_cred)
+ goto out;
+ if (parent_cred->euid.val == 0 || parent_tsk->pid == 1) {
+ ret = 0;
+ } else if (sec_check_execpath(current->mm, "/system/bin/pppd")) {
+ PRINT_LOG("VPN allowed to use root permission");
+ ret = 0;
+ } else {
+ PRINT_LOG("Restricted changing UID. PID = %d(%s) PPID = %d(%s)\n",
+ current->pid, current->comm,
+ parent_tsk->pid, parent_tsk->comm);
+ ret = 1;
+ }
+ put_cred(parent_cred);
+out:
+ put_task_struct(parent_tsk);
+
+ return ret;
+}
+#endif // End of CONFIG_SEC_RESTRICT_SETUID
+
/*
* Returns true if current's euid is same as p's uid or euid,
* or has CAP_SYS_NICE to p's user_ns.
@@ -335,14 +389,22 @@
struct cred *new;
int retval;
kgid_t krgid, kegid;
-
+
krgid = make_kgid(ns, rgid);
kegid = make_kgid(ns, egid);
-
+
if ((rgid != (gid_t) -1) && !gid_valid(krgid))
return -EINVAL;
if ((egid != (gid_t) -1) && !gid_valid(kegid))
- return -EINVAL;
+ return -EINVAL;
+
+#if defined CONFIG_SEC_RESTRICT_SETUID
+ if (krgid.val == 0 || kegid.val == 0) {
+ if (sec_restrict_uid())
+ return -EACCES;
+ }
+#endif // End of CONFIG_SEC_RESTRICT_SETUID
new = prepare_creds();
if (!new)
@@ -392,11 +454,19 @@
struct cred *new;
int retval;
kgid_t kgid;
-
+
kgid = make_kgid(ns, gid);
if (!gid_valid(kgid))
return -EINVAL;
+#if defined CONFIG_SEC_RESTRICT_SETUID
+ if (kgid.val == 0) {
+ if (sec_restrict_uid())
+ return -EACCES;
+ }
+#endif // End of CONFIG_SEC_RESTRICT_SETUID
+
new = prepare_creds();
if (!new)
return -ENOMEM;
@@ -468,7 +538,7 @@
struct cred *new;
int retval;
kuid_t kruid, keuid;
-
+
kruid = make_kuid(ns, ruid);
keuid = make_kuid(ns, euid);
@@ -477,6 +547,14 @@
if ((euid != (uid_t) -1) && !uid_valid(keuid))
return -EINVAL;
+#if defined CONFIG_SEC_RESTRICT_SETUID
+ if (kruid.val == 0 || keuid.val == 0) {
+ if (sec_restrict_uid())
+ return -EACCES;
+ }
+#endif // End of CONFIG_SEC_RESTRICT_SETUID
+
new = prepare_creds();
if (!new)
return -ENOMEM;
@@ -543,6 +621,14 @@
kuid = make_kuid(ns, uid);
if (!uid_valid(kuid))
return -EINVAL;
+
+#if defined CONFIG_SEC_RESTRICT_SETUID
+ if (kuid.val == 0) {
+ if (sec_restrict_uid())
+ return -EACCES;
+ }
+#endif // End of CONFIG_SEC_RESTRICT_SETUID
new = prepare_creds();
if (!new)
@@ -600,6 +686,14 @@
if ((suid != (uid_t) -1) && !uid_valid(ksuid))
return -EINVAL;
+#if defined CONFIG_SEC_RESTRICT_SETUID
+ if (kruid.val == 0 || keuid.val == 0 || ksuid.val == 0) {
+ if (sec_restrict_uid())
+ return -EACCES;
+ }
+#endif // End of CONFIG_SEC_RESTRICT_SETUID
+
new = prepare_creds();
if (!new)
return -ENOMEM;
@@ -685,6 +779,14 @@
if ((sgid != (gid_t) -1) && !gid_valid(ksgid))
return -EINVAL;
+#if defined CONFIG_SEC_RESTRICT_SETUID
+ if (krgid.val == 0 || kegid.val == 0 || ksgid.val == 0) {
+ if (sec_restrict_uid())
+ return -EACCES;
+ }
+#endif // End of CONFIG_SEC_RESTRICT_SETUID
+
new = prepare_creds();
if (!new)
return -ENOMEM;
@@ -759,6 +861,18 @@
if (!uid_valid(kuid))
return old_fsuid;
+#if defined CONFIG_SEC_RESTRICT_SETUID
+ if (kuid.val == 0) {
+ if (sec_restrict_uid())
+ return -EACCES;
+ }
+#endif // End of CONFIG_SEC_RESTRICT_SETUID
+
+#ifdef CONFIG_SECURITY_DEFEX
+ if (task_defex_enforce(current, NULL, -__NR_setfsuid))
+ return old_fsuid;
+#endif
+
new = prepare_creds();
if (!new)
return old_fsuid;
@@ -797,6 +911,18 @@
kgid = make_kgid(old->user_ns, gid);
if (!gid_valid(kgid))
return old_fsgid;
+
+#if defined CONFIG_SEC_RESTRICT_SETUID
+ if (kgid.val == 0) {
+ if (sec_restrict_uid())
+ return -EACCES;
+ }
+#endif // End of CONFIG_SEC_RESTRICT_SETUID
+
+#ifdef CONFIG_SECURITY_DEFEX
+ if (task_defex_enforce(current, NULL, -__NR_setfsgid))
+ return old_fsgid;
+#endif
new = prepare_creds();
if (!new)
@@ -2070,6 +2196,153 @@
}
#endif
+#ifdef CONFIG_MMU
+static int prctl_update_vma_anon_name(struct vm_area_struct *vma,
+ struct vm_area_struct **prev,
+ unsigned long start, unsigned long end,
+ const char __user *name_addr)
+{
+ struct mm_struct *mm = vma->vm_mm;
+ int error = 0;
+ pgoff_t pgoff;
+
+ if (name_addr == vma_get_anon_name(vma)) {
+ *prev = vma;
+ goto out;
+ }
+
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma,
+ vma->vm_file, pgoff, vma_policy(vma),
+ vma->vm_userfaultfd_ctx, name_addr);
+ if (*prev) {
+ vma = *prev;
+ goto success;
+ }
+
+ *prev = vma;
+
+ if (start != vma->vm_start) {
+ error = split_vma(mm, vma, start, 1);
+ if (error)
+ goto out;
+ }
+
+ if (end != vma->vm_end) {
+ error = split_vma(mm, vma, end, 0);
+ if (error)
+ goto out;
+ }
+
+success:
+ if (!vma->vm_file)
+ vma->anon_name = name_addr;
+
+out:
+ if (error == -ENOMEM)
+ error = -EAGAIN;
+ return error;
+}
+
+static int prctl_set_vma_anon_name(unsigned long start, unsigned long end,
+ unsigned long arg)
+{
+ unsigned long tmp;
+ struct vm_area_struct *vma, *prev;
+ int unmapped_error = 0;
+ int error = -EINVAL;
+
+ /*
+ * If the interval [start,end) covers some unmapped address
+ * ranges, just ignore them, but return -ENOMEM at the end.
+ * - this matches the handling in madvise.
+ */
+ vma = find_vma_prev(current->mm, start, &prev);
+ if (vma && start > vma->vm_start)
+ prev = vma;
+
+ for (;;) {
+ /* Still start < end. */
+ error = -ENOMEM;
+ if (!vma)
+ return error;
+
+ /* Here start < (end|vma->vm_end). */
+ if (start < vma->vm_start) {
+ unmapped_error = -ENOMEM;
+ start = vma->vm_start;
+ if (start >= end)
+ return error;
+ }
+
+ /* Here vma->vm_start <= start < (end|vma->vm_end) */
+ tmp = vma->vm_end;
+ if (end < tmp)
+ tmp = end;
+
+ /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+ error = prctl_update_vma_anon_name(vma, &prev, start, tmp,
+ (const char __user *)arg);
+ if (error)
+ return error;
+ start = tmp;
+ if (prev && start < prev->vm_end)
+ start = prev->vm_end;
+ error = unmapped_error;
+ if (start >= end)
+ return error;
+ if (prev)
+ vma = prev->vm_next;
+ else /* madvise_remove dropped mmap_sem */
+ vma = find_vma(current->mm, start);
+ }
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long len_in, unsigned long arg)
+{
+ struct mm_struct *mm = current->mm;
+ int error;
+ unsigned long len;
+ unsigned long end;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+ /* Check to see whether len was rounded up from small -ve to zero */
+ if (len_in && !len)
+ return -EINVAL;
+
+ end = start + len;
+ if (end < start)
+ return -EINVAL;
+
+ if (end == start)
+ return 0;
+
+ down_write(&mm->mmap_sem);
+
+ switch (opt) {
+ case PR_SET_VMA_ANON_NAME:
+ error = prctl_set_vma_anon_name(start, end, arg);
+ break;
+ default:
+ error = -EINVAL;
+ }
+
+ up_write(&mm->mmap_sem);
+
+ return error;
+}
+#else /* CONFIG_MMU */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+ unsigned long len_in, unsigned long arg)
+{
+ return -EINVAL;
+}
+#endif
+
int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which)
{
return -EINVAL;
@@ -2085,6 +2358,7 @@
unsigned long, arg4, unsigned long, arg5)
{
struct task_struct *me = current;
+ struct task_struct *tsk;
unsigned char comm[sizeof(me->comm)];
long error;
@@ -2178,7 +2452,10 @@
error = perf_event_task_enable();
break;
case PR_GET_TIMERSLACK:
- error = current->timer_slack_ns;
+ if (current->timer_slack_ns > ULONG_MAX)
+ error = ULONG_MAX;
+ else
+ error = current->timer_slack_ns;
break;
case PR_SET_TIMERSLACK:
if (arg2 <= 0)
@@ -2227,6 +2504,26 @@
case PR_GET_TID_ADDRESS:
error = prctl_get_tid_address(me, (int __user **)arg2);
break;
+ case PR_SET_TIMERSLACK_PID:
+ if (task_pid_vnr(current) != (pid_t)arg3 &&
+ !capable(CAP_SYS_NICE))
+ return -EPERM;
+ rcu_read_lock();
+ tsk = find_task_by_vpid((pid_t)arg3);
+ if (tsk == NULL) {
+ rcu_read_unlock();
+ return -EINVAL;
+ }
+ get_task_struct(tsk);
+ rcu_read_unlock();
+ if (arg2 <= 0)
+ tsk->timer_slack_ns =
+ tsk->default_timer_slack_ns;
+ else
+ tsk->timer_slack_ns = arg2;
+ put_task_struct(tsk);
+ error = 0;
+ break;
case PR_SET_CHILD_SUBREAPER:
me->signal->is_child_subreaper = !!arg2;
break;
@@ -2275,6 +2572,9 @@
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+ case PR_SET_VMA:
+ error = prctl_set_vma(arg2, arg3, arg4, arg5);
+ break;
case PR_GET_SPECULATION_CTRL:
if (arg3 || arg4 || arg5)
return -EINVAL;
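
The PR_SET_VMA case added above is the Android-style prctl for naming anonymous mappings; note that the kernel stores the user pointer passed in arg5 rather than copying the string. A minimal userspace sketch of driving the interface follows (not part of this patch; the PR_SET_VMA and PR_SET_VMA_ANON_NAME constants mirror Android bionic's <sys/prctl.h> and are assumed here):

/* Hypothetical userspace sketch: label an anonymous mapping so it can be
 * identified later. Constants are assumed from Android bionic, not defined
 * by this patch. */
#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA            0x53564d41  /* "SVMA" */
#define PR_SET_VMA_ANON_NAME  0
#endif

int main(void)
{
        size_t len = 4096 * 4;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        /* The name pointer must stay valid for the lifetime of the mapping:
         * prctl_update_vma_anon_name() keeps the user pointer, it does not
         * copy the string. */
        static const char name[] = "my-anon-pool";
        if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                  (unsigned long)p, len, (unsigned long)name))
                perror("prctl(PR_SET_VMA)");

        /* Given the matching fs/proc support from the same import, the
         * region then shows up as [anon:my-anon-pool] in /proc/self/maps. */
        return 0;
}
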
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
old mode 100644
new mode 100755
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
old mode 100644
new mode 100755
index beadcf8..6336abd
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -105,6 +105,7 @@
extern unsigned int core_pipe_limit;
#endif
extern int pid_max;
+extern int extra_free_kbytes;
extern int pid_max_min, pid_max_max;
extern int percpu_pagelist_fraction;
extern int compat_log;
@@ -127,6 +128,9 @@
static int __maybe_unused four = 4;
static unsigned long one_ul = 1;
static int one_hundred = 100;
+#ifdef CONFIG_INCREASE_MAXIMUM_SWAPPINESS
+static int max_swappiness = 200;
+#endif
#ifdef CONFIG_PRINTK
static int ten_thousand = 10000;
#endif
@@ -304,6 +308,36 @@
.extra1 = &min_sched_granularity_ns,
.extra2 = &max_sched_granularity_ns,
},
+#ifdef CONFIG_SCHED_WALT
+ {
+ .procname = "sched_use_walt_cpu_util",
+ .data = &sysctl_sched_use_walt_cpu_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_use_walt_task_util",
+ .data = &sysctl_sched_use_walt_task_util,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_init_task_load_pct",
+ .data = &sysctl_sched_walt_init_task_load_pct,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "sched_walt_cpu_high_irqload",
+ .data = &sysctl_sched_walt_cpu_high_irqload,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
{
.procname = "sched_wakeup_granularity_ns",
.data = &sysctl_sched_wakeup_granularity,
@@ -415,6 +449,15 @@
.mode = 0644,
.proc_handler = sched_rr_handler,
},
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ {
+ .procname = "sysctl_sched_restrict_cluster_spill",
+ .data = &sysctl_sched_restrict_cluster_spill,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+#endif
#ifdef CONFIG_SCHED_AUTOGROUP
{
.procname = "sched_autogroup_enabled",
@@ -436,6 +479,21 @@
.extra1 = &one,
},
#endif
+#ifdef CONFIG_SCHED_TUNE
+ {
+ .procname = "sched_cfs_boost",
+ .data = &sysctl_sched_cfs_boost,
+ .maxlen = sizeof(sysctl_sched_cfs_boost),
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+ .mode = 0444,
+#else
+ .mode = 0644,
+#endif
+ .proc_handler = &sysctl_sched_cfs_boost_handler,
+ .extra1 = &zero,
+ .extra2 = &one_hundred,
+ },
+#endif
#ifdef CONFIG_PROVE_LOCKING
{
.procname = "prove_locking",
@@ -1303,7 +1361,19 @@
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
+#ifdef CONFIG_INCREASE_MAXIMUM_SWAPPINESS
+ .extra2 = &max_swappiness,
+#else
.extra2 = &one_hundred,
+#endif
+ },
+ {
+ .procname = "mmap_readaround_limit",
+ .data = &mmap_readaround_limit,
+ .maxlen = sizeof(mmap_readaround_limit),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
},
#ifdef CONFIG_HUGETLB_PAGE
{
@@ -1397,6 +1467,14 @@
.extra1 = &zero,
},
{
+ .procname = "extra_free_kbytes",
+ .data = &extra_free_kbytes,
+ .maxlen = sizeof(extra_free_kbytes),
+ .mode = 0644,
+ .proc_handler = min_free_kbytes_sysctl_handler,
+ .extra1 = &zero,
+ },
+ {
.procname = "percpu_pagelist_fraction",
.data = &percpu_pagelist_fraction,
.maxlen = sizeof(percpu_pagelist_fraction),
@@ -1572,6 +1650,28 @@
.mode = 0644,
.proc_handler = proc_doulongvec_minmax,
},
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS
+ {
+ .procname = "mmap_rnd_bits",
+ .data = &mmap_rnd_bits,
+ .maxlen = sizeof(mmap_rnd_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_bits_min,
+ .extra2 = (void *)&mmap_rnd_bits_max,
+ },
+#endif
+#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS
+ {
+ .procname = "mmap_rnd_compat_bits",
+ .data = &mmap_rnd_compat_bits,
+ .maxlen = sizeof(mmap_rnd_compat_bits),
+ .mode = 0600,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = (void *)&mmap_rnd_compat_bits_min,
+ .extra2 = (void *)&mmap_rnd_compat_bits_max,
+ },
+#endif
{ }
};
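
The vm-side entries added above surface as ordinary /proc/sys files: extra_free_kbytes reuses min_free_kbytes_sysctl_handler, and with CONFIG_INCREASE_MAXIMUM_SWAPPINESS the swappiness ceiling rises from 100 to 200. A minimal sketch of poking them from userspace (paths follow the standard /proc/sys layout; the values are illustrative only):

/* Illustrative only: write the new/extended vm sysctls registered above. */
#include <stdio.h>

static int write_sysctl(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fprintf(f, "%s\n", val);
        fclose(f);
        return 0;
}

int main(void)
{
        /* Keep an extra 16 MiB free on top of min_free_kbytes. */
        write_sysctl("/proc/sys/vm/extra_free_kbytes", "16384");
        /* Values above 100 are only accepted when
         * CONFIG_INCREASE_MAXIMUM_SWAPPINESS=y. */
        write_sysctl("/proc/sys/vm/swappiness", "150");
        return 0;
}
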
diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c
old mode 100644
new mode 100755
diff --git a/kernel/task_work.c b/kernel/task_work.c
old mode 100644
new mode 100755
diff --git a/kernel/taskstats.c b/kernel/taskstats.c
old mode 100644
new mode 100755
diff --git a/kernel/test_kprobes.c b/kernel/test_kprobes.c
old mode 100644
new mode 100755
diff --git a/kernel/time/Kconfig b/kernel/time/Kconfig
old mode 100644
new mode 100755
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
old mode 100644
new mode 100755
diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c
old mode 100644
new mode 100755
index e78480b..6c7a5bc
--- a/kernel/time/alarmtimer.c
+++ b/kernel/time/alarmtimer.c
@@ -633,6 +633,59 @@
return 0;
}
+#if defined(CONFIG_RTC_ALARM_BOOT)
+#define BOOTALM_BIT_EN 0
+#define BOOTALM_BIT_YEAR 1
+#define BOOTALM_BIT_MONTH 5
+#define BOOTALM_BIT_DAY 7
+#define BOOTALM_BIT_HOUR 9
+#define BOOTALM_BIT_MIN 11
+#define BOOTALM_BIT_TOTAL 13
+
+int alarm_set_alarm_boot(char *alarm_data)
+{
+ struct rtc_wkalrm alm;
+ int ret;
+ char buf_ptr[BOOTALM_BIT_TOTAL + 1];
+
+ printk("alarm_set_alarm_boot: AlarmManager\n");
+
+ if (!rtcdev) {
+ printk("alarm_set_alarm_boot: no RTC, time will be lost on reboot\n");
+ return -1;
+ }
+
+ strlcpy(buf_ptr, alarm_data, BOOTALM_BIT_TOTAL + 1);
+
+ alm.time.tm_sec = 0;
+
+ alm.time.tm_min = (buf_ptr[BOOTALM_BIT_MIN] - '0') * 10
+ + (buf_ptr[BOOTALM_BIT_MIN + 1] - '0');
+ alm.time.tm_hour = (buf_ptr[BOOTALM_BIT_HOUR] - '0') * 10
+ + (buf_ptr[BOOTALM_BIT_HOUR + 1] - '0');
+ alm.time.tm_mday = (buf_ptr[BOOTALM_BIT_DAY] - '0') * 10
+ + (buf_ptr[BOOTALM_BIT_DAY + 1] - '0');
+ alm.time.tm_mon = (buf_ptr[BOOTALM_BIT_MONTH] - '0') * 10
+ + (buf_ptr[BOOTALM_BIT_MONTH + 1] - '0');
+ alm.time.tm_year = (buf_ptr[BOOTALM_BIT_YEAR] - '0') * 1000
+ + (buf_ptr[BOOTALM_BIT_YEAR + 1] - '0') * 100
+ + (buf_ptr[BOOTALM_BIT_YEAR + 2] - '0') * 10
+ + (buf_ptr[BOOTALM_BIT_YEAR + 3] - '0');
+ alm.enabled = (*buf_ptr == '1');
+
+ alm.time.tm_mon -= 1;
+ alm.time.tm_year -= 1900;
+
+ printk(KERN_INFO "%s: %d/%d/%d %d:%d:%d(%d)\n", __func__,
+ 1900 + alm.time.tm_year, 1 + alm.time.tm_mon, alm.time.tm_mday,
+ alm.time.tm_hour, alm.time.tm_min, alm.time.tm_sec, alm.time.tm_wday);
+
+ ret = rtc_set_alarm_boot(rtcdev, &alm);
+
+ return ret;
+}
+#endif
+
/**
* alarmtimer_nsleep_wakeup - Wakeup function for alarm_timer_nsleep
* @alarm: ptr to alarm that fired
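
alarm_set_alarm_boot() above parses a fixed-width ASCII string whose layout is given by the BOOTALM_BIT_* offsets: an enable flag, a 4-digit year, then 2 digits each for month, day, hour and minute. A small sketch of how a caller might build such a string (the in-kernel callers live elsewhere in the vendor tree; the formatting below is inferred from the offsets):

/* Sketch: build the 13-character boot-alarm string consumed by
 * alarm_set_alarm_boot(). Layout inferred from BOOTALM_BIT_*:
 *   [0]      enable flag ('1' = enabled)
 *   [1..4]   year  (e.g. "2024")
 *   [5..6]   month ("01".."12")
 *   [7..8]   day
 *   [9..10]  hour
 *   [11..12] minute
 */
#include <stdio.h>

static void build_bootalarm(char buf[14], int en, int year, int mon,
                            int day, int hour, int min)
{
        snprintf(buf, 14, "%d%04d%02d%02d%02d%02d", en ? 1 : 0,
                 year, mon, day, hour, min);
}

int main(void)
{
        char buf[14];

        build_bootalarm(buf, 1, 2024, 1, 31, 6, 30);
        printf("%s\n", buf);   /* prints "1202401310630" */
        return 0;
}
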
diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
old mode 100644
new mode 100755
index a9b76a4..4d8b77d
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -17,6 +17,7 @@
#include <linux/module.h>
#include <linux/smp.h>
#include <linux/device.h>
+#include <linux/exynos-ss.h>
#include "tick-internal.h"
@@ -253,6 +254,7 @@
dev->retries++;
clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ exynos_ss_clockevent(clc, delta, &dev->next_event);
if (dev->set_next_event((unsigned long) clc, dev) == 0)
return 0;
@@ -290,6 +292,7 @@
dev->retries++;
clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ exynos_ss_clockevent(clc, delta, &dev->next_event);
return dev->set_next_event((unsigned long) clc, dev);
}
@@ -336,6 +339,7 @@
delta = max(delta, (int64_t) dev->min_delta_ns);
clc = ((unsigned long long) delta * dev->mult) >> dev->shift;
+ exynos_ss_clockevent(clc, delta, &dev->next_event);
rc = dev->set_next_event((unsigned long) clc, dev);
return (rc && force) ? clockevents_program_min_delta(dev) : rc;
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
old mode 100644
new mode 100755
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
old mode 100644
new mode 100755
index 8c4e27c..34f8de6f
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -49,6 +49,7 @@
#include <linux/sched/deadline.h>
#include <linux/timer.h>
#include <linux/freezer.h>
+#include <linux/exynos-ss.h>
#include <asm/uaccess.h>
@@ -200,6 +201,11 @@
}
#endif
+#ifdef CONFIG_SCHED_HMP
+extern struct cpumask hmp_fast_cpu_mask;
+extern struct cpumask hmp_slow_cpu_mask;
+#endif
+
/*
* We switch the timer base to a power-optimized selected CPU target,
* if:
@@ -220,8 +226,50 @@
struct hrtimer_clock_base *new_base;
int basenum = base->index;
+#ifdef CONFIG_SCHED_HMP
+ int this_cpu = smp_processor_id();
+ int cpu = get_nohz_timer_target();
+#endif
+
this_cpu_base = this_cpu_ptr(&hrtimer_bases);
new_cpu_base = get_target_base(this_cpu_base, pinned);
+
+#ifdef CONFIG_SCHED_HMP
+ /* Switch the timer base to boot cluster on HMP */
+ if (timer->bounded_to_boot_cluster &&
+ cpumask_test_cpu(this_cpu, &hmp_fast_cpu_mask) &&
+ !pinned && this_cpu_base->migration_enabled) {
+ int bound_cpu = 0;
+
+ if (unlikely(hrtimer_callback_running(timer)))
+ return base;
+
+ /* Use the nearest busy cpu to switch timer base
+ * from an idle cpu. */
+ for_each_cpu(cpu, &hmp_slow_cpu_mask) {
+ if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) {
+ bound_cpu = cpu;
+ break;
+ }
+ }
+
+ new_cpu_base = &per_cpu(hrtimer_bases, bound_cpu);
+ new_base = &new_cpu_base->clock_base[basenum];
+
+ /* See the comment in lock_timer_base() */
+ timer->base = NULL;
+ raw_spin_unlock(&base->cpu_base->lock);
+ raw_spin_lock(&new_base->cpu_base->lock);
+
+ base = timer->base = new_base;
+
+ raw_spin_unlock(&new_base->cpu_base->lock);
+ raw_spin_lock(&base->cpu_base->lock);
+
+ return new_base;
+ }
+#endif
+
again:
new_base = &new_cpu_base->clock_base[basenum];
@@ -435,6 +483,7 @@
{
debug_object_free(timer, &hrtimer_debug_descr);
}
+EXPORT_SYMBOL_GPL(destroy_hrtimer_on_stack);
#else
static inline void debug_hrtimer_init(struct hrtimer *timer) { }
@@ -986,7 +1035,7 @@
* relative (HRTIMER_MODE_REL)
*/
void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
- unsigned long delta_ns, const enum hrtimer_mode mode)
+ u64 delta_ns, const enum hrtimer_mode mode)
{
struct hrtimer_clock_base *base, *new_base;
unsigned long flags;
@@ -1257,7 +1306,9 @@
*/
raw_spin_unlock(&cpu_base->lock);
trace_hrtimer_expire_entry(timer, now);
+ exynos_ss_hrtimer(timer, &now->tv64, fn, ESS_FLAG_IN);
restart = fn(timer);
+ exynos_ss_hrtimer(timer, &now->tv64, fn, ESS_FLAG_OUT);
trace_hrtimer_expire_exit(timer);
raw_spin_lock(&cpu_base->lock);
@@ -1560,7 +1611,7 @@
struct restart_block *restart;
struct hrtimer_sleeper t;
int ret = 0;
- unsigned long slack;
+ u64 slack;
slack = current->timer_slack_ns;
if (dl_task(current) || rt_task(current))
@@ -1737,7 +1788,7 @@
* @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME
*/
int __sched
-schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta,
+schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode, int clock)
{
struct hrtimer_sleeper t;
@@ -1805,7 +1856,7 @@
*
* Returns 0 when the timer has expired otherwise -EINTR
*/
-int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta,
+int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta,
const enum hrtimer_mode mode)
{
return schedule_hrtimeout_range_clock(expires, delta, mode,
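
Several of the hrtimer.c hunks above widen timer slack from unsigned long to u64 (the earlier kernel/sys.c hunk clamps PR_GET_TIMERSLACK accordingly on 32-bit). For reference, a minimal userspace sketch of the existing prctl pair that feeds timer_slack_ns (constants come from <sys/prctl.h>; not part of this patch):

/* Sketch: set and read back this thread's timer slack in nanoseconds. */
#include <stdio.h>
#include <sys/prctl.h>

int main(void)
{
        int slack;

        /* Ask for 4 ms of slack on this thread's sleeping timers. */
        if (prctl(PR_SET_TIMERSLACK, 4000000UL, 0, 0, 0))
                perror("PR_SET_TIMERSLACK");

        /* On 32-bit, a u64 slack larger than ULONG_MAX now reads back as
         * ULONG_MAX instead of being truncated. */
        slack = prctl(PR_GET_TIMERSLACK, 0, 0, 0, 0);
        printf("timer slack: %d ns\n", slack);
        return 0;
}
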
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
old mode 100644
new mode 100755
diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c
old mode 100644
new mode 100755
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
old mode 100644
new mode 100755
diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h
old mode 100644
new mode 100755
diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c
old mode 100644
new mode 100755
diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c
old mode 100644
new mode 100755
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
old mode 100644
new mode 100755
diff --git a/kernel/time/sched_clock.c b/kernel/time/sched_clock.c
old mode 100644
new mode 100755
diff --git a/kernel/time/test_udelay.c b/kernel/time/test_udelay.c
old mode 100644
new mode 100755
diff --git a/kernel/time/tick-broadcast-hrtimer.c b/kernel/time/tick-broadcast-hrtimer.c
old mode 100644
new mode 100755
diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c
old mode 100644
new mode 100755
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
old mode 100644
new mode 100755
diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h
old mode 100644
new mode 100755
diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c
old mode 100644
new mode 100755
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
old mode 100644
new mode 100755
index 5ad2e85..4cd1145
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -885,6 +885,18 @@
return ts->sleep_length;
}
+/**
+ * tick_nohz_get_idle_calls - return the current idle calls counter value
+ *
+ * Called from the schedutil frequency scaling governor in scheduler context.
+ */
+unsigned long tick_nohz_get_idle_calls(void)
+{
+ struct tick_sched *ts = this_cpu_ptr(&tick_cpu_sched);
+
+ return ts->idle_calls;
+}
+
static void tick_nohz_account_idle_ticks(struct tick_sched *ts)
{
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h
old mode 100644
new mode 100755
diff --git a/kernel/time/time.c b/kernel/time/time.c
old mode 100644
new mode 100755
diff --git a/kernel/time/timeconst.bc b/kernel/time/timeconst.bc
old mode 100644
new mode 100755
diff --git a/kernel/time/timeconv.c b/kernel/time/timeconv.c
old mode 100644
new mode 100755
diff --git a/kernel/time/timecounter.c b/kernel/time/timecounter.c
old mode 100644
new mode 100755
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
old mode 100644
new mode 100755
index d9837d2..7d9067f
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -72,6 +72,10 @@
tk->tkr_mono.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_mono.shift;
tk->xtime_sec++;
}
+ while (tk->tkr_raw.xtime_nsec >= ((u64)NSEC_PER_SEC << tk->tkr_raw.shift)) {
+ tk->tkr_raw.xtime_nsec -= (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
+ tk->raw_sec++;
+ }
}
static inline struct timespec64 tk_xtime(struct timekeeper *tk)
@@ -284,12 +288,14 @@
/* if changing clocks, convert xtime_nsec shift units */
if (old_clock) {
int shift_change = clock->shift - old_clock->shift;
- if (shift_change < 0)
+ if (shift_change < 0) {
tk->tkr_mono.xtime_nsec >>= -shift_change;
- else
+ tk->tkr_raw.xtime_nsec >>= -shift_change;
+ } else {
tk->tkr_mono.xtime_nsec <<= shift_change;
+ tk->tkr_raw.xtime_nsec <<= shift_change;
+ }
}
- tk->tkr_raw.xtime_nsec = 0;
tk->tkr_mono.shift = clock->shift;
tk->tkr_raw.shift = clock->shift;
@@ -443,6 +449,35 @@
}
EXPORT_SYMBOL_GPL(ktime_get_raw_fast_ns);
+/**
+ * ktime_get_boot_fast_ns - NMI safe and fast access to boot clock.
+ *
+ * To keep it NMI safe since we're accessing from tracing, we're not using a
+ * separate timekeeper with updates to monotonic clock and boot offset
+ * protected with seqlocks. This has the following minor side effects:
+ *
+ * (1) It's possible that a timestamp is taken after the boot offset is updated
+ * but before the timekeeper is updated. If this happens, the new boot offset
+ * is added to the old timekeeping making the clock appear to update slightly
+ * earlier:
+ * CPU 0 CPU 1
+ * timekeeping_inject_sleeptime64()
+ * __timekeeping_inject_sleeptime(tk, delta);
+ * timestamp();
+ * timekeeping_update(tk, TK_CLEAR_NTP...);
+ *
+ * (2) On 32-bit systems, the 64-bit boot offset (tk->offs_boot) may be
+ * partially updated. Since the tk->offs_boot update is a rare event, this
+ * should be a rare occurrence which postprocessing should be able to handle.
+ */
+u64 notrace ktime_get_boot_fast_ns(void)
+{
+ struct timekeeper *tk = &tk_core.timekeeper;
+
+ return (ktime_get_mono_fast_ns() + ktime_to_ns(tk->offs_boot));
+}
+EXPORT_SYMBOL_GPL(ktime_get_boot_fast_ns);
+
/* Suspend-time cycles value for halted fast timekeeper. */
static cycle_t cycles_at_suspend;
@@ -589,9 +624,6 @@
nsec = (u32) tk->wall_to_monotonic.tv_nsec;
tk->tkr_mono.base = ns_to_ktime(seconds * NSEC_PER_SEC + nsec);
- /* Update the monotonic raw base */
- tk->tkr_raw.base = timespec64_to_ktime(tk->raw_time);
-
/*
* The sum of the nanoseconds portions of xtime and
* wall_to_monotonic can be greater/equal one second. Take
@@ -601,6 +633,9 @@
if (nsec >= NSEC_PER_SEC)
seconds++;
tk->ktime_sec = seconds;
+
+ /* Update the monotonic raw base */
+ tk->tkr_raw.base = ns_to_ktime(tk->raw_sec * NSEC_PER_SEC);
}
/* must hold timekeeper_lock */
@@ -642,7 +677,6 @@
static void timekeeping_forward_now(struct timekeeper *tk)
{
cycle_t cycle_now, delta;
- s64 nsec;
cycle_now = tk_clock_read(&tk->tkr_mono);
delta = clocksource_delta(cycle_now, tk->tkr_mono.cycle_last, tk->tkr_mono.mask);
@@ -654,10 +688,13 @@
/* If arch requires, add in get_arch_timeoffset() */
tk->tkr_mono.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_mono.shift;
- tk_normalize_xtime(tk);
- nsec = clocksource_cyc2ns(delta, tk->tkr_raw.mult, tk->tkr_raw.shift);
- timespec64_add_ns(&tk->raw_time, nsec);
+ tk->tkr_raw.xtime_nsec += delta * tk->tkr_raw.mult;
+
+ /* If arch requires, add in get_arch_timeoffset() */
+ tk->tkr_raw.xtime_nsec += (u64)arch_gettimeoffset() << tk->tkr_raw.shift;
+
+ tk_normalize_xtime(tk);
}
/**
@@ -1151,19 +1188,18 @@
void getrawmonotonic64(struct timespec64 *ts)
{
struct timekeeper *tk = &tk_core.timekeeper;
- struct timespec64 ts64;
unsigned long seq;
s64 nsecs;
do {
seq = read_seqcount_begin(&tk_core.seq);
+ ts->tv_sec = tk->raw_sec;
nsecs = timekeeping_get_ns(&tk->tkr_raw);
- ts64 = tk->raw_time;
} while (read_seqcount_retry(&tk_core.seq, seq));
- timespec64_add_ns(&ts64, nsecs);
- *ts = ts64;
+ ts->tv_nsec = 0;
+ timespec64_add_ns(ts, nsecs);
}
EXPORT_SYMBOL(getrawmonotonic64);
@@ -1287,8 +1323,7 @@
tk_setup_internals(tk, clock);
tk_set_xtime(tk, &now);
- tk->raw_time.tv_sec = 0;
- tk->raw_time.tv_nsec = 0;
+ tk->raw_sec = 0;
if (boot.tv_sec == 0 && boot.tv_nsec == 0)
boot = tk_xtime(tk);
@@ -1783,15 +1818,12 @@
*clock_set |= accumulate_nsecs_to_secs(tk);
/* Accumulate raw time */
- tk->tkr_raw.xtime_nsec += (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
tk->tkr_raw.xtime_nsec += tk->raw_interval << shift;
snsec_per_sec = (u64)NSEC_PER_SEC << tk->tkr_raw.shift;
while (tk->tkr_raw.xtime_nsec >= snsec_per_sec) {
tk->tkr_raw.xtime_nsec -= snsec_per_sec;
- tk->raw_time.tv_sec++;
+ tk->raw_sec++;
}
- tk->raw_time.tv_nsec = tk->tkr_raw.xtime_nsec >> tk->tkr_raw.shift;
- tk->tkr_raw.xtime_nsec -= (u64)tk->raw_time.tv_nsec << tk->tkr_raw.shift;
/* Accumulate error between NTP and clock interval */
tk->ntp_error += tk->ntp_tick << shift;
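
The timekeeping hunks above drop tk->raw_time and instead carry raw time as raw_sec plus tkr_raw.xtime_nsec, where xtime_nsec holds nanoseconds in the clocksource's shifted fixed-point form (ns << shift), exactly like the monotonic path. A standalone arithmetic sketch of that bookkeeping (not kernel code; the mult/shift values are invented examples):

/* Illustration of the shifted-nanosecond accumulation used by tkr_raw. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL

int main(void)
{
        uint32_t mult = 2796202667u;  /* example: ~24 MHz counter, shift 26 */
        uint32_t shift = 26;
        uint64_t raw_sec = 0;
        uint64_t xtime_nsec = 0;      /* nanoseconds << shift */

        /* timekeeping_forward_now(): accumulate a cycle delta. */
        uint64_t delta_cycles = 24000000;     /* roughly one second of ticks */
        xtime_nsec += delta_cycles * (uint64_t)mult;

        /* tk_normalize_xtime(): carry whole seconds out of xtime_nsec. */
        while (xtime_nsec >= (NSEC_PER_SEC << shift)) {
                xtime_nsec -= (NSEC_PER_SEC << shift);
                raw_sec++;
        }

        /* getrawmonotonic64(): readers shift back down to plain ns. */
        printf("raw = %llu s + %llu ns\n",
               (unsigned long long)raw_sec,
               (unsigned long long)(xtime_nsec >> shift));
        return 0;
}
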
diff --git a/kernel/time/timekeeping.h b/kernel/time/timekeeping.h
old mode 100644
new mode 100755
diff --git a/kernel/time/timekeeping_debug.c b/kernel/time/timekeeping_debug.c
old mode 100644
new mode 100755
diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h
old mode 100644
new mode 100755
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
old mode 100644
new mode 100755
index 3d7588a..20ed693
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -1182,7 +1182,9 @@
lock_map_acquire(&lockdep_map);
trace_timer_expire_entry(timer);
+ exynos_ss_irq(ESS_FLAG_CALL_TIMER_FN, fn, irqs_disabled(), ESS_FLAG_IN);
fn(data);
+ exynos_ss_irq(ESS_FLAG_CALL_TIMER_FN, fn, irqs_disabled(), ESS_FLAG_OUT);
trace_timer_expire_exit(timer);
lock_map_release(&lockdep_map);
@@ -1705,10 +1707,10 @@
static void __sched do_usleep_range(unsigned long min, unsigned long max)
{
ktime_t kmin;
- unsigned long delta;
+ u64 delta;
kmin = ktime_set(0, min * NSEC_PER_USEC);
- delta = (max - min) * NSEC_PER_USEC;
+ delta = (u64)(max - min) * NSEC_PER_USEC;
schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL);
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
old mode 100644
new mode 100755
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
old mode 100644
new mode 100755
diff --git a/kernel/torture.c b/kernel/torture.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
old mode 100644
new mode 100755
index e45db6b..006eefb
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -77,6 +77,9 @@
select CONTEXT_SWITCH_TRACER
bool
+config GPU_TRACEPOINTS
+ bool
+
config CONTEXT_SWITCH_TRACER
bool
@@ -162,6 +165,17 @@
address on the current task structure into a stack of calls.
+config PREEMPTIRQ_EVENTS
+ bool "Enable trace events for preempt and irq disable/enable"
+ select TRACE_IRQFLAGS
+ depends on DEBUG_PREEMPT || !PROVE_LOCKING
+ default n
+ help
+ Enable tracing of disable and enable events for preemption and irqs.
+ For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+ enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+ be disabled.
+
config IRQSOFF_TRACER
bool "Interrupts-off Latency Tracer"
default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
old mode 100644
new mode 100755
index 05ea516..4b35fb9
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -37,6 +37,7 @@
obj-$(CONFIG_TRACING) += trace_printk.o
obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
@@ -68,6 +69,7 @@
endif
obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
obj-$(CONFIG_UPROBE_EVENT) += trace_uprobe.o
+obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o
obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
old mode 100644
new mode 100755
index 210b8e7..faca524
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1789,6 +1789,8 @@
}
}
+SIO_PATCH_VERSION(ftrace_discard_bugfix, 1, 0, "");
+
void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
{
int i = 0;
@@ -1796,13 +1798,13 @@
if (rw & REQ_FLUSH)
rwbs[i++] = 'F';
- if (rw & WRITE)
- rwbs[i++] = 'W';
- else if (rw & REQ_DISCARD)
- rwbs[i++] = 'D';
- else if (bytes)
- rwbs[i++] = 'R';
- else
+ if (rw & REQ_DISCARD)
+ rwbs[i++] = 'D';
+ else if (rw & WRITE)
+ rwbs[i++] = 'W';
+ else if (bytes)
+ rwbs[i++] = 'R';
+ else
rwbs[i++] = 'N';
if (rw & REQ_FUA)
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/gpu-traces.c b/kernel/trace/gpu-traces.c
new file mode 100755
index 0000000..a4b3f00
--- /dev/null
+++ b/kernel/trace/gpu-traces.c
@@ -0,0 +1,23 @@
+/*
+ * GPU tracepoints
+ *
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/gpu.h>
+
+EXPORT_TRACEPOINT_SYMBOL(gpu_sched_switch);
+EXPORT_TRACEPOINT_SYMBOL(gpu_job_enqueue);
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
old mode 100644
new mode 100755
index eb4220a..1a87369
--- a/kernel/trace/power-traces.c
+++ b/kernel/trace/power-traces.c
@@ -15,4 +15,5 @@
EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume);
EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle);
+EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_frequency);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
old mode 100644
new mode 100755
index 74b20e3..65b12a4
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1154,7 +1154,7 @@
* not destabilized.
*/
bpage = kzalloc_node(ALIGN(sizeof(*bpage), cache_line_size()),
- GFP_KERNEL | __GFP_NORETRY,
+ GFP_KERNEL,
cpu_to_node(cpu));
if (!bpage)
goto free_pages;
@@ -1162,7 +1162,7 @@
list_add(&bpage->list, pages);
page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
+ GFP_KERNEL, 0);
if (!page)
goto free_pages;
bpage->page = page_address(page);
@@ -4450,7 +4450,7 @@
struct page *page;
page = alloc_pages_node(cpu_to_node(cpu),
- GFP_KERNEL | __GFP_NORETRY, 0);
+ GFP_KERNEL, 0);
if (!page)
return NULL;
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/rpm-traces.c b/kernel/trace/rpm-traces.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
old mode 100644
new mode 100755
index 8c097de..2ff223a
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -890,6 +890,7 @@
{ trace_clock, "perf", 1 },
{ ktime_get_mono_fast_ns, "mono", 1 },
{ ktime_get_raw_fast_ns, "mono_raw", 1 },
+ { ktime_get_boot_fast_ns, "boot", 1 },
ARCH_TRACE_CLOCKS
};
@@ -1362,6 +1363,7 @@
struct saved_cmdlines_buffer {
unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
unsigned *map_cmdline_to_pid;
+ unsigned *map_cmdline_to_tgid;
unsigned cmdline_num;
int cmdline_idx;
char *saved_cmdlines;
@@ -1395,12 +1397,23 @@
return -ENOMEM;
}
+ s->map_cmdline_to_tgid = kmalloc_array(val,
+ sizeof(*s->map_cmdline_to_tgid),
+ GFP_KERNEL);
+ if (!s->map_cmdline_to_tgid) {
+ kfree(s->map_cmdline_to_pid);
+ kfree(s->saved_cmdlines);
+ return -ENOMEM;
+ }
+
s->cmdline_idx = 0;
s->cmdline_num = val;
memset(&s->map_pid_to_cmdline, NO_CMDLINE_MAP,
sizeof(s->map_pid_to_cmdline));
memset(s->map_cmdline_to_pid, NO_CMDLINE_MAP,
val * sizeof(*s->map_cmdline_to_pid));
+ memset(s->map_cmdline_to_tgid, NO_CMDLINE_MAP,
+ val * sizeof(*s->map_cmdline_to_tgid));
return 0;
}
@@ -1566,14 +1579,17 @@
if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
return 0;
+ preempt_disable();
/*
* It's not the end of the world if we don't get
* the lock, but we also don't want to spin
* nor do we want to disable interrupts,
* so if we miss here, then better luck next time.
*/
- if (!arch_spin_trylock(&trace_cmdline_lock))
+ if (!arch_spin_trylock(&trace_cmdline_lock)) {
+ preempt_enable();
return 0;
+ }
idx = savedcmd->map_pid_to_cmdline[tsk->pid];
if (idx == NO_CMDLINE_MAP) {
@@ -1596,8 +1612,9 @@
}
set_cmdline(idx, tsk->comm);
-
+ savedcmd->map_cmdline_to_tgid[idx] = tsk->tgid;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
return 1;
}
@@ -1639,6 +1656,35 @@
preempt_enable();
}
+static int __find_tgid_locked(int pid)
+{
+ unsigned map;
+ int tgid;
+
+ map = savedcmd->map_pid_to_cmdline[pid];
+ if (map != NO_CMDLINE_MAP)
+ tgid = savedcmd->map_cmdline_to_tgid[map];
+ else
+ tgid = -1;
+
+ return tgid;
+}
+
+int trace_find_tgid(int pid)
+{
+ int tgid;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ tgid = __find_tgid_locked(pid);
+
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ return tgid;
+}
+
void tracing_record_cmdline(struct task_struct *tsk)
{
if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
@@ -2611,6 +2657,13 @@
"# | | | | |\n");
}
+static void print_func_help_header_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# TASK-PID TGID CPU# TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | | |\n");
+}
+
static void print_func_help_header_irq(struct trace_buffer *buf, struct seq_file *m)
{
print_event_info(buf, m);
@@ -2623,6 +2676,18 @@
"# | | | |||| | |\n");
}
+static void print_func_help_header_irq_tgid(struct trace_buffer *buf, struct seq_file *m)
+{
+ print_event_info(buf, m);
+ seq_puts(m, "# _-----=> irqs-off\n");
+ seq_puts(m, "# / _----=> need-resched\n");
+ seq_puts(m, "# | / _---=> hardirq/softirq\n");
+ seq_puts(m, "# || / _--=> preempt-depth\n");
+ seq_puts(m, "# ||| / delay\n");
+ seq_puts(m, "# TASK-PID TGID CPU# |||| TIMESTAMP FUNCTION\n");
+ seq_puts(m, "# | | | | |||| | |\n");
+}
+
void
print_trace_header(struct seq_file *m, struct trace_iterator *iter)
{
@@ -2696,13 +2761,14 @@
if (!(iter->iter_flags & TRACE_FILE_ANNOTATE))
return;
- if (iter->started && cpumask_test_cpu(iter->cpu, iter->started))
+ if (cpumask_available(iter->started) &&
+ cpumask_test_cpu(iter->cpu, iter->started))
return;
if (per_cpu_ptr(iter->trace_buffer->data, iter->cpu)->skipped_entries)
return;
- if (iter->started)
+ if (cpumask_available(iter->started))
cpumask_set_cpu(iter->cpu, iter->started);
/* Don't print started cpu buffer for the first entry of the trace */
@@ -2935,9 +3001,15 @@
} else {
if (!(trace_flags & TRACE_ITER_VERBOSE)) {
if (trace_flags & TRACE_ITER_IRQ_INFO)
- print_func_help_header_irq(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_irq_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header_irq(iter->trace_buffer, m);
else
- print_func_help_header(iter->trace_buffer, m);
+ if (trace_flags & TRACE_ITER_TGID)
+ print_func_help_header_tgid(iter->trace_buffer, m);
+ else
+ print_func_help_header(iter->trace_buffer, m);
}
}
}
@@ -3940,10 +4012,15 @@
{
char buf[64];
int r;
+ unsigned int n;
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
- r = scnprintf(buf, sizeof(buf), "%u\n", savedcmd->cmdline_num);
+ n = savedcmd->cmdline_num;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ r = scnprintf(buf, sizeof(buf), "%u\n", n);
return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
}
@@ -3952,6 +4029,7 @@
{
kfree(s->saved_cmdlines);
kfree(s->map_cmdline_to_pid);
+ kfree(s->map_cmdline_to_tgid);
kfree(s);
}
@@ -3968,10 +4046,12 @@
return -ENOMEM;
}
+ preempt_disable();
arch_spin_lock(&trace_cmdline_lock);
savedcmd_temp = savedcmd;
savedcmd = s;
arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
free_saved_cmdlines_buffer(savedcmd_temp);
return 0;
@@ -4184,6 +4264,78 @@
}
static ssize_t
+tracing_saved_tgids_read(struct file *file, char __user *ubuf,
+ size_t cnt, loff_t *ppos)
+{
+ char *file_buf;
+ char *buf;
+ int len = 0;
+ int i;
+ int *pids;
+ int n = 0;
+
+ preempt_disable();
+ arch_spin_lock(&trace_cmdline_lock);
+
+ pids = kmalloc_array(savedcmd->cmdline_num, 2*sizeof(int), GFP_KERNEL);
+ if (!pids) {
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < savedcmd->cmdline_num; i++) {
+ int pid;
+
+ pid = savedcmd->map_cmdline_to_pid[i];
+ if (pid == -1 || pid == NO_CMDLINE_MAP)
+ continue;
+
+ pids[n] = pid;
+ pids[n+1] = __find_tgid_locked(pid);
+ n += 2;
+ }
+ arch_spin_unlock(&trace_cmdline_lock);
+ preempt_enable();
+
+ if (n == 0) {
+ kfree(pids);
+ return 0;
+ }
+
+ /* enough to hold max pair of pids + space, lf and nul */
+ len = n * 12;
+ file_buf = kmalloc(len, GFP_KERNEL);
+ if (!file_buf) {
+ kfree(pids);
+ return -ENOMEM;
+ }
+
+ buf = file_buf;
+ for (i = 0; i < n && len > 0; i += 2) {
+ int r;
+
+ r = snprintf(buf, len, "%d %d\n", pids[i], pids[i+1]);
+ buf += r;
+ len -= r;
+ }
+
+ len = simple_read_from_buffer(ubuf, cnt, ppos,
+ file_buf, buf - file_buf);
+
+ kfree(file_buf);
+ kfree(pids);
+
+ return len;
+}
+
+static const struct file_operations tracing_saved_tgids_fops = {
+ .open = tracing_open_generic,
+ .read = tracing_saved_tgids_read,
+ .llseek = generic_file_llseek,
+};
+
+static ssize_t
tracing_set_trace_read(struct file *filp, char __user *ubuf,
size_t cnt, loff_t *ppos)
{
@@ -5047,7 +5199,7 @@
return ret;
/* must have at least 1 entry */
- if (!val)
+ if (!val || val > 32768)
return -EINVAL;
/* value is in KB */
@@ -6812,6 +6964,9 @@
trace_create_file("trace_marker", 0220, d_tracer,
tr, &tracing_mark_fops);
+ trace_create_file("saved_tgids", 0444, d_tracer,
+ tr, &tracing_saved_tgids_fops);
+
trace_create_file("trace_clock", 0644, d_tracer, tr,
&trace_clock_fops);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
old mode 100644
new mode 100755
index 919d9d0..e1265f9
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -656,6 +656,7 @@
extern cycle_t ftrace_now(int cpu);
extern void trace_find_cmdline(int pid, char comm[]);
+extern int trace_find_tgid(int pid);
#ifdef CONFIG_DYNAMIC_FTRACE
extern unsigned long ftrace_update_tot_cnt;
@@ -970,7 +971,8 @@
FUNCTION_FLAGS \
FGRAPH_FLAGS \
STACK_FLAGS \
- BRANCH_FLAGS
+ BRANCH_FLAGS \
+ C(TGID, "print-tgid"),
/*
* By defining C, we can make TRACE_FLAGS a list of bit names
diff --git a/kernel/trace/trace_benchmark.c b/kernel/trace/trace_benchmark.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_benchmark.h b/kernel/trace/trace_benchmark.h
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_events_filter_test.h b/kernel/trace/trace_events_filter_test.h
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
old mode 100644
new mode 100755
index e212ec4..55002f2
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -8,6 +8,7 @@
*/
#include <linux/uaccess.h>
#include <linux/ftrace.h>
+#include <linux/interrupt.h>
#include <linux/slab.h>
#include <linux/fs.h>
@@ -64,6 +65,9 @@
#define TRACE_GRAPH_INDENT 2
+/* Flag options */
+#define TRACE_GRAPH_PRINT_FLAT 0x80
+
static unsigned int max_depth;
static struct tracer_opt trace_opts[] = {
@@ -87,6 +91,8 @@
{ TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
/* Include time within nested functions */
{ TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
+ /* Use standard trace formatting rather than hierarchical */
+ { TRACER_OPT(funcgraph-flat, TRACE_GRAPH_PRINT_FLAT) },
{ } /* Empty entry */
};
@@ -1179,6 +1185,9 @@
int cpu = iter->cpu;
int ret;
+ if (flags & TRACE_GRAPH_PRINT_FLAT)
+ return TRACE_TYPE_UNHANDLED;
+
if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
return TRACE_TYPE_HANDLED;
@@ -1236,13 +1245,6 @@
return print_graph_function_flags(iter, tracer_flags.val);
}
-static enum print_line_t
-print_graph_function_event(struct trace_iterator *iter, int flags,
- struct trace_event *event)
-{
- return print_graph_function(iter);
-}
-
static void print_lat_header(struct seq_file *s, u32 flags)
{
static const char spaces[] = " " /* 16 spaces */
@@ -1311,6 +1313,11 @@
struct trace_iterator *iter = s->private;
struct trace_array *tr = iter->tr;
+ if (flags & TRACE_GRAPH_PRINT_FLAT) {
+ trace_default_header(s);
+ return;
+ }
+
if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
return;
@@ -1392,19 +1399,6 @@
return 0;
}
-static struct trace_event_functions graph_functions = {
- .trace = print_graph_function_event,
-};
-
-static struct trace_event graph_trace_entry_event = {
- .type = TRACE_GRAPH_ENT,
- .funcs = &graph_functions,
-};
-
-static struct trace_event graph_trace_ret_event = {
- .type = TRACE_GRAPH_RET,
- .funcs = &graph_functions
-};
static struct tracer graph_trace __tracer_data = {
.name = "function_graph",
@@ -1481,16 +1475,6 @@
{
max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1);
- if (!register_trace_event(&graph_trace_entry_event)) {
- pr_warning("Warning: could not register graph trace events\n");
- return 1;
- }
-
- if (!register_trace_event(&graph_trace_ret_event)) {
- pr_warning("Warning: could not register graph trace events\n");
- return 1;
- }
-
return register_tracer(&graph_trace);
}
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
old mode 100644
new mode 100755
index be3222b..21b162c
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -16,6 +16,10 @@
#include "trace.h"
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
static struct trace_array *irqsoff_trace __read_mostly;
static int tracer_enabled __read_mostly;
@@ -451,63 +455,43 @@
#else /* !CONFIG_PROVE_LOCKING */
/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
-/*
* We are only interested in hardirq on/off events:
*/
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_on);
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
}
-EXPORT_SYMBOL(trace_hardirqs_off);
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
stop_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
{
if (!preempt_trace() && irq_trace())
start_critical_timing(CALLER_ADDR0, caller_addr);
}
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
#endif /* CONFIG_PROVE_LOCKING */
#endif /* CONFIG_IRQSOFF_TRACER */
#ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
stop_critical_timing(a0, a1);
}
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
{
if (preempt_trace() && !irq_trace())
start_critical_timing(a0, a1);
@@ -770,3 +754,100 @@
return 0;
}
core_initcall(init_irqsoff_tracer);
+#endif /* CONFIG_IRQSOFF_TRACER || CONFIG_PREEMPT_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+void trace_hardirqs_on(void)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_on();
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+ tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+ if (!this_cpu_read(tracing_irq_cpu))
+ return;
+
+ trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_on_caller(caller_addr);
+
+ this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+ if (this_cpu_read(tracing_irq_cpu))
+ return;
+
+ this_cpu_write(tracing_irq_cpu, 1);
+
+ trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
+ tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACER) || \
+ (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_enable_rcuidle(a0, a1);
+ tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+ trace_preempt_disable_rcuidle(a0, a1);
+ tracer_preempt_off(a0, a1);
+}
+#endif
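
With CONFIG_PREEMPTIRQ_EVENTS enabled, the restructured file above emits irq_enable/irq_disable (and preempt_enable/preempt_disable) tracepoints under the preemptirq event group. A hedged sketch of switching the group on from userspace (the tracefs path /sys/kernel/debug/tracing is assumed):

/* Sketch: enable the preemptirq trace events added above. */
#include <stdio.h>

static int echo(const char *path, const char *val)
{
        FILE *f = fopen(path, "w");

        if (!f) {
                perror(path);
                return -1;
        }
        fputs(val, f);
        fclose(f);
        return 0;
}

int main(void)
{
        echo("/sys/kernel/debug/tracing/events/preemptirq/enable", "1");
        echo("/sys/kernel/debug/tracing/tracing_on", "1");
        return 0;
}
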
diff --git a/kernel/trace/trace_kdb.c b/kernel/trace/trace_kdb.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
old mode 100644
new mode 100755
index 2829821..3bc4b6d
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -526,11 +526,21 @@
unsigned long long t;
unsigned long secs, usec_rem;
char comm[TASK_COMM_LEN];
+ int tgid;
trace_find_cmdline(entry->pid, comm);
- trace_seq_printf(s, "%16s-%-5d [%03d] ",
- comm, entry->pid, iter->cpu);
+ trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
+
+ if (tr->trace_flags & TRACE_ITER_TGID) {
+ tgid = trace_find_tgid(entry->pid);
+ if (tgid < 0)
+ trace_seq_puts(s, "(-----) ");
+ else
+ trace_seq_printf(s, "(%5d) ", tgid);
+ }
+
+ trace_seq_printf(s, "[%03d] ", iter->cpu);
if (tr->trace_flags & TRACE_ITER_IRQ_INFO)
trace_print_lat_fmt(s, entry);
@@ -845,6 +855,174 @@
.funcs = &trace_fn_funcs,
};
+/* TRACE_GRAPH_ENT */
+static enum print_line_t trace_graph_ent_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_puts(s, "graph_ent: func=");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->graph_ent.func, flags))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ trace_seq_puts(s, "\n");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "%lx %d\n",
+ field->graph_ent.func,
+ field->graph_ent.depth);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD(s, field->graph_ent.func);
+ SEQ_PUT_HEX_FIELD(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ent_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->graph_ent.func);
+ SEQ_PUT_FIELD(s, field->graph_ent.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ent_funcs = {
+ .trace = trace_graph_ent_trace,
+ .raw = trace_graph_ent_raw,
+ .hex = trace_graph_ent_hex,
+ .binary = trace_graph_ent_bin,
+};
+
+static struct trace_event trace_graph_ent_event = {
+ .type = TRACE_GRAPH_ENT,
+ .funcs = &trace_graph_ent_funcs,
+};
+
+/* TRACE_GRAPH_RET */
+static enum print_line_t trace_graph_ret_trace(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct trace_seq *s = &iter->seq;
+ struct trace_entry *entry = iter->ent;
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, entry);
+
+ trace_seq_puts(s, "graph_ret: func=");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ if (!seq_print_ip_sym(s, field->ret.func, flags))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ trace_seq_puts(s, "\n");
+ if (trace_seq_has_overflowed(s))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_raw(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+
+ trace_assign_type(field, iter->ent);
+
+ trace_seq_printf(&iter->seq, "%lx %lld %lld %ld %d\n",
+ field->ret.func,
+ field->ret.calltime,
+ field->ret.rettime,
+ field->ret.overrun,
+ field->ret.depth);
+ if (trace_seq_has_overflowed(&iter->seq))
+ return TRACE_TYPE_PARTIAL_LINE;
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_hex(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_HEX_FIELD(s, field->ret.func);
+ SEQ_PUT_HEX_FIELD(s, field->ret.calltime);
+ SEQ_PUT_HEX_FIELD(s, field->ret.rettime);
+ SEQ_PUT_HEX_FIELD(s, field->ret.overrun);
+ SEQ_PUT_HEX_FIELD(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_bin(struct trace_iterator *iter, int flags,
+ struct trace_event *event)
+{
+ struct ftrace_graph_ret_entry *field;
+ struct trace_seq *s = &iter->seq;
+
+ trace_assign_type(field, iter->ent);
+
+ SEQ_PUT_FIELD(s, field->ret.func);
+ SEQ_PUT_FIELD(s, field->ret.calltime);
+ SEQ_PUT_FIELD(s, field->ret.rettime);
+ SEQ_PUT_FIELD(s, field->ret.overrun);
+ SEQ_PUT_FIELD(s, field->ret.depth);
+
+ return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ret_funcs = {
+ .trace = trace_graph_ret_trace,
+ .raw = trace_graph_ret_raw,
+ .hex = trace_graph_ret_hex,
+ .binary = trace_graph_ret_bin,
+};
+
+static struct trace_event trace_graph_ret_event = {
+ .type = TRACE_GRAPH_RET,
+ .funcs = &trace_graph_ret_funcs,
+};
+
/* TRACE_CTX an TRACE_WAKE */
static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
char *delim)
@@ -1222,6 +1400,8 @@
static struct trace_event *events[] __initdata = {
&trace_fn_event,
+ &trace_graph_ent_event,
+ &trace_graph_ret_event,
&trace_ctx_event,
&trace_wake_event,
&trace_stack_event,
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
old mode 100644
new mode 100755
index ad1d616..e82cff5
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -304,7 +304,7 @@
if (!*fmt)
return 0;
- seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt);
+ seq_printf(m, "0x%lx : \"", 0L);
/*
* Tabs and new lines need to be converted.
diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_probe.h b/kernel/trace/trace_probe.h
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_selftest_dynamic.c b/kernel/trace/trace_selftest_dynamic.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_seq.c b/kernel/trace/trace_seq.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
old mode 100644
new mode 100755
diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c
old mode 100644
new mode 100755
diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
old mode 100644
new mode 100755
diff --git a/kernel/tsacct.c b/kernel/tsacct.c
old mode 100644
new mode 100755
diff --git a/kernel/uid16.c b/kernel/uid16.c
old mode 100644
new mode 100755
diff --git a/kernel/up.c b/kernel/up.c
old mode 100644
new mode 100755
diff --git a/kernel/user-return-notifier.c b/kernel/user-return-notifier.c
old mode 100644
new mode 100755
diff --git a/kernel/user.c b/kernel/user.c
old mode 100644
new mode 100755
index b069ccb..41e94e4
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -16,6 +16,7 @@
#include <linux/interrupt.h>
#include <linux/export.h>
#include <linux/user_namespace.h>
+#include <linux/proc_fs.h>
#include <linux/proc_ns.h>
/*
@@ -201,6 +202,7 @@
}
spin_unlock_irq(&uidhash_lock);
}
+ proc_register_uid(uid);
return up;
@@ -222,6 +224,7 @@
spin_lock_irq(&uidhash_lock);
uid_hash_insert(&root_user, uidhashentry(GLOBAL_ROOT_UID));
spin_unlock_irq(&uidhash_lock);
+ proc_register_uid(GLOBAL_ROOT_UID);
return 0;
}
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
old mode 100644
new mode 100755
diff --git a/kernel/utsname.c b/kernel/utsname.c
old mode 100644
new mode 100755
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
old mode 100644
new mode 100755
diff --git a/kernel/watchdog.c b/kernel/watchdog.c
old mode 100644
new mode 100755
index c1e0b5f..0270467
--- a/kernel/watchdog.c
+++ b/kernel/watchdog.c
@@ -25,6 +25,22 @@
#include <linux/kvm_para.h>
#include <linux/perf_event.h>
#include <linux/kthread.h>
+#include <linux/sec_debug.h>
+
+#include <linux/exynos-ss.h>
+#include <linux/irqflags.h>
+
+#ifdef CONFIG_SEC_DEBUG
+static const char * const hl_to_name[] = {
+ "NONE", "TASK STUCK", "IRQ STUCK",
+ "IDLE STUCK", "SMCCALL STUCK", "IRQ STORM",
+ "HRTIMER ERROR", "UNKNOWN STUCK"
+};
+
+static const char * const sl_to_name[] = {
+ "NONE", "SOFTIRQ STUCK", "TASK STUCK", "UNKNOWN STUCK"
+};
+#endif
/*
* The run state of the lockup detectors is controlled by the content of the
@@ -90,8 +106,10 @@
static int __read_mostly watchdog_suspended;
static u64 __read_mostly sample_period;
+static unsigned long __read_mostly hardlockup_thresh;
static DEFINE_PER_CPU(unsigned long, watchdog_touch_ts);
+static DEFINE_PER_CPU(unsigned long, hardlockup_touch_ts);
static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog);
static DEFINE_PER_CPU(struct hrtimer, watchdog_hrtimer);
static DEFINE_PER_CPU(bool, softlockup_touch_sync);
@@ -103,10 +121,27 @@
static DEFINE_PER_CPU(bool, hard_watchdog_warn);
static DEFINE_PER_CPU(bool, watchdog_nmi_touch);
static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved);
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static cpumask_t __read_mostly watchdog_cpus;
+ATOMIC_NOTIFIER_HEAD(hardlockup_notifier_list);
+EXPORT_SYMBOL(hardlockup_notifier_list);
+#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static DEFINE_PER_CPU(struct perf_event *, watchdog_ev);
#endif
static unsigned long soft_lockup_nmi_warn;
+#ifdef CONFIG_SEC_DEBUG
+static DEFINE_PER_CPU(struct softlockup_info, percpu_sl_info);
+static void check_softlockup_type(void);
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static DEFINE_PER_CPU(struct hardlockup_info, percpu_hl_info);
+static void check_hardlockup_type(unsigned int cpu);
+#endif
+#endif
+
/* boot commands */
/*
* Should we panic when a soft-lockup or hard-lockup occurs:
@@ -114,7 +149,7 @@
#ifdef CONFIG_HARDLOCKUP_DETECTOR
unsigned int __read_mostly hardlockup_panic =
CONFIG_BOOTPARAM_HARDLOCKUP_PANIC_VALUE;
-static unsigned long hardlockup_allcpu_dumped;
+static unsigned long __maybe_unused hardlockup_allcpu_dumped;
/*
* We may not want to enable hard lockup detection by default in all cases,
* for example when running the kernel as a guest on a hypervisor. In these
@@ -217,12 +252,14 @@
* hardlockup detector generates a warning
*/
sample_period = get_softlockup_thresh() * ((u64)NSEC_PER_SEC / 5);
+ hardlockup_thresh = sample_period * 3 / NSEC_PER_SEC;
}
/* Commands for resetting the watchdog */
static void __touch_watchdog(void)
{
__this_cpu_write(watchdog_touch_ts, get_timestamp());
+ __this_cpu_write(hardlockup_touch_ts, get_timestamp());
}
void touch_softlockup_watchdog(void)
@@ -271,7 +308,7 @@
__this_cpu_write(watchdog_touch_ts, 0);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
/* watchdog detector functions */
static bool is_hardlockup(void)
{
@@ -285,6 +322,88 @@
}
#endif
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static unsigned int watchdog_next_cpu(unsigned int cpu)
+{
+ cpumask_t cpus = watchdog_cpus;
+ unsigned int next_cpu;
+
+ next_cpu = cpumask_next(cpu, &cpus);
+ if (next_cpu >= nr_cpu_ids)
+ next_cpu = cpumask_first(&cpus);
+
+ if (next_cpu == cpu)
+ return nr_cpu_ids;
+
+ return next_cpu;
+}
+
+static int is_hardlockup_other_cpu(unsigned int cpu)
+{
+ unsigned long hrint = per_cpu(hrtimer_interrupts, cpu);
+
+ if (per_cpu(hrtimer_interrupts_saved, cpu) == hrint) {
+ unsigned long now = get_timestamp();
+ unsigned long touch_ts = per_cpu(hardlockup_touch_ts, cpu);
+
+ if (time_after(now, touch_ts) &&
+ (now - touch_ts >= hardlockup_thresh))
+ return 1;
+ }
+
+ per_cpu(hrtimer_interrupts_saved, cpu) = hrint;
+ return 0;
+}
+
+static void watchdog_check_hardlockup_other_cpu(void)
+{
+ unsigned int next_cpu;
+
+ /*
+ * Test for hardlockups every 3 samples. The sample period is
+ * watchdog_thresh * 2 / 5, so 3 samples gets us back to slightly over
+ * watchdog_thresh (over by 20%).
+ */
+ if (__this_cpu_read(hrtimer_interrupts) % 3 != 0)
+ return;
+
+ /* check for a hardlockup on the next cpu */
+ next_cpu = watchdog_next_cpu(smp_processor_id());
+ if (next_cpu >= nr_cpu_ids)
+ return;
+
+ smp_rmb();
+
+ if (per_cpu(watchdog_nmi_touch, next_cpu) == true) {
+ per_cpu(watchdog_nmi_touch, next_cpu) = false;
+ return;
+ }
+
+ if (is_hardlockup_other_cpu(next_cpu)) {
+#ifdef CONFIG_SEC_DEBUG
+ check_hardlockup_type(next_cpu);
+#endif
+ /* only warn once */
+ if (per_cpu(hard_watchdog_warn, next_cpu) == true)
+ return;
+
+ if (hardlockup_panic) {
+ exynos_ss_set_hardlockup(hardlockup_panic);
+ atomic_notifier_call_chain(&hardlockup_notifier_list, 0, (void *)&next_cpu);
+ panic("Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+ } else {
+ WARN(1, "Watchdog detected hard LOCKUP on cpu %u", next_cpu);
+ }
+
+ per_cpu(hard_watchdog_warn, next_cpu) = true;
+ } else {
+ per_cpu(hard_watchdog_warn, next_cpu) = false;
+ }
+}
+#else
+static inline void watchdog_check_hardlockup_other_cpu(void) { return; }
+#endif
+
static int is_softlockup(unsigned long touch_ts)
{
unsigned long now = get_timestamp();
@@ -297,7 +416,7 @@
return 0;
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
static struct perf_event_attr wd_hw_attr = {
.type = PERF_TYPE_HARDWARE,
@@ -349,8 +468,10 @@
!test_and_set_bit(0, &hardlockup_allcpu_dumped))
trigger_allbutself_cpu_backtrace();
- if (hardlockup_panic)
+ if (hardlockup_panic) {
+ exynos_ss_set_hardlockup(hardlockup_panic);
panic("Hard LOCKUP");
+ }
__this_cpu_write(hard_watchdog_warn, true);
return;
@@ -359,7 +480,7 @@
__this_cpu_write(hard_watchdog_warn, false);
return;
}
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
static void watchdog_interrupt_count(void)
{
@@ -380,9 +501,15 @@
int duration;
int softlockup_all_cpu_backtrace = sysctl_softlockup_all_cpu_backtrace;
+	/* try to enable exynos-snapshot's log_kevent if it was turned off because of an RCU stall */
+ exynos_ss_try_enable("log_kevent", NSEC_PER_SEC * 15);
+
/* kick the hardlockup detector */
watchdog_interrupt_count();
+ /* test for hardlockups on the next cpu */
+ watchdog_check_hardlockup_other_cpu();
+
/* kick the softlockup detector */
wake_up_process(__this_cpu_read(softlockup_watchdog));
@@ -450,9 +577,12 @@
}
}
- pr_emerg("BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
+ pr_auto(ASL1, "BUG: soft lockup - CPU#%d stuck for %us! [%s:%d]\n",
smp_processor_id(), duration,
current->comm, task_pid_nr(current));
+#ifdef CONFIG_SEC_DEBUG
+ check_softlockup_type();
+#endif
__this_cpu_write(softlockup_task_ptr_saved, current);
print_modules();
print_irqtrace_events(current);
@@ -473,8 +603,15 @@
}
add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
- if (softlockup_panic)
+ if (softlockup_panic) {
+#ifdef CONFIG_SEC_DEBUG_EXTRA_INFO
+ if (regs) {
+ sec_debug_set_extra_info_fault(WATCHDOG_FAULT, (unsigned long)regs->pc, regs);
+ sec_debug_set_extra_info_backtrace(regs);
+ }
+#endif
panic("softlockup: hung tasks");
+ }
__this_cpu_write(soft_watchdog_warn, true);
} else
__this_cpu_write(soft_watchdog_warn, false);
@@ -560,7 +697,7 @@
watchdog_nmi_disable(cpu);
}
-#ifdef CONFIG_HARDLOCKUP_DETECTOR
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_NMI
/*
* People like the simple clean cpu node info on boot.
* Reduce the watchdog noise by only printing messages
@@ -659,9 +796,44 @@
}
#else
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
+static int watchdog_nmi_enable(unsigned int cpu)
+{
+ /*
+ * The new cpu will be marked online before the first hrtimer interrupt
+ * runs on it. If another cpu tests for a hardlockup on the new cpu
+ * before it has run its first hrtimer, it will get a false positive.
+ * Touch the watchdog on the new cpu to delay the first check for at
+ * least 3 sampling periods to guarantee one hrtimer has run on the new
+ * cpu.
+ */
+ per_cpu(watchdog_nmi_touch, cpu) = true;
+ smp_wmb();
+ cpumask_set_cpu(cpu, &watchdog_cpus);
+ return 0;
+}
+
+static void watchdog_nmi_disable(unsigned int cpu)
+{
+ unsigned int next_cpu = watchdog_next_cpu(cpu);
+
+ /*
+ * Offlining this cpu will cause the cpu before this one to start
+ * checking the one after this one. If this cpu just finished checking
+ * the next cpu and updating hrtimer_interrupts_saved, and then the
+ * previous cpu checks it within one sample period, it will trigger a
+ * false positive. Touch the watchdog on the next cpu to prevent it.
+ */
+ if (next_cpu < nr_cpu_ids)
+ per_cpu(watchdog_nmi_touch, next_cpu) = true;
+ smp_wmb();
+ cpumask_clear_cpu(cpu, &watchdog_cpus);
+}
+#else
static int watchdog_nmi_enable(unsigned int cpu) { return 0; }
static void watchdog_nmi_disable(unsigned int cpu) { return; }
-#endif /* CONFIG_HARDLOCKUP_DETECTOR */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU */
+#endif /* CONFIG_HARDLOCKUP_DETECTOR_NMI */
static struct smp_hotplug_thread watchdog_threads = {
.store = &softlockup_watchdog,
@@ -1051,3 +1223,100 @@
if (watchdog_enabled)
watchdog_enable_all_cpus();
}
+
+#ifdef CONFIG_SEC_DEBUG
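+/* record which softirq handler is running and when it started, for soft lockup classification */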
+void sl_softirq_entry(const char *softirq_type, void *fn)
+{
+ struct softlockup_info *sl_info = per_cpu_ptr(&percpu_sl_info, smp_processor_id());
+
+ if (softirq_type) {
+ strncpy(sl_info->softirq_info.softirq_type, softirq_type, sizeof(sl_info->softirq_info.softirq_type) - 1);
+		sl_info->softirq_info.softirq_type[sizeof(sl_info->softirq_info.softirq_type) - 1] = '\0';
+ }
+ sl_info->softirq_info.last_arrival = local_clock();
+ sl_info->softirq_info.fn = fn;
+}
+
+void sl_softirq_exit(void)
+{
+ struct softlockup_info *sl_info = per_cpu_ptr(&percpu_sl_info, smp_processor_id());
+
+ sl_info->softirq_info.last_arrival = 0;
+ sl_info->softirq_info.fn = (void *)0;
+ sl_info->softirq_info.softirq_type[0] = '\0';
+}
+
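+/* classify the soft lockup: report a long-running softirq handler if one is active, otherwise use the task info collected by exynos-snapshot */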
+static void check_softlockup_type(void)
+{
+ int cpu = smp_processor_id();
+ struct softlockup_info *sl_info = per_cpu_ptr(&percpu_sl_info, cpu);
+
+ sl_info->preempt_count = preempt_count();
+ if (softirq_count() &&
+ sl_info->softirq_info.last_arrival != 0 && sl_info->softirq_info.fn != NULL) {
+ sl_info->delay_time = local_clock() - sl_info->softirq_info.last_arrival;
+ sl_info->sl_type = SL_SOFTIRQ_STUCK;
+		pr_auto(ASL9, "Softlockup state: %s, Latency: %lluns, Softirq type: %s, Func: %pf, preempt_count: %x\n",
+ sl_to_name[sl_info->sl_type], sl_info->delay_time, sl_info->softirq_info.softirq_type, sl_info->softirq_info.fn, sl_info->preempt_count);
+ } else {
+ exynos_ss_get_softlockup_info(cpu, sl_info);
+ if (!(preempt_count() & PREEMPT_MASK) || softirq_count())
+ sl_info->sl_type = SL_UNKNOWN_STUCK;
+ pr_auto(ASL9, "Softlockup state: %s, Latency: %lluns, Task: %s, preempt_count: %x\n",
+ sl_to_name[sl_info->sl_type], sl_info->delay_time, sl_info->task_info.task_comm, sl_info->preempt_count);
+ }
+}
+
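+/* soft lockup threshold in ns (watchdog_thresh * 2, in seconds), exported for exynos-snapshot */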
+unsigned long long get_ess_softlockup_thresh(void)
+{
+ return watchdog_thresh * 2 * NSEC_PER_SEC;
+}
+EXPORT_SYMBOL(get_ess_softlockup_thresh);
+
+#ifdef CONFIG_HARDLOCKUP_DETECTOR_OTHER_CPU
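+/* print a one-line summary of the hard lockup cause reported by exynos-snapshot */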
+static void check_hardlockup_type(unsigned int cpu)
+{
+ struct hardlockup_info *hl_info = per_cpu_ptr(&percpu_hl_info, cpu);
+
+ exynos_ss_get_hardlockup_info(cpu, hl_info);
+
+ if (hl_info->hl_type == HL_TASK_STUCK) {
+ pr_auto(ASL9, "Hardlockup state: %s, Latency: %lluns, TASK: %s\n",
+ hl_to_name[hl_info->hl_type], hl_info->delay_time, hl_info->task_info.task_comm);
+ } else if (hl_info->hl_type == HL_IRQ_STUCK) {
+ pr_auto(ASL9, "Hardlockup state: %s, Latency: %lluns, IRQ: %d, Func: %pf\n",
+ hl_to_name[hl_info->hl_type], hl_info->delay_time, hl_info->irq_info.irq, hl_info->irq_info.fn);
+ } else if (hl_info->hl_type == HL_IDLE_STUCK) {
+ pr_auto(ASL9, "Hardlockup state: %s, Latency: %lluns, mode: %s\n",
+ hl_to_name[hl_info->hl_type], hl_info->delay_time, hl_info->cpuidle_info.mode);
+ } else if (hl_info->hl_type == HL_SMC_CALL_STUCK) {
+ pr_auto(ASL9, "Hardlockup state: %s, Latency: %lluns, CMD: %u\n",
+ hl_to_name[hl_info->hl_type], hl_info->delay_time, hl_info->smc_info.cmd);
+ } else if (hl_info->hl_type == HL_IRQ_STORM) {
+		pr_auto(ASL9, "Hardlockup state: %s, Latency: %lluns, IRQ: %d, Func: %pf, Avg period: %lluns\n",
+ hl_to_name[hl_info->hl_type], hl_info->delay_time, hl_info->irq_info.irq, hl_info->irq_info.fn, hl_info->irq_info.avg_period);
+ } else if (hl_info->hl_type == HL_UNKNOWN_STUCK) {
+ pr_auto(ASL9, "Hardlockup state: %s, Latency: %lluns, TASK: %s\n",
+ hl_to_name[hl_info->hl_type], hl_info->delay_time, hl_info->task_info.task_comm);
+ }
+}
+
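+/* reclassify HL_TASK_STUCK as HL_UNKNOWN_STUCK when interrupts are enabled, since a task stuck with IRQs on should still receive timer interrupts */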
+void update_hardlockup_type(unsigned int cpu)
+{
+ struct hardlockup_info *hl_info = per_cpu_ptr(&percpu_hl_info, cpu);
+
+ if (hl_info->hl_type == HL_TASK_STUCK && !irqs_disabled()) {
+ hl_info->hl_type = HL_UNKNOWN_STUCK;
+		pr_info("Unknown stuck: IRQs were enabled but no IRQ was generated\n");
+ }
+}
+EXPORT_SYMBOL(update_hardlockup_type);
+
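+/* hard lockup threshold converted to ns, minus one sample period */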
+unsigned long long get_hardlockup_thresh(void)
+{
+ return (hardlockup_thresh * NSEC_PER_SEC - sample_period);
+}
+EXPORT_SYMBOL(get_hardlockup_thresh);
+#endif
+
+#endif
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
old mode 100644
new mode 100755
index d8a2084..41a52fa
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,6 +48,7 @@
#include <linux/nodemask.h>
#include <linux/moduleparam.h>
#include <linux/uaccess.h>
+#include <linux/exynos-ss.h>
#include "workqueue_internal.h"
@@ -2061,7 +2062,9 @@
lock_map_acquire_read(&pwq->wq->lockdep_map);
lock_map_acquire(&lockdep_map);
trace_workqueue_execute_start(work);
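+	/* log work function entry/exit into the exynos-snapshot work trace */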
+ exynos_ss_work(worker, worker->task, worker->current_func, ESS_FLAG_IN);
worker->current_func(work);
+ exynos_ss_work(worker, worker->task, worker->current_func, ESS_FLAG_OUT);
/*
* While we must be careful to not use "work" after this, the trace
* point will only record its address.
diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h
old mode 100644
new mode 100755