当前位置: 首页 > news >正文

rt-linux下的D状态的堆栈抓取及TASK_RTLOCK_WAIT状态

一、背景

在之前的博客 缺页异常导致的iowait打印出相关文件的绝对路径-CSDN博客 里的 2.1 一节里的代码,我们已经有了一个比较强大的抓取D状态和等IO状态超过阈值的waker和wakee的堆栈状态的内核模块。在之前的博客 增加等IO状态的唤醒堆栈打印及缺页异常导致iowait分析-CSDN博客 里的 2.3 一节,我们也针对了一些特殊的情况,即在in_iowait状态下的非TASK_UNINTERRUPTIBLE状态的情况也考虑到并进行了超阈值的堆栈打印:

对于iowait的情形,这样的监控程序和上面提到的考虑已经足够了,针对iowait情形,我们还打印了waker和wakee的堆栈。而对于D状态的超时而言,我们当前的程序并没有覆盖全场景:我们并没有打印D状态的waker堆栈,也没有考虑TASK_UNINTERRUPTIBLE这个bit是1但是其他bit也是1的情况。另外,在这篇博客里,我们也会讲到,对于rt-linux内核而言,还有一种特殊的D状态,即TASK_RTLOCK_WAIT状态,这个TASK_RTLOCK_WAIT状态在rt-linux里也是一个很普遍存在的情况,不抓取这样的情况就会漏掉很多D状态。

下面第二章里,我们给出更新后的源码(源码里去掉了iowait的抓取,因为iowait的抓取用之前的程序已经足够了,这篇博客只关注rt-linux下的D状态的抓取。注意这里说的D状态是一个广义的D状态,即在perfetto里显示为D状态的就被视为D状态,既不应该理解成__state == TASK_UNINTERRUPTIBLE,也不应该理解成(__state & TASK_UNINTERRUPTIBLE) == TASK_UNINTERRUPTIBLE,具体在下面第三章里展开说明)。

然后在第三章里,我们讲解这次源码的改动的部分和与TASK_RTLOCK_WAIT相关的细节。

二、源码及抓取效果

2.1 源码

#include <linux/module.h>
#include <linux/capability.h>
#include <linux/sched.h>
#include <linux/uaccess.h>
#include <linux/proc_fs.h>
#include <linux/ctype.h>
#include <linux/seq_file.h>
#include <linux/poll.h>
#include <linux/types.h>
#include <linux/ioctl.h>
#include <linux/errno.h>
#include <linux/stddef.h>
#include <linux/lockdep.h>
#include <linux/kthread.h>
#include <linux/sched.h>
#include <linux/delay.h>
#include <linux/wait.h>
#include <linux/init.h>
#include <asm/atomic.h>
#include <trace/events/workqueue.h>
#include <linux/sched/clock.h>
#include <linux/string.h>
#include <linux/mm.h>
#include <linux/interrupt.h>
#include <linux/tracepoint.h>
#include <trace/events/osmonitor.h>
#include <trace/events/sched.h>
#include <trace/events/irq.h>
#include <trace/events/kmem.h>
#include <linux/ptrace.h>
#include <linux/uaccess.h>
#include <asm/processor.h>
#include <linux/sched/task_stack.h>
#include <linux/nmi.h>
#include <linux/version.h>
#include <linux/sched/mm.h>
#include <asm/irq_regs.h>
#include <linux/kallsyms.h>
#include <linux/kprobes.h>
#include <linux/stop_machine.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("zhaoxin");
MODULE_DESCRIPTION("Module for monitor D tasks.");
MODULE_VERSION("1.0");

/* Reporting threshold in nanoseconds; read-only module parameter. */
static unsigned long ns = 5000000ull;
module_param(ns, ulong, S_IRUGO);
MODULE_PARM_DESC(ns, "threshold nano second");

#define IODELAY_TRACEPOINT_ENABLE

/* Maximum number of frames stored per captured stack. */
#define TEST_STACK_TRACE_ENTRIES   32

/*
 * Function-pointer types for kernel helpers that are called through
 * pointers rather than by name. The pointers are assigned outside this
 * section of the file.
 */
typedef unsigned int (*stack_trace_save_tsk_func)(struct task_struct *task,
                                                  unsigned long *store,
                                                  unsigned int size,
                                                  unsigned int skipnr);
stack_trace_save_tsk_func _stack_trace_save_tsk;

typedef int (*get_cmdline_func)(struct task_struct *task, char *buffer, int buflen);
get_cmdline_func _get_cmdline_func;

/* Sample descriptors stored in testdiomonitor_sample.desc. */
#define TESTDIOMONITOR_SAMPLEDESC_SWDSTART  "swDstart"
#define TESTDIOMONITOR_SAMPLEDESC_WADSTOP    "waDstop"
#define TESTDIOMONITOR_SAMPLEDESC_SWDIOSTART "swDiostart"
#define TESTDIOMONITOR_SAMPLEDESC_WADIOSTOP  "waDiostop"
#define TESTDIOMONITOR_SAMPLEDESC_DEXCEED    "Dexceed"
#define TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED  "Dioexceed"
#define TESTDIOMONITOR_SAMPLEDESC_IOEXCEED   "Ioexceed"

/* Simple mode: only record events that exceed the threshold. */
#define TESTDIOMONITOR_SIMPLE

#ifdef TESTDIOMONITOR_SIMPLE
#define TESTDIOMONITOR_SIMPLE_THRESHOLDNS   (ns)//5000000ull
#endif

// 1ms
//#define TESTDIOMONITOR_DEXCEED_THRESHOLD     1000ull//1000000ull

/*
 * NOTE(review): the uclamp structures below look like local copies of
 * scheduler-private definitions, needed so that struct rq can be laid out
 * in this module. Confirm they match the target kernel.
 */
#ifdef CONFIG_UCLAMP_TASK
struct uclamp_bucket {
	unsigned long value : bits_per(SCHED_CAPACITY_SCALE);
	unsigned long tasks : BITS_PER_LONG - bits_per(SCHED_CAPACITY_SCALE);
};

struct uclamp_rq {
	unsigned int value;
	struct uclamp_bucket bucket[UCLAMP_BUCKETS];
};
#endif

/* CFS-related fields in a runqueue */
struct cfs_rq {struct load_weight	load;unsigned int		nr_running;unsigned int		h_nr_running;      /* SCHED_{NORMAL,BATCH,IDLE} */unsigned int		idle_nr_running;   /* SCHED_IDLE */unsigned int		idle_h_nr_running; /* SCHED_IDLE */u64			exec_clock;u64			min_vruntime;
#ifdef CONFIG_SCHED_COREunsigned int		forceidle_seq;u64			min_vruntime_fi;
#endif#ifndef CONFIG_64BITu64			min_vruntime_copy;
#endifstruct rb_root_cached	tasks_timeline;/** 'curr' points to currently running entity on this cfs_rq.* It is set to NULL otherwise (i.e when none are currently running).*/struct sched_entity	*curr;struct sched_entity	*next;struct sched_entity	*last;struct sched_entity	*skip;#ifdef	CONFIG_SCHED_DEBUGunsigned int		nr_spread_over;
#endif#ifdef CONFIG_SMP/** CFS load tracking*/struct sched_avg	avg;
#ifndef CONFIG_64BITu64			last_update_time_copy;
#endifstruct {raw_spinlock_t	lock ____cacheline_aligned;int		nr;unsigned long	load_avg;unsigned long	util_avg;unsigned long	runnable_avg;} removed;#ifdef CONFIG_FAIR_GROUP_SCHEDunsigned long		tg_load_avg_contrib;long			propagate;long			prop_runnable_sum;/**   h_load = weight * f(tg)** Where f(tg) is the recursive weight fraction assigned to* this group.*/unsigned long		h_load;u64			last_h_load_update;struct sched_entity	*h_load_next;
#endif /* CONFIG_FAIR_GROUP_SCHED */
#endif /* CONFIG_SMP */#ifdef CONFIG_FAIR_GROUP_SCHEDstruct rq		*rq;	/* CPU runqueue to which this cfs_rq is attached *//** leaf cfs_rqs are those that hold tasks (lowest schedulable entity in* a hierarchy). Non-leaf lrqs hold other higher schedulable entities* (like users, containers etc.)** leaf_cfs_rq_list ties together list of leaf cfs_rq's in a CPU.* This list is used during load balance.*/int			on_list;struct list_head	leaf_cfs_rq_list;struct task_group	*tg;	/* group that "owns" this runqueue *//* Locally cached copy of our task_group's idle value */int			idle;#ifdef CONFIG_CFS_BANDWIDTHint			runtime_enabled;s64			runtime_remaining;u64			throttled_pelt_idle;
#ifndef CONFIG_64BITu64                     throttled_pelt_idle_copy;
#endifu64			throttled_clock;u64			throttled_clock_pelt;u64			throttled_clock_pelt_time;int			throttled;int			throttle_count;struct list_head	throttled_list;
#endif /* CONFIG_CFS_BANDWIDTH */
#endif /* CONFIG_FAIR_GROUP_SCHED */
};/** This is the priority-queue data structure of the RT scheduling class:*/
struct rt_prio_array {
	DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */
	struct list_head queue[MAX_RT_PRIO];
};

/* Real-Time classes' related field in a runqueue: */
struct rt_rq {struct rt_prio_array	active;unsigned int		rt_nr_running;unsigned int		rr_nr_running;
#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHEDstruct {int		curr; /* highest queued rt task prio */
#ifdef CONFIG_SMPint		next; /* next highest */
#endif} highest_prio;
#endif
#ifdef CONFIG_SMPunsigned int		rt_nr_migratory;unsigned int		rt_nr_total;int			overloaded;struct plist_head	pushable_tasks;#endif /* CONFIG_SMP */int			rt_queued;int			rt_throttled;u64			rt_time;u64			rt_runtime;/* Nests inside the rq lock: */raw_spinlock_t		rt_runtime_lock;#ifdef CONFIG_RT_GROUP_SCHEDunsigned int		rt_nr_boosted;struct rq		*rq;struct task_group	*tg;
#endif
};/* Deadline class' related fields in a runqueue */
struct dl_rq {/* runqueue is an rbtree, ordered by deadline */struct rb_root_cached	root;unsigned int		dl_nr_running;#ifdef CONFIG_SMP/** Deadline values of the currently executing and the* earliest ready task on this rq. Caching these facilitates* the decision whether or not a ready but not running task* should migrate somewhere else.*/struct {u64		curr;u64		next;} earliest_dl;unsigned int		dl_nr_migratory;int			overloaded;/** Tasks on this rq that can be pushed away. They are kept in* an rb-tree, ordered by tasks' deadlines, with caching* of the leftmost (earliest deadline) element.*/struct rb_root_cached	pushable_dl_tasks_root;
#elsestruct dl_bw		dl_bw;
#endif/** "Active utilization" for this runqueue: increased when a* task wakes up (becomes TASK_RUNNING) and decreased when a* task blocks*/u64			running_bw;/** Utilization of the tasks "assigned" to this runqueue (including* the tasks that are in runqueue and the tasks that executed on this* CPU and blocked). Increased when a task moves to this runqueue, and* decreased when the task moves away (migrates, changes scheduling* policy, or terminates).* This is needed to compute the "inactive utilization" for the* runqueue (inactive utilization = this_bw - running_bw).*/u64			this_bw;u64			extra_bw;/** Inverse of the fraction of CPU utilization that can be reclaimed* by the GRUB algorithm.*/u64			bw_ratio;
};/** This is the main, per-CPU runqueue data structure.** Locking rule: those places that want to lock multiple runqueues* (such as the load balancing or the thread migration code), lock* acquire operations must be ordered by ascending &runqueue.*/
struct rq {/* runqueue lock: */raw_spinlock_t		__lock;/** nr_running and cpu_load should be in the same cacheline because* remote CPUs use both these fields when doing load calculation.*/unsigned int		nr_running;
#ifdef CONFIG_NUMA_BALANCINGunsigned int		nr_numa_running;unsigned int		nr_preferred_running;unsigned int		numa_migrate_on;
#endif
#ifdef CONFIG_NO_HZ_COMMON
#ifdef CONFIG_SMPunsigned long		last_blocked_load_update_tick;unsigned int		has_blocked_load;call_single_data_t	nohz_csd;
#endif /* CONFIG_SMP */unsigned int		nohz_tick_stopped;atomic_t		nohz_flags;
#endif /* CONFIG_NO_HZ_COMMON */#ifdef CONFIG_SMPunsigned int		ttwu_pending;
#endifu64			nr_switches;#ifdef CONFIG_UCLAMP_TASK/* Utilization clamp values based on CPU's RUNNABLE tasks */struct uclamp_rq	uclamp[UCLAMP_CNT] ____cacheline_aligned;unsigned int		uclamp_flags;
#define UCLAMP_FLAG_IDLE 0x01
#endifstruct cfs_rq		cfs;struct rt_rq		rt;struct dl_rq		dl;#ifdef CONFIG_FAIR_GROUP_SCHED/* list of leaf cfs_rq on this CPU: */struct list_head	leaf_cfs_rq_list;struct list_head	*tmp_alone_branch;
#endif /* CONFIG_FAIR_GROUP_SCHED *//** This is part of a global counter where only the total sum* over all CPUs matters. A task can increase this counter on* one CPU and if it got migrated afterwards it may decrease* it on another CPU. Always updated under the runqueue lock:*/unsigned int		nr_uninterruptible;struct task_struct __rcu	*curr;struct task_struct	*idle;struct task_struct	*stop;unsigned long		next_balance;struct mm_struct	*prev_mm;unsigned int		clock_update_flags;u64			clock;/* Ensure that all clocks are in the same cache line */u64			clock_task ____cacheline_aligned;u64			clock_pelt;unsigned long		lost_idle_time;u64			clock_pelt_idle;u64			clock_idle;
#ifndef CONFIG_64BITu64			clock_pelt_idle_copy;u64			clock_idle_copy;
#endifatomic_t		nr_iowait;#ifdef CONFIG_SCHED_DEBUGu64 last_seen_need_resched_ns;int ticks_without_resched;
#endif#ifdef CONFIG_MEMBARRIERint membarrier_state;
#endif#ifdef CONFIG_SMPstruct root_domain		*rd;struct sched_domain __rcu	*sd;unsigned long		cpu_capacity;unsigned long		cpu_capacity_orig;struct balance_callback *balance_callback;unsigned char		nohz_idle_balance;unsigned char		idle_balance;unsigned long		misfit_task_load;/* For active balancing */int			active_balance;int			push_cpu;struct cpu_stop_work	active_balance_work;/* CPU of this runqueue: */int			cpu;int			online;struct list_head cfs_tasks;struct sched_avg	avg_rt;struct sched_avg	avg_dl;
#ifdef CONFIG_HAVE_SCHED_AVG_IRQstruct sched_avg	avg_irq;
#endif
#ifdef CONFIG_SCHED_THERMAL_PRESSUREstruct sched_avg	avg_thermal;
#endifu64			idle_stamp;u64			avg_idle;unsigned long		wake_stamp;u64			wake_avg_idle;/* This is used to determine avg_idle's max value */u64			max_idle_balance_cost;#ifdef CONFIG_HOTPLUG_CPUstruct rcuwait		hotplug_wait;
#endif
#endif /* CONFIG_SMP */#ifdef CONFIG_IRQ_TIME_ACCOUNTINGu64			prev_irq_time;u64			psi_irq_time;
#endif
#ifdef CONFIG_PARAVIRTu64			prev_steal_time;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTINGu64			prev_steal_time_rq;
#endif/* calc_load related fields */unsigned long		calc_load_update;long			calc_load_active;#ifdef CONFIG_SCHED_HRTICK
#ifdef CONFIG_SMPcall_single_data_t	hrtick_csd;
#endifstruct hrtimer		hrtick_timer;ktime_t 		hrtick_time;
#endif#ifdef CONFIG_SCHEDSTATS/* latency stats */struct sched_info	rq_sched_info;unsigned long long	rq_cpu_time;/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? *//* sys_sched_yield() stats */unsigned int		yld_count;/* schedule() stats */unsigned int		sched_count;unsigned int		sched_goidle;/* try_to_wake_up() stats */unsigned int		ttwu_count;unsigned int		ttwu_local;
#endif#ifdef CONFIG_CPU_IDLE/* Must be inspected within a rcu lock section */struct cpuidle_state	*idle_state;
#endif#ifdef CONFIG_SMPunsigned int		nr_pinned;
#endifunsigned int		push_busy;struct cpu_stop_work	push_work;#ifdef CONFIG_SCHED_CORE/* per rq */struct rq		*core;struct task_struct	*core_pick;unsigned int		core_enabled;unsigned int		core_sched_seq;struct rb_root		core_tree;/* shared state -- careful with sched_core_cpu_deactivate() */unsigned int		core_task_seq;unsigned int		core_pick_seq;unsigned long		core_cookie;unsigned int		core_forceidle_count;unsigned int		core_forceidle_seq;unsigned int		core_forceidle_occupation;u64			core_forceidle_start;
#endif
};// runqueues (not export symbol)
struct rq* _prq = NULL;struct rq* my_cpu_rq(int i_cpu)
{return per_cpu_ptr(_prq, i_cpu);
}u64 my_rq_clock_task(void)
{struct rq* prq = my_cpu_rq(smp_processor_id());return prq->clock_task;
}#define TESTDIOMONITOR_FILE_MAXLEN  1024typedef struct testdiomonitor_sample {struct timespec64 time;int cpu;int pid;int tgid;int ppid;char comm[TASK_COMM_LEN];char ppidcomm[TASK_COMM_LEN];// 0 or 1int bin_iowait;/** "swDstart" // 在sched_switch里* "waDstop"  // 在sched_waking里* "swDiostart" // 在sched_switch里* "waDiostop"  // 在sched_waking里* "Dexceed"    // 超出阈值,非iowait* "Dioexceed"  // 超出阈值,iowait*/const char* desc;u64 dtimens;    // 纳秒单位,D状态持续的时间u64 iowaittimens;   // 纳秒单位,等待io的时间int stackn;void* parray_stack[TEST_STACK_TRACE_ENTRIES];int wakercpu;int wakerpid;int wakertgid;int wakerppid;char wakercomm[TASK_COMM_LEN];char wakerppidcomm[TASK_COMM_LEN];int wakerstackn;void* parray_wakerstack[TEST_STACK_TRACE_ENTRIES];char filepath[TESTDIOMONITOR_FILE_MAXLEN];u32 __state;u64 exec_start_begin;u64 exec_start_end;u64 local_clock_now;u64 clock_task_curr;u32 writedone;  // 0 or 1
} testdiomonitor_sample;#define TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT  8192*4typedef struct testdiomonitor_sample_ringbuff {testdiomonitor_sample* parray_sample;volatile u64 wp;    // Index is wp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1).volatile u64 rp;    // Index is rp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1).u32 skipcount;  // 0 means no skip any abnormal event
} testdiomonitor_sample_ringbuff;#define TESTDIOMONITOR_LINEBUFF  1024typedef struct testdiomonitor_env {struct file* file;char file_linebuff[TESTDIOMONITOR_LINEBUFF];int headoffset;loff_t file_pos;testdiomonitor_sample_ringbuff ringbuff;
} testdiomonitor_env;static testdiomonitor_env _env;static struct delayed_work work_write_file;
static struct workqueue_struct *wq_write_file;#define FILENAME        "test_new_j6_full.txt"void init_file(void)
{_env.file = filp_open(FILENAME, O_WRONLY | O_CREAT | O_TRUNC, 0644);if (IS_ERR(_env.file)) {_env.file = NULL;}
}void exit_file(void)
{if (_env.file) {filp_close(_env.file, NULL);}
}void testdiomonitor_write_file(char* i_pchar, int i_size)
{if (_env.file) {kernel_write(_env.file, i_pchar, i_size, &_env.file_pos);}
}void testdiomonitor_write_file_emptyline(void)
{testdiomonitor_write_file("\n", strlen("\n"));
}void testdiomonitor_file_oneline(const char* i_format, ...)
{char* pcontent = &_env.file_linebuff[_env.headoffset];va_list args;va_start(args, i_format);vsnprintf(pcontent, TESTDIOMONITOR_LINEBUFF - _env.headoffset, i_format, args);va_end(args);testdiomonitor_write_file(_env.file_linebuff, strlen(_env.file_linebuff));
}void testdiomonitor_replace_null_with_space(char *str, int n) {for (int i = 0; i < n - 1; i++) {if (str[i] == '\0') {str[i] = ' ';}}
}void testdiomonitor_set_cmdline(char* i_pbuff, int i_buffsize, struct task_struct* i_ptask)
{int ret = _get_cmdline_func(i_ptask, i_pbuff, i_buffsize);if (ret <= 0) {i_pbuff[0] = '\0';return;}testdiomonitor_replace_null_with_space(i_pbuff, ret);i_pbuff[ret - 1] = '\0';
}void testdiomonitor_checkget_parentinfo_and_cmdline(testdiomonitor_sample* io_psample, struct task_struct* i_ptask)
{struct task_struct* parent;rcu_read_lock();parent = rcu_dereference(i_ptask->real_parent);io_psample->ppid = parent->pid;strlcpy(io_psample->ppidcomm, parent->comm, TASK_COMM_LEN);rcu_read_unlock();
}void testdiomonitor_checkget_parentinfo_and_cmdline_waker(testdiomonitor_sample* io_psample, struct task_struct* i_ptask)
{struct task_struct* parent;rcu_read_lock();parent = rcu_dereference(i_ptask->real_parent);io_psample->wakerppid = parent->pid;strlcpy(io_psample->wakerppidcomm, parent->comm, TASK_COMM_LEN);rcu_read_unlock();
}#define TESTDIOMONITOR_COMMANDLINE_MAX 128int contains_ls(char *str) {const char *substr = "ls";size_t len = strlen(substr); // 获取子字符串的长度const char *p = str;while ((p = strchr(p, substr[0])) != NULL) { // 查找第一个字符 'l'if (strncmp(p, substr, len) == 0) { // 比较后续的字符return 1; // 找到了}p++; // 移动到下一个字符}return 0; // 没有找到
}static void write_file(struct work_struct *w)
{//ssize_t ret;u32 index;testdiomonitor_sample* psample;struct tm t;char timestr[64];char exceedstr[64];char temp_commandline[TESTDIOMONITOR_COMMANDLINE_MAX];struct pid* pid_struct;struct task_struct* ptask;int stacki;while (_env.ringbuff.rp != _env.ringbuff.wp) {index = (_env.ringbuff.rp & (TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1));psample = &_env.ringbuff.parray_sample[index];if (psample->writedone != 1) {break;}testdiomonitor_write_file_emptyline();_env.headoffset = sprintf(_env.file_linebuff, "[%llu][%s] ", _env.ringbuff.rp, psample->desc);time64_to_tm(psample->time.tv_sec + 8 * 60 * 60, 0, &t);snprintf(timestr, 64, "%04ld-%02d-%02d-%02d_%02d_%02d.%09ld",1900 + t.tm_year, t.tm_mon + 1, t.tm_mday, t.tm_hour, t.tm_min, t.tm_sec, psample->time.tv_nsec);if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DEXCEED) {snprintf(exceedstr, 64, "dtimens[%llu]", psample->dtimens);}else if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED) {snprintf(exceedstr, 64, "iowaittimens[%llu]", psample->iowaittimens);}else if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_IOEXCEED) {snprintf(exceedstr, 64, "delayacct_iowaittimens[%llu]", psample->iowaittimens);}else {exceedstr[0] = '\0';}//if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED) {if (psample->desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DEXCEED) {testdiomonitor_file_oneline("[skipcount:%u]begin...time[%s]wakercpu[%d]desc[%s]%s\n", _env.ringbuff.skipcount, timestr, psample->wakercpu, psample->desc, "wakerDexceed");testdiomonitor_file_oneline("wakertgid[%d]wakerpid[%d]wakercomm[%s]wakerppid[%d]wakerppidcomm[%s]\n",psample->wakertgid, psample->wakerpid, psample->wakercomm, psample->wakerppid, psample->wakerppidcomm);pid_struct = find_get_pid(psample->wakerpid);if (pid_struct) {ptask = get_pid_task(pid_struct, PIDTYPE_PID);if (ptask) {testdiomonitor_set_cmdline(temp_commandline, TESTDIOMONITOR_COMMANDLINE_MAX, ptask);put_task_struct(ptask);}else {temp_commandline[0] 
= '\0';}put_pid(pid_struct);}else {temp_commandline[0] = '\0';}testdiomonitor_file_oneline("wakercommandline[%s]\n", temp_commandline);pid_struct = find_get_pid(psample->wakerppid);if (pid_struct) {ptask = get_pid_task(pid_struct, PIDTYPE_PID);if (ptask) {testdiomonitor_set_cmdline(temp_commandline, TESTDIOMONITOR_COMMANDLINE_MAX, ptask);put_task_struct(ptask);}else {temp_commandline[0] = '\0';}put_pid(pid_struct);}else {temp_commandline[0] = '\0';}testdiomonitor_file_oneline("wakerppid_commandline[%s]\n", temp_commandline);testdiomonitor_file_oneline("stack[%d]:\n", psample->wakerstackn);for (stacki = 0; stacki < psample->wakerstackn; stacki++) {testdiomonitor_file_oneline("%*c%pS\n", 5, ' ', (void *)psample->parray_wakerstack[stacki]);}testdiomonitor_file_oneline("cpu[%d]desc[%s]%s\n", psample->cpu, psample->desc, exceedstr);}else {testdiomonitor_file_oneline("begin...time[%s]cpu[%d]desc[%s]%s\n", timestr, psample->cpu, psample->desc, exceedstr);}testdiomonitor_file_oneline("tgid[%d]pid[%d]comm[%s]ppid[%d]ppidcomm[%s]\n",psample->tgid, psample->pid, psample->comm, psample->ppid, psample->ppidcomm);{const char *desc = "NA";if (psample->__state == TASK_UNINTERRUPTIBLE) {desc = "D";}else if (psample->__state == TASK_KILLABLE) {desc = "K";}testdiomonitor_file_oneline("iniowait[%u]__state[%u][%s]exec_start_begin[%llu]exec_start_end[%llu]local_clock[%llu]clock_task_curr[%llu]\n",psample->bin_iowait ? 
1 : 0, psample->__state, desc, psample->exec_start_begin, psample->exec_start_end, psample->local_clock_now, psample->clock_task_curr);}pid_struct = find_get_pid(psample->pid);if (pid_struct) {ptask = get_pid_task(pid_struct, PIDTYPE_PID);if (ptask) {testdiomonitor_set_cmdline(temp_commandline, TESTDIOMONITOR_COMMANDLINE_MAX, ptask);put_task_struct(ptask);}else {temp_commandline[0] = '\0';}put_pid(pid_struct);}else {temp_commandline[0] = '\0';}testdiomonitor_file_oneline("commandline[%s]\n", temp_commandline);pid_struct = find_get_pid(psample->ppid);if (pid_struct) {ptask = get_pid_task(pid_struct, PIDTYPE_PID);if (ptask) {testdiomonitor_set_cmdline(temp_commandline, TESTDIOMONITOR_COMMANDLINE_MAX, ptask);put_task_struct(ptask);}else {temp_commandline[0] = '\0';}put_pid(pid_struct);}else {temp_commandline[0] = '\0';}testdiomonitor_file_oneline("ppid_commandline[%s]\n", temp_commandline);//testdiomonitor_file_oneline("filepath[%s]\n", psample->filepath);testdiomonitor_file_oneline("stack[%d]:\n", psample->stackn);for (stacki = 0; stacki < psample->stackn; stacki++) {testdiomonitor_file_oneline("%*c%pS\n", 5, ' ', (void *)psample->parray_stack[stacki]);}testdiomonitor_write_file_emptyline();smp_wmb();psample->writedone = 0;_env.ringbuff.rp ++;}queue_delayed_work_on(nr_cpu_ids - 1, wq_write_file,&work_write_file, 1);
}static void init_write_file(void)
{init_file();wq_write_file = alloc_workqueue("testdiomonitor_write_file", WQ_MEM_RECLAIM, 0);INIT_DELAYED_WORK(&work_write_file, write_file);queue_delayed_work_on(nr_cpu_ids - 1, wq_write_file,&work_write_file, 3);
}static void exit_write_file(void)
{cancel_delayed_work_sync(&work_write_file);destroy_workqueue(wq_write_file);exit_file();
}void init_testdiomonitor_sample_ringbuff(void) 
{//testdiomonitor_sample* psample;_env.ringbuff.parray_sample = kvzalloc(sizeof(testdiomonitor_sample) * TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT, GFP_KERNEL);
}void exit_testdiomonitor_sample_ringbuff(void)
{kvfree(_env.ringbuff.parray_sample);
}testdiomonitor_sample* testdiomonitor_get_psample(void)
{u64 windex_raw, windex_raw_old;u32 windex;while (1) {windex_raw = _env.ringbuff.wp;if (windex_raw - _env.ringbuff.rp >= (u64)(TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT)) {_env.ringbuff.skipcount ++;return NULL;}// atomic_cmpxchg return old valuewindex_raw_old = atomic64_cmpxchg((atomic64_t*)&_env.ringbuff.wp,windex_raw, windex_raw + 1);if (windex_raw_old == windex_raw) {break;}}windex = (u32)(windex_raw & (u64)(TESTDIOMONITOR_SAMPLE_RINGBUFF_MAXCOUNT - 1));return &_env.ringbuff.parray_sample[windex];
}static u64 _magic_number = 0xABCDEFull;void* _dl_sched_class = NULL;int get_file_dir_by_folio(struct folio *i_fo, char* i_path, int i_len);void testdiomonitor_add_sample(const char* i_desc, struct task_struct* i_task, u64 i_timens, u32 i_state, u64 i_exec_start_begin, u64 i_exec_start_end, u64 i_local_clock, u64 i_clock_task_curr)
{testdiomonitor_sample* psample = testdiomonitor_get_psample();if (!psample) {return;}ktime_get_real_ts64(&psample->time);psample->cpu = task_cpu(i_task);psample->pid = i_task->pid;psample->tgid = i_task->tgid;strlcpy(psample->comm, i_task->comm, TASK_COMM_LEN);testdiomonitor_checkget_parentinfo_and_cmdline(psample, i_task);psample->bin_iowait = i_task->in_iowait;psample->desc = i_desc;if (i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DEXCEED) {psample->dtimens = i_timens;}else if (i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED || i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_IOEXCEED) {psample->iowaittimens = i_timens;}psample->stackn = _stack_trace_save_tsk(i_task, (unsigned long*)psample->parray_stack, TEST_STACK_TRACE_ENTRIES, 0);//if (i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED) {if (i_desc == (const char*)TESTDIOMONITOR_SAMPLEDESC_DEXCEED) {psample->__state = i_state;psample->exec_start_begin = i_exec_start_begin;psample->exec_start_end = i_exec_start_end;psample->local_clock_now = i_local_clock;psample->clock_task_curr = i_clock_task_curr;psample->wakercpu = smp_processor_id();psample->wakerpid = current->pid;psample->wakertgid = current->tgid;strlcpy(psample->wakercomm, current->comm, TASK_COMM_LEN);testdiomonitor_checkget_parentinfo_and_cmdline_waker(psample, current);psample->wakerstackn = _stack_trace_save_tsk(current, (unsigned long*)psample->parray_wakerstack, TEST_STACK_TRACE_ENTRIES, 0);// psample->filepath[0] = '\0';// if ((void*)i_task->sched_class != (void*)&_dl_sched_class) {//     if (i_task->dl.dl_runtime == _magic_number) {//         //if (sched_clock() - i_task->dl.dl_deadline >= TESTDIOMONITOR_SIMPLE_THRESHOLDNS) //         {//             //printk("__folio_lock_killable wait %llu ns\n", sched_clock() - current->dl.dl_deadline);//             //dump_stack();//             if (get_file_dir_by_folio((struct folio*)i_task->dl.dl_period, psample->filepath, TESTDIOMONITOR_FILE_MAXLEN) < 0) {//                 
//printk("get_file_dir_by_folio fail!\n");//             }//         }//         current->dl.dl_runtime = 0;//     }// }}smp_wmb();psample->writedone = 1;
}static void cb_sched_switch(void *i_data, bool i_preempt,struct task_struct *i_prev,struct task_struct *i_next,unsigned int i_prev_state)
{
#ifndef TESTDIOMONITOR_SIMPLEvoid* parray_stack[TEST_STACK_TRACE_ENTRIES];int num_stack;int stacki;if (i_prev_state == TASK_UNINTERRUPTIBLE) {if (i_prev->in_iowait) {testdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_SWDIOSTART, i_prev, 0);}else {testdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_SWDSTART, i_prev, 0);}}else if (i_prev->in_iowait) {testdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_SWDIOSTART, i_prev, 0);}
#endif
}const char* getstatstr_bystate(u32 i_state) {switch (i_state) {case TASK_RUNNING:return "TASK_RUNNING";case TASK_INTERRUPTIBLE:return "TASK_INTERRUPTIBLE";case TASK_UNINTERRUPTIBLE:return "TASK_UNINTERRUPTIBLE";default:return "other";}
}static void cb_sched_waking(void *i_data, struct task_struct *i_p) {if ((i_p->__state & TASK_UNINTERRUPTIBLE) == TASK_UNINTERRUPTIBLE) {//u64 currns = my_rq_clock_task();struct rq* prq = my_cpu_rq(task_cpu(i_p));struct rq* prq_curr = my_cpu_rq(smp_processor_id());u64 currns = prq->clock_task;u64 currns_curr = prq_curr->clock_task;//u64 local_c = local_clock();//int cpuid = smp_processor_id();//if (i_p->in_iowait) {
#ifndef TESTDIOMONITOR_SIMPLEtestdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_WADIOSTOP, i_p, 0);
#endif
#ifdef TESTDIOMONITOR_SIMPLE//if (currns - i_p->se.exec_start >= TESTDIOMONITOR_SIMPLE_THRESHOLDNS)
#endif//testdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED, i_p, currns - i_p->se.exec_start);
#ifndef TESTDIOMONITOR_SIMPLEif (i_p->se.exec_start > currns) {//if (task_cpu(i_p) == cpuid) {printk("comm[%s]pid[%d]exec_start[%llu]currns[%llu]local_clock[%llu]last_cpu[%d]cpuid[%d]\n", i_p->comm, i_p->pid, i_p->se.exec_start, currns, local_c, task_cpu(i_p), cpuid);}}// if (printk_ratelimit()) {//     printk("waking dump_stack[D]:\n");//     dump_stack();// }
#endif}
//#ifndef TESTDIOMONITOR_SIMPLE//else {//testdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_WADSTOP, i_p, 0);if (currns - i_p->se.exec_start >= TESTDIOMONITOR_SIMPLE_THRESHOLDNS) {//if (strcmp(current->comm, "ls") == 0) {//if (strcmp(i_p->comm, "hobot-log") != 0) {testdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_DEXCEED, i_p, currns - i_p->se.exec_start, i_p->__state, i_p->se.exec_start, currns, local_clock(), currns_curr);}return;//}}if (i_p->se.exec_start > currns) {//if (task_cpu(i_p) == cpuid) // {//     printk("comm[%s]pid[%d]exec_start[%llu]currns[%llu]local_clock[%llu]last_cpu[%d]cpuid[%d]\n", //         i_p->comm, i_p->pid, i_p->se.exec_start, currns, local_c, task_cpu(i_p), cpuid);// }}}
//#endif}if (strcmp(current->comm, "ls") == 0|| strcmp(current->comm, "wc") == 0|| strcmp(current->comm, "grep") == 0|| strcmp(current->comm, "awk") == 0) {if (strcmp(i_p->comm, "hobot-log") != 0&& strcmp(i_p->comm, "kthreadd") != 0) {struct rq* prq = my_cpu_rq(task_cpu(i_p));struct rq* prq_curr = my_cpu_rq(smp_processor_id());u64 currns = prq->clock_task;u64 currns_curr = prq_curr->clock_task;testdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_DEXCEED, i_p, currns - i_p->se.exec_start, i_p->__state, i_p->se.exec_start, currns, local_clock(), currns_curr);}}//else if (i_p->in_iowait) {//struct rq* prq = my_cpu_rq(task_cpu(i_p));//u64 currns = prq->clock_task;//u64 local_c = local_clock();//int cpuid = smp_processor_id();//if (printk_ratelimit()) // {//     printk("i_p->__state=[%u][%s]\n", i_p->__state, getstatstr_bystate(i_p->__state));//     printk("waking dump_stack[K]:\n");//     dump_stack();// }
#ifndef TESTDIOMONITOR_SIMPLEtestdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_WADIOSTOP, i_p, 0);
#endif
#ifdef TESTDIOMONITOR_SIMPLE//if (currns - i_p->se.exec_start >= TESTDIOMONITOR_SIMPLE_THRESHOLDNS)
#endif//testdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_DIOEXCEED, i_p, currns - i_p->se.exec_start);}
}static void cb_iodelay_account(void *i_data, struct task_struct *i_curr,unsigned long long i_delta)
{
#ifdef TESTDIOMONITOR_SIMPLE//if (i_delta >= TESTDIOMONITOR_SIMPLE_THRESHOLDNS)
#endif//testdiomonitor_add_sample(TESTDIOMONITOR_SAMPLEDESC_IOEXCEED, i_curr, i_delta);
}struct kern_tracepoint {void *callback;struct tracepoint *ptr;bool bregister;
};
static void clear_kern_tracepoint(struct kern_tracepoint *tp)
{if (tp->bregister) {tracepoint_probe_unregister(tp->ptr, tp->callback, NULL);}
}#define INIT_KERN_TRACEPOINT(tracepoint_name) \static struct kern_tracepoint mykern_##tracepoint_name = {.callback = NULL, .ptr = NULL, .bregister = false};#define TRACEPOINT_CHECK_AND_SET(tracepoint_name)                                             \static void tracepoint_name##_tracepoint_check_and_set(struct tracepoint *tp, void *priv) \{                                                                                \if (!strcmp(#tracepoint_name, tp->name))                                     \{                                                                            \((struct kern_tracepoint *)priv)->ptr = tp;                          \return;                                                                  \}                                                                            \}INIT_KERN_TRACEPOINT(sched_switch)
TRACEPOINT_CHECK_AND_SET(sched_switch)
INIT_KERN_TRACEPOINT(sched_waking)
TRACEPOINT_CHECK_AND_SET(sched_waking)
#ifdef IODELAY_TRACEPOINT_ENABLE
INIT_KERN_TRACEPOINT(iodelay_account)
TRACEPOINT_CHECK_AND_SET(iodelay_account)
#endiftypedef unsigned long (*kallsyms_lookup_name_func)(const char *name);
kallsyms_lookup_name_func _kallsyms_lookup_name_func;void* get_func_by_symbol_name_kallsyms_lookup_name(void)
{int ret;void* pfunc = NULL;struct kprobe kp;memset(&kp, 0, sizeof(kp));kp.symbol_name = "kallsyms_lookup_name";kp.pre_handler = NULL;kp.addr = NULL;	// 作为强调,提示使用symbol_nameret = register_kprobe(&kp);if (ret < 0) {printk("register_kprobe fail!\n");return NULL;}printk("register_kprobe succeed!\n");pfunc = (void*)kp.addr;unregister_kprobe(&kp);return pfunc;
}void* get_func_by_symbol_name(const char* i_symbol)
{if (_kallsyms_lookup_name_func == NULL) {return NULL;}return (void*)_kallsyms_lookup_name_func(i_symbol);
}enum behavior {EXCLUSIVE,	/* Hold ref to page and take the bit when woken, like* __folio_lock() waiting on then setting PG_locked.*/SHARED,		/* Hold ref to page and check the bit when woken, like* folio_wait_writeback() waiting on PG_writeback.*/DROP,		/* Drop ref to page before wait, no check when woken,* like folio_put_wait_locked() on PG_locked.*/
};int kprobecb_folio_lock_killable_pre(struct kprobe* i_k, struct pt_regs* i_p)
{if ((void*)current->sched_class != (void*)&_dl_sched_class) {struct folio *fo = (struct folio*) i_p->regs[0];//i_p->di;int bit_nr = (int)i_p->regs[1];//i_p->si;int state = (int)i_p->regs[2];//i_p->dx;enum behavior beh = (enum behavior)i_p->regs[3];//i_p->cx;if (bit_nr != PG_locked || state != TASK_KILLABLE|| beh != EXCLUSIVE) {return 0;}current->dl.dl_runtime = _magic_number;current->dl.dl_deadline = sched_clock();current->dl.dl_period = (u64)fo;}return 0;
}int getfullpath(struct inode *inode,char* i_buffer,int i_len)
{struct dentry *dentry;//printk("inode = %ld\n", inode->i_ino);//spin_lock(&inode->i_lock);hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) {char *buffer, *path;buffer = (char *)__get_free_page(GFP_KERNEL);if (!buffer)return -ENOMEM;path = dentry_path_raw(dentry, buffer, PAGE_SIZE);if (IS_ERR(path)){continue;   }strlcpy(i_buffer, path, i_len);//printk("dentry name = %s , path = %s", dentry->d_name.name, path);free_page((unsigned long)buffer);}//spin_unlock(&inode->i_lock);return 0;
}int get_file_dir_by_folio(struct folio *i_fo, char* i_path, int i_len)
{if (i_fo->mapping) {struct inode *inode = i_fo->mapping->host;if (inode) {// struct dentry *dentry = inode->i_dentry;// if (!dentry) {//     return -1;// }{//char path_buf[256];int ret = 0;if ((ret = getfullpath(inode, i_path, i_len)) < 0) {return ret;}// struct path path;// //dentry_lock(dentry);// path.dentry = dentry;// path.mnt = dget(dentry->d_sb->s_root);// if (dentry_path_raw(dentry, path_buf, sizeof(path_buf)) >= 0) {//     pr_info("File path: %s\n", path_buf);// }//dentry_unlock(dentry);}return 0;}}return -1;
}struct kprobe _kp1;void kprobecb_folio_lock_killable_post(struct kprobe *p, struct pt_regs *regs,unsigned long flags)
{// if (current->sched_class != &_dl_sched_class) {//     if (current->dl.dl_runtime == _magic_number) {//         if (sched_clock() - current->dl.dl_deadline >= TESTDIOMONITOR_SIMPLE_THRESHOLDNS) {//             //printk("__folio_lock_killable wait %llu ns\n", sched_clock() - current->dl.dl_deadline);//             //dump_stack();//             if (get_file_dir_by_folio((struct folio*)current->dl.dl_period) < 0) {//                 printk("get_file_dir_by_folio fail!\n");//             }//         }//         current->dl.dl_runtime = 0;//     }// }
}int kprobe_register_func_folio_lock_killable(void)
{// int ret;// memset(&_kp1, 0, sizeof(_kp1));// _kp1.symbol_name = "folio_wait_bit_common";// _kp1.pre_handler = kprobecb_folio_lock_killable_pre;// _kp1.post_handler = kprobecb_folio_lock_killable_post;// ret = register_kprobe(&_kp1);// if (ret < 0) {// 	printk("register_kprobe fail!\n");// 	return -1;// }// printk("register_kprobe success!\n");return 0;
}void kprobe_unregister_func_folio_lock_killable(void)
{// unregister_kprobe(&_kp1);
}extern void* get_dl_sched_class_pointer(void);
extern struct rq* get_runqueues(void);static int __init testdiomonitor_full_init(void)
{printk(KERN_INFO "ns=%lu\n", ns);//printk("offset of mmap_lock in mm_struct [%d]\n", offsetof(struct mm_struct, mmap_lock));_kallsyms_lookup_name_func = get_func_by_symbol_name_kallsyms_lookup_name();// _dl_sched_class = (void*)_kallsyms_lookup_name_func("dl_sched_class");// if (_dl_sched_class == NULL) {//     printk(KERN_ERR "get_func_by_symbol_name _dl_sched_class failed!\n");//     return -1;// }_dl_sched_class = get_dl_sched_class_pointer();// _prq = get_func_by_symbol_name("runqueues");// if (_prq == NULL) {//     printk(KERN_ERR "get_func_by_symbol_name runqueues failed!\n");//     return -1;// }_prq = get_runqueues();init_testdiomonitor_sample_ringbuff();init_write_file();_stack_trace_save_tsk = get_func_by_symbol_name("stack_trace_save_tsk");if (_stack_trace_save_tsk == NULL) {printk(KERN_ERR "get_func_by_symbol_name stack_trace_save_tsk failed!\n");return -1;}_get_cmdline_func = get_func_by_symbol_name("get_cmdline");if (_get_cmdline_func == NULL) {printk(KERN_ERR "get_func_by_symbol_name get_cmdline failed!\n");return -1;}mykern_sched_switch.callback = cb_sched_switch;for_each_kernel_tracepoint(sched_switch_tracepoint_check_and_set, &mykern_sched_switch);if (!mykern_sched_switch.ptr) {printk(KERN_ERR "mykern_sched_switch register failed!\n");return -1;}else {printk(KERN_INFO "mykern_sched_switch register succeeded!\n");}tracepoint_probe_register(mykern_sched_switch.ptr, mykern_sched_switch.callback, NULL);mykern_sched_switch.bregister = 1;mykern_sched_waking.callback = cb_sched_waking;for_each_kernel_tracepoint(sched_waking_tracepoint_check_and_set, &mykern_sched_waking);if (!mykern_sched_waking.ptr) {printk(KERN_ERR "mykern_sched_waking register failed!\n");return -1;}else {printk(KERN_INFO "mykern_sched_waking register succeeded!\n");}tracepoint_probe_register(mykern_sched_waking.ptr, mykern_sched_waking.callback, NULL);mykern_sched_waking.bregister = 1;#ifdef IODELAY_TRACEPOINT_ENABLEmykern_iodelay_account.callback = 
cb_iodelay_account;for_each_kernel_tracepoint(iodelay_account_tracepoint_check_and_set, &mykern_iodelay_account);if (!mykern_iodelay_account.ptr) {printk(KERN_ERR "mykern_iodelay_account register failed!\n");return -1;}else {printk(KERN_INFO "mykern_iodelay_account register succeeded!\n");}tracepoint_probe_register(mykern_iodelay_account.ptr, mykern_iodelay_account.callback, NULL);mykern_iodelay_account.bregister = 1;
#endifkprobe_register_func_folio_lock_killable();return 0;
}static void __exit testdiomonitor_fullexit(void)
{kprobe_unregister_func_folio_lock_killable();clear_kern_tracepoint(&mykern_sched_switch);clear_kern_tracepoint(&mykern_sched_waking);
#ifdef IODELAY_TRACEPOINT_ENABLEclear_kern_tracepoint(&mykern_iodelay_account);
#endiftracepoint_synchronize_unregister();exit_write_file();exit_testdiomonitor_sample_ringbuff();
}module_init(testdiomonitor_full_init);
module_exit(testdiomonitor_fullexit);

2.2 抓取效果展示

抓取到的waker唤醒wakee的堆栈如下:

三、源码改动部分解释及TASK_RTLOCK_WAIT相关细节

3.1 TASK_RTLOCK_WAIT状态在perfetto的视角里是Uninterruptible Sleep状态,即通常意义上的D状态

如下图,抓自perfetto里:

如下图可以看到,这个3626线程是被ls 16492唤醒:

3.1.1 但是从抓到的堆栈里可以看到__state里的TASK_UNINTERRUPTIBLE的这个bit并不是1

下图是抓到的waker和wakee的堆栈:

从上图里的被唤醒者,线程3626的状态信息:__state[4096]可以得知,在trace_sched_waking时被唤醒者线程的状态是4096,即0x1000,而TASK_UNINTERRUPTIBLE是2,所以__state & TASK_UNINTERRUPTIBLE并不等于TASK_UNINTERRUPTIBLE。

所以这种特殊的perfetto认为的D状态,在底层逻辑里不能通过__state & TASK_UNINTERRUPTIBLE是否等于TASK_UNINTERRUPTIBLE来判断。

3.1.2 TASK_RTLOCK_WAIT是0x1000

在kernel/include/linux/sched.h里有TASK_RTLOCK_WAIT的定义

3.2 rt-linux系统里用到的rtmutex和spinlock_rt会设置这个TASK_RTLOCK_WAIT状态

TASK_RTLOCK_WAIT状态会设置到__state里,如下图,在rtlock_slowlock_locked里有设置:

另外,在current_save_and_set_rtlock_wait_state宏里有如下设置:

而current_save_and_set_rtlock_wait_state宏在rtlock_slowlock_locked(rtmutex.c)和spinlock_rt.c里都有使用:

上图里的rwbase_set_and_save_current_state宏在rwbase_write_lock里使用:

在使能CONFIG_PREEMPT_RT宏之后,struct mutex被定义成:

而rt_mutex_base有关的函数即在上面已经展示过的rtmutex.c里的rtlock_slowlock_locked等函数所关联使用。

3.3 源码改动部分解释

在 2.1 一节里展示的源码,展示的是在调试过程中抓到问题情况堆栈的一份源码,但是从原理上,是可以进行进一步改进的。我们在下面的最后一节 3.3.5 里提及如何进一步改进。我们先说明一下源码里和之前的 缺页异常导致的iowait打印出相关文件的绝对路径-CSDN博客 博客里的 2.1 一节里的源码的差异改动部分的内容的原理。

3.3.1 针对arm64平台需要调整kprobe的callback里的实现

我们的这次实例代码是针对的arm64的rt-linux平台,针对kprobe的callback的实现,需要针对不同的平台做不同的调整,如果是arm64平台,则要如下方式使用参数,即由原来的x86下的di/si/dx/cx改成regs[0]/regs[1]/regs[2]/regs[3]这样来得到入参:

3.3.2 内核里增加两个函数,为了适配当前的arm64内核版本

当前使用arm64内核版本无法拿到runqueues和dl_sched_class这两个符号。

所以直接在内核里增加两个export symbol的函数,如下实现:

3.3.3 为了让抓取的堆栈更加聚焦D状态,去掉了iowait情况的抓取

去掉iowait情形,只打印非iowait时的D状态的堆栈:

打印唤醒者和被唤醒者的堆栈的逻辑和抓iowait时堆栈的逻辑是基本一样的。

相应地,在采样时,也只转换非iowait的D状态的情况:

为了清楚的显示任务在被waking时的状态,写了一个状态转换函数:

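这部分倒是可以再优化一下,增加TASK_RTLOCK_WAIT的情况,还有__state为1026的情况。需要注意的是,1026即0x402,按照上面 3.1.2 里TASK_RTLOCK_WAIT是0x1000的定义,1026对应的是TASK_UNINTERRUPTIBLE | TASK_NOLOAD(即TASK_IDLE)的状态;而TASK_RTLOCK_WAIT | TASK_UNINTERRUPTIBLE的值应为0x1002,即4098。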

TASK_RTLOCK_WAIT等其他状态在调试到问题情况之前是无法预知的,即事先并不知道会出现这样的状态,所以真正在调试一些corner case时,还得加一些额外的、类似下面 3.3.4 里的逻辑去增加一些打印,但是肯定得考虑增加的打印量不能过大,以免引入别的问题或者引入因为打印过多而导致的问题。

3.3.4 为了抓取到指定嫌疑任务的唤醒堆栈,加上了特殊的判断逻辑

如下图,在cb_sched_waking里有如下判断逻辑:

意思是在唤醒者的任务名是ls或者wc或者grep或者awk的情况下,如果被唤醒者的任务名不是xx也不是kthreadd,则进行记录。这样可以覆盖一些corner case,把相关嫌疑的唤醒逻辑都记录下来,用来复现和调试相关问题。

3.3.5 如果不介意打印的内容特别多的话,可以加上TASK_RTLOCK_WAIT的情况

如果不介意打印的内容特别多的话,可以在如下cb_sched_waking里把所有的i_p->__state是TASK_RTLOCK_WAIT的情况都照顾到,如下图这里的判断增加== TASK_RTLOCK_WAIT的情况:

考虑到所有TASK_RTLOCK_WAIT的情况,而不是只考虑指定任务情况的waker和wakee的打印:

相关文章:

rt-linux下的D状态的堆栈抓取及TASK_RTLOCK_WAIT状态

一、背景 在之前的博客 缺页异常导致的iowait打印出相关文件的绝对路径-CSDN博客 里的 2.1 一节里的代码,我们已经有了一个比较强大的抓取D状态和等IO状态超过阈值的waker和wakee的堆栈状态的内核模块。在之前的博客 增加等IO状态的唤醒堆栈打印及缺页异常导致iowa…...

数据结构【堆和链式结构】

堆和链式结构 1.堆的概念和定义1.1堆1.2二叉树的性质 2.堆的实现3.实现链式二叉树3.1链式二叉树的概念3.2前中后遍历3.3遍历&#xff08;举例&#xff09; 1.堆的概念和定义 1.1堆 定义&#xff1a;是特殊的二叉树 #mermaid-svg-vWPNPMGSLe0nGNcd {font-family:"trebuch…...

聊一聊自动化测试

目录 一、自动化测试的定义与核心价值 &#xff08;一&#xff09;什么是自动化测试 &#xff08;二&#xff09;核心价值&#xff1a;从人工到智能的跨越 二、自动化测试的发展阶段 &#xff08;一&#xff09;萌芽阶段&#xff08;早期&#xff09; &#xff08;二&…...

vue2 开发一个实习管理系统电脑端-前端静态网站练习

为了快速的掌握vue2的所学习到的知识点&#xff0c;最近又使用vue2和element-ui 做了一个实习管理系统来巩固自己的前端技术&#xff0c;我觉得对于新手来说&#xff0c;多写代码&#xff0c;多找一些项目练习&#xff0c;是提供自己编程能力的一个很好的办法&#xff0c;这也是…...

【Hive入门】Hive基础操作与SQL语法:DML操作全面解析

目录 1 Hive DML操作概述 2 数据加载操作 2.1 LOAD DATA语句 2.2 INSERT语句 3 数据导出操作 3.1 INSERT OVERWRITE DIRECTORY 3.2 使用HDFS命令导出 4 数据更新与删除 4.1 UPDATE语句 4.2 DELETE语句 5 MERGE操作&#xff08;Hive 2.2&#xff09; 6 性能优化建议…...

C++类和对象(上)

目录 类的定义类定义格式访问限定符类域 实例化实例化概念对象大小 this指针C和C语言实现Stack对比 类的定义 类定义格式 在下面的代码中&#xff0c;class为定义类的关键字&#xff0c;Stack为类的名字&#xff0c;{}中为类的主体&#xff0c; 注意类定义结束时后面分号不能省…...

LS2K0300龙芯开发板——智能车竞赛

开启 LS2K0300 调车之旅&#xff08;自己写的自己慢慢更&#xff0c;可能写的不好欢迎指教&#xff09; 欢迎大家一起讨论共同进步&#xff01;逐飞科技针对 LS2K0300 MCU 开发的开源库&#xff0c;涵盖多种实用功能&#xff0c;助力竞赛与产品开发。以下是快速上手指南&#…...

电子病历高质量语料库构建方法与架构项目(智能质控体系建设篇)

引言 随着人工智能技术的迅猛发展,医疗信息化建设正经历着前所未有的变革。电子病历作为医疗机构的核心数据资产,其质量直接关系到临床决策的准确性和医疗安全。传统的病历质控工作主要依赖人工审核,存在效率低下、主观性强、覆盖面有限等问题。近年来,基于人工智能技术的…...

超级创新思路:基于CBAM-Transformer的强化学习时间序列预测模型(Python\matlab实现)

首先声明,该模型为原创!原创!原创!且该思路还未有成果发表,感兴趣的小伙伴可以借鉴!需要完整代码可私信或评论! 本方案可用于医疗、金融、交通、零售、光伏功率预测、估计预测、天气预测、流量预测、故障检测等领域! 目录 首先声明,该模型为原创!原创!原创!且该思…...

JVM——垃圾收集策略

GC的基本问题 什么是GC&#xff1f; GC 是 garbage collection 的缩写&#xff0c;意思是垃圾回收——把内存&#xff08;特别是堆内存&#xff09;中不再使用的空间释放掉&#xff1b;清理不再使用的对象。 为什么要GC&#xff1f; 堆内存是各个线程共享的空间&#xff0c…...

从基础到实战的量化交易全流程学习:1.3 数学与统计学基础——概率与统计基础 | 数字特征

从基础到实战的量化交易全流程学习&#xff1a;1.3 数学与统计学基础——概率与统计基础 | 数字特征 第一部分&#xff1a;概率与统计基础 第2节&#xff1a;数字特征&#xff1a;期望值、方差、协方差与相关系数 一、期望值&#xff08;Expected Value&#xff09;&#xff1a…...

【MySQL】数据类型和表的操作

目录 一. 常用的数据类型 1.数值类型 1.1 整形类型 1.2 浮点型类型 2.字符串类型 char和varchar的区别 如何选择char和varchar 3.日期类型 4.二进制类型 二. 表的操作 1.查看所有表 2.表的创建 3.查看表的结构 4.表的修改 4.1 添加新的列 4.2 修改表中现有的列 4…...

Tauri打包时出现WixTools以及NSIS报错

前言 Tauri构建时会通过github下载Wix和NSIS&#xff0c;由于国内网络限制&#xff0c;所以这个过程基本都会失败&#xff0c;而且你无法使用挂代理的方式解决此问题&#xff0c;唯一的办法就是先下载对于的库&#xff0c;然后把库丢到对应的文件夹内来解决此问题。。。 文章目…...

Linux操作系统学习---进程地址空间

前言: 在学习c,c这些偏底层的语言时,我们常常会对一个变量取地址,一遍对他进行一系列的操作 . 可是 , 这真的是真实的物理地址吗 ? 其实并非如此 , 通过了解进程地址空间,我们就能解开这个困惑. 一、虚拟地址空间的概念: 同地址,不同值的代码示例: 下面通过创建子进程来看一个…...

docker compose -p的踩坑经验

刚才启动ragflow解析了几百个文件&#xff0c;再次启动登录时报错 没有这个账户&#xff0c;心疼token几秒。。。 再次回顾之前的启动方式和当前的启动方式&#xff0c;才发现有出入。 问题&#xff1a; 第一次启动sudo docker compose up -d 第二次启动sudo docker compose -…...

深入理解 Linux 用户管理:从基础到实践

在 Linux 操作系统中&#xff0c;用户管理是确保系统安全、合理分配资源的核心环节。无论是个人开发者搭建本地开发环境&#xff0c;还是运维人员管理企业级服务器集群&#xff0c;熟练掌握 Linux 用户管理都是一项必备技能。本文将从用户管理的基础概念出发&#xff0c;结合实…...

Go语言之路————指针、结构体、方法

Go语言之路————指针、结构体、方法 前言指针结构体声明初始化使用组合引用结构体和指针结构体的标签 方法例子结合结构体总结 前言 我是一名多年Java开发人员&#xff0c;因为工作需要现在要学习go语言&#xff0c;Go语言之路是一个系列&#xff0c;记录着我从0开始接触Go…...

【漫话机器学习系列】227.信息检索与数据挖掘中的常用加权技术(TF-IDF)

在自然语言处理&#xff08;NLP&#xff09;、信息检索&#xff08;IR&#xff09;和数据挖掘&#xff08;DM&#xff09;领域中&#xff0c;TF-IDF 是一种非常经典且常用的加权技术。 无论是搜索引擎排序、文本挖掘&#xff0c;还是特征工程&#xff0c;TF-IDF都扮演着重要角色…...

【音视频】FFmpeg过滤器框架分析

ffmpeg的filter⽤起来是和Gstreamer的plugin是⼀样的概念&#xff0c;通过avfilter_link&#xff0c;将各个创建好的filter按⾃⼰想要的次序链接到⼀起&#xff0c;然后avfilter_graph_config之后&#xff0c;就可以正常使⽤。 ⽐较常⽤的滤镜有&#xff1a;scale、trim、over…...

硬盘损坏数据恢复后对python程序的影响

最近硬盘突然间坏掉了&#xff0c;让数据商恢复了2个月今天终于拿到了恢复后的数据。 但是一测试问题就来了&#xff1a; PS E:\geosystem> python manage.py runserver 0.0.0.0:5000 Unhandled exception in thread started by <function check_errors.<locals>.…...

Azure Devops - 尝试一下在Pipeline中使用Self-hosted Windows agent

1.简单介绍 Azure Devops是微软提供的辅助软件的开发&#xff0c;测试&#xff0c;部署以及计划和进度跟踪的平台&#xff0c;通过Azure Devops可以使开发者&#xff0c;项目经理&#xff0c;运维人员在软件的整个生命周期中更紧密地合作&#xff0c;同时借助Continuous Integ…...

Linux红帽:RHCSA认证知识讲解(十 四)分区管理、交换分区,创建逻辑卷与调整逻辑卷的大小

Linux红帽&#xff1a;RHCSA认证知识讲解&#xff08;十 四&#xff09;分区管理、交换分区&#xff0c;创建逻辑卷与调整逻辑卷的大小 前言一、分区管理&#xff0c;使用fdisk管理分区1.1 找到硬盘1.2 使用fdisk分区1.3 格式化分区1.4 挂载分区 二、创建逻辑卷&#xff0c;调整…...

详解 Unreal Engine(虚幻引擎)

详解 Unreal Engine&#xff08;虚幻引擎&#xff09; Unreal Engine&#xff08;简称 UE&#xff09;是由 Epic Games 开发的一款全球领先的实时渲染引擎&#xff0c;自 1998 年随首款游戏《Unreal》问世以来&#xff0c;已发展成为覆盖 游戏开发、影视制作、建筑可视化、汽车…...

【Linux网络】Http服务优化 - 增加请求后缀、状态码描述、重定向、自动跳转及注册多功能服务

&#x1f4e2;博客主页&#xff1a;https://blog.csdn.net/2301_779549673 &#x1f4e2;博客仓库&#xff1a;https://gitee.com/JohnKingW/linux_test/tree/master/lesson &#x1f4e2;欢迎点赞 &#x1f44d; 收藏 ⭐留言 &#x1f4dd; 如有错误敬请指正&#xff01; &…...

Docker compose 部署微服务项目(从0-1出发纯享版无废话)

目录 一.Docker安装 &#xff08;1&#xff09;安装依赖 &#xff08;2&#xff09;安装Docker &#xff08;3&#xff09;启动Docker服务 &#xff08;4&#xff09;系统配置 &#xff08;5&#xff09;镜像加速配置 &#xff08;6&#xff09;验证安装 二.编写Docke…...

C#学习第19天:多线程

什么是多线程&#xff1f; 定义&#xff1a;多线程允许一个程序分成多个独立的执行路径来进行并发操作。用途&#xff1a;提高程序的执行效率&#xff0c;特别是在I/O操作、计算密集型任务和用户交互中。 多线程核心概念 1. 创建和管理线程 使用 Thread 类 using System; u…...

day7 python针对心脏病数据集预处理

在数据科学与机器学习领域&#xff0c;数据预处理与可视化是挖掘数据价值的关键前置步骤。本文以 heart1.csv 心脑血管疾病数据集为例&#xff0c;借助 Python 中的 pandas、matplotlib、seaborn 以及 scikit-learn 库&#xff0c;详细演示数据加载、缺失值处理、特征相关性分析…...

树莓派学习专题<9>:使用V4L2驱动获取摄像头数据--设定分辨率和帧率

树莓派学习专题&#xff1c;9&#xff1e;&#xff1a;使用V4L2驱动获取摄像头数据--设定分辨率和帧率 1. 设定分辨率2. 设定帧率3. 设定分辨率代码解析4. 获取与设定帧率代码解析5. 实测 1. 设定分辨率 使用如下代码设定摄像头的分辨率&#xff1a; #define CAMERA_RESOLUTI…...

模态链:利用视觉-语言模型从多模态人类视频中学习操作程序

25年4月来自谷歌 DeepMind 和斯坦福大学的论文“Chain-of-Modality: Learning Manipulation Programs from Multimodal Human Videos with Vision-Language-Models”。 从人类视频中学习执行操作任务&#xff0c;是一种很有前景的机器人教学方法。然而&#xff0c;许多操作任务…...

JAVAEE初阶01

个人主页 JavaSE专栏 JAVAEE初阶01 操作系统 1.对下&#xff08;硬件&#xff09;管理各种计算机设备 2.对上&#xff08;软件&#xff09;为各种软件提供一个稳定的运行环境 线程 运行的程序在操作系统中以进程的形式存在 进程是系统分配资源的最小单位 进程与线程的关…...

【网络安全】用 Linux 命令行 CLI 日志文件处理指南

Linux 命令行 CLI 神技回忆录&#xff1a;日志文件处理指南&#xff08;以 Zeek Logs 为例&#xff09; 1. CLI简介2. 基础操作3. 文件读取4. 查找与筛选5. 进阶操作6. Zeek 日志骚操作7. 结语 1. CLI简介 在数据分析的世界里&#xff0c;图形界面&#xff08;GUI&#xff09;…...

[C++] 高精度乘法

目录 引入: 大整数比较比较方法例题1-青蛙计数题目描述 输入描述输出描述输入输出样例AC代码 高精度乘法模版高精度运算小合集(这集乘法上集加法) 注意: 若还没有学过高精度运算的话先去看高精度加法 引入: 大整数比较 比较方法 大整数比较可以使用此方法比较(注释有讲解): …...

反事实——AI与思维模型【82】

一、定义 反事实思维模型是一种心理认知模型,它指的是人们在头脑中对已经发生的事件进行否定,然后构建出一种可能性假设的思维活动。简单来说,就是思考“如果当时……,那么就会……”的情景。这种思维方式让我们能够超越现实的限制,设想不同的可能性和结果,从而对过去的…...

Java学习手册:Java开发常用的内置工具类包

以下是常用 Java 内置工具包。 • 日期时间处理工具包 • java.time包&#xff08;JSR 310&#xff09;&#xff1a;这是 Java 8 引入的一套全新的日期时间 API&#xff0c;旨在替代陈旧的java.util.Date和java.util.Calendar类。其中的LocalDate用于表示不带时区的日期&…...

JAVA多线程(8.0)

目录 线程池 为什么使用线程池 线程池的使用 工厂类Executors&#xff08;工厂模式&#xff09; submit 实现一个线程池 线程池 为什么使用线程池 在前面我们都是通过new Thread() 来创建线程的&#xff0c;虽然在java中对线程的创建、中断、销毁、等值等功能提供了支持…...

通过门店销售明细表用Python Pandas得到每月每个门店的销冠和按月的同比环比数据

假设我在本地有Excel销售表&#xff0c;包含ID主键、门店ID、日期、销售员姓名和销售额&#xff0c;需要用Pandas统计出每个月所有门店和各门店销售额最高的人&#xff0c;不一定是一个人&#xff0c;以及他所在的门店ID和月总销售额。 步骤1&#xff1a;导入数据并处理日期 …...

详解最新链路追踪skywalking框架介绍、架构、环境本地部署配置、整合微服务springcloudalibaba 、日志收集、自定义链路追踪、告警等

1.skywalking介绍 多种监控手段&#xff0c;可以通过语言探针和service mesh 获得监控数据支持多种语言自动探针&#xff0c;包含java/net/nodejs轻量高效&#xff0c;无需大数据平台和大量的服务器资源模块化&#xff0c;UI、存储、集群管理都有多种机制可选支持告警优秀的可…...

【OSG学习笔记】Day 11: 文件格式与数据交换

OSG 常用文件格式简介 在开始转换前,先了解 OSG 生态中常见的文件格式: .osg:OSG 标准二进制格式,存储场景图数据,体积小、加载快,适合实时渲染。 .ive:OSG 标准文本格式,可读性强,便于手动编辑或调试场景图结构(本质是 XML 格式的文本描述)。 .osgb:OSG 二进制格…...

2025.04.26-美团春招笔试题-第二题

📌 点击直达笔试专栏 👉《大厂笔试突围》 💻 春秋招笔试突围在线OJ 👉 笔试突围OJ 02. 曼哈顿距离探测器 问题描述 K小姐正在研发一种城市交通探测器,该探测器能够检测城市中任意两个位置之间的曼哈顿距离是否恰好为特定值。曼哈顿距离是在直角坐标系中,两点之间…...

搭建基于火灾风险预测与防范的消防安全科普小程序

基于微信小程序的消防安全科普互动平台的设计与实现&#xff0c;是关于微信小程序的&#xff0c;知识课程学习&#xff0c;包括学习后答题。 技术栈主要采用微信小程序云开发&#xff0c;有下面的模块&#xff1a; 1.课程学习模块 2.资讯模块 3.答题模块 4.我的模块 还需…...

05--Altium Designer(AD)的详细安装

一、软件的下载 Altium Designer官网下载 1、临近五一的假期&#xff0c;想着搞个项目&#xff0c;且这个项目与PCB有关系&#xff0c;所以就下这个软件来玩玩。下面保姆级教大家安装。 2、选择适合自己的版本下载&#xff08;我安装的是24的&#xff09; 3、软件安装 1.下…...

药监平台上传数据报资源码不存在

问题&#xff1a;电子监管码上传药监平台提示“导入的资源码不存在” 现象&#xff1a;从生产系统导出的关联关系数据包上传到药监平台时显示&#xff1a; 原因&#xff1a;上传数据包的通道的资源码与数据包的资源码不匹配。 解决方法&#xff1a;检查药监平台和生产系统的药…...

DeepSeek预训练追求极致的训练效率的做法

DeepSeek在预训练阶段通过多种技术手段实现了极致的训练效率,其中包括采用FP8混合精度训练框架以降低计算和内存需求 ,创新性地引入Multi-head Latent Attention(MLA)压缩KV缓存以提升推理效率,以及基于Mixture-of-Experts(MoE)的稀疏计算架构以在保证性能的同时显著降低…...

Windows11系统中GIT下载

Windows11系统中GIT下载 0、GIT背景介绍0.0 GIT概述0.1 GIT诞生背景0.2 Linus Torvalds 的设计目标0.3 Git 的诞生&#xff08;2005 年&#xff09;0.4 Git 的后续发展0.5 为什么 Git 能成功&#xff1f; 1、资源下载地址1.1 官网资源1.2 站内资源 2、安装指导3、验证是否下载完…...

Maven的概念与初识Maven

目录 一、Maven的概念 1. 什么是Maven 2. 项目构建&#xff1a;从代码到部署的标准化流程 2.1 Maven构建生命周期 2.2 传统构建 vs Maven构建 3. 依赖管理&#xff1a;解决“JAR地狱”的利器 3.1 依赖声明 3.2 依赖传递与冲突解决 4. Maven仓库&#xff1a;依赖的存储…...

【Android】app调用wallpaperManager.setBitmap的隐藏权限

这是一个杞人忧天的问题&#xff0c;app中&#xff0c;可以通过wallpaperManager.setBitmap来设置壁纸&#xff0c; private void setWallpaper() {// 获取 WallpaperManager 实例WallpaperManager wallpaperManager WallpaperManager.getInstance(getApplicationContext());t…...

ORACLE数据库备份入门:第四部分:2-备份场景举例

下面以4个常见的场景为例&#xff0c;介绍如何规划备份方案。备份方案没有标准答案&#xff0c;需要根据实现情况来制定&#xff0c;也和管理员的个人使用习惯有很大相关性。 1 交易型数据库备份 以银行的交易系统为例&#xff0c;除了前一章节提到的关于RPO和RTO的指标外&am…...

STL中emplace实现原理是什么?

template <class... Args>void emplace_back (Args&&... args);这个是vector的emplace_back方法&#xff0c;用到的c11的语法有三个&#xff0c;分别是万能引用、完美转发、参数包。 参数包中的参数是用来构造vector<T>中的T对象。 假如我直接传的就是一个…...

血泪之arduino库文件找不到ArduinoJSON.h: No such file or directory错误原因

#include <ArduinoJson.h> 始终报这个错误&#xff0c; C:\techxixi_project\Arduino\test\camer\camertoserver\camertoserver.ino:6:10: fatal error: ArduinoJSON.h: No such file or directory 6 | #include <ArduinoJSON.h> | ^~~~~~~~~…...

通过门店销售明细表用PySpark得到每月每个门店的销冠和按月的同比环比数据

假设我在Amazon S3上有销售表的Parquet数据文件的路径&#xff0c;包含ID主键、门店ID、日期、销售员姓名和销售额&#xff0c;需要分别用PySpark的SparkSQL和Dataframe API统计出每个月所有门店和各门店销售额最高的人&#xff0c;不一定是一个人&#xff0c;以及他所在的门店…...