libclues
Linux C++ Tracing Library
Loading...
Searching...
No Matches
Engine.cxx
1// clues
2#include <clues/AutoAttachedTracee.hxx>
3#include <clues/ChildTracee.hxx>
4#include <clues/Engine.hxx>
5#include <clues/EventConsumer.hxx>
6#include <clues/ForeignTracee.hxx>
7#include <clues/format.hxx>
8#include <clues/logger.hxx>
9
10// cosmos
11#include <cosmos/error/ApiError.hxx>
12#include <cosmos/error/InternalError.hxx>
13#include <cosmos/io/ILogger.hxx>
14#include <cosmos/proc/process.hxx>
15
16namespace clues {
17
19 if (!m_tracees.empty()) {
20 try {
21 stop(cosmos::signal::KILL);
22 trace();
23 } catch (const std::exception &ex) {
24 LOG_WARN("Trying to stop remaining tracess in ~Engine():" << ex.what());
25
26 if (!m_tracees.empty()) {
27 LOG_ERROR("Failed to cleanup Engine");
28 std::abort();
29 }
30 }
31 }
32
33 for (auto &pair: m_unknown_events) {
34 LOG_WARN("Unknown event for PID " << cosmos::to_integral(pair.first) << " left unprocessed");
35 }
36}
37
38TraceePtr Engine::addTracee(const cosmos::ProcessID pid, const FollowChildren follow_children,
39 const AttachThreads attach_threads, const cosmos::ProcessID sibling) {
40 TraceePtr sibling_ptr;
41 if (auto it = m_tracees.find(sibling); it != m_tracees.end()) {
42 sibling_ptr = it->second;
43 }
44 auto tracee = std::make_shared<ForeignTracee>(*this, m_consumer, sibling_ptr);
45 tracee->configure(pid);
46 tracee->attach(follow_children, attach_threads);
47 m_tracees[pid] = tracee;
48 return tracee;
49}
50
51TraceePtr Engine::addTracee(const cosmos::StringVector &cmdline, const FollowChildren follow_children) {
52 auto tracee = std::make_shared<ChildTracee>(*this, m_consumer);
53 tracee->create(cmdline);
54 tracee->attach(follow_children);
55 m_tracees[tracee->pid()] = tracee;
56 return tracee;
57}
58
59void Engine::checkCleanupTracee(TraceeMap::iterator it) {
60 auto &tracee = *it->second;
61
62 if (tracee.state() == Tracee::State::DETACHED) {
63 if (!tracee.isChildProcess()) {
64 m_tracees.erase(it);
65 }
66 } else if (tracee.state() == Tracee::State::DEAD) {
67 tracee.doDetach();
68 m_tracees.erase(it);
69 }
70}
71
72void Engine::checkUnknownEvents() {
73 if (m_newly_attached_pid == cosmos::ProcessID::INVALID)
74 return;
75
76 const auto new_pid = m_newly_attached_pid;
77 m_newly_attached_pid = cosmos::ProcessID::INVALID;
78
79 if (auto it = m_unknown_events.find(new_pid); it != m_unknown_events.end()) {
80 if (handleEvent(it->second) != Decision::DONE) {
81 // this should not cause any special outcomes anymore
82 LOG_WARN("delayed handling of unknown event yielded unexpected decision");
83 }
84 m_unknown_events.erase(it);
85 }
86}
87
88void Engine::handleNoChildren() {
89 for (auto it = m_tracees.begin(); it != m_tracees.end(); it++) {
90 auto &tracee = it->second;
91
92 if (tracee->flags()[Tracee::Flag::WAIT_FOR_EXITED]) {
93 /*
94 * This can be observed with the main thread when
95 * another thread (which isn't traced) calls execve().
96 *
97 * For this situation no EXITED wait() status is
98 * reported for the main thread, only PTHREAD_EXIT is
99 * seen. After that we can ECHLD with wait() and ESRCH
100 * with ptrace().
101 */
102 LOG_DEBUG("Tracee "
103 << cosmos::to_integral(tracee->pid())
104 << " likely disappeared because of execve() in another thread");
105 } else {
106 LOG_WARN("Tracee " << cosmos::to_integral(tracee->pid())
107 << " suddenly lost?!");
108 }
109
110 // actually we already seem to be detached, but this function
111 // also takes care of this case and properly resets object
112 // state
113 tracee->detach();
114
115 checkCleanupTracee(it);
116 }
117}
118
120 cosmos::ChildState data;
121
122 while (!m_tracees.empty()) {
123 try {
124 data = *cosmos::proc::wait(cosmos::WaitFlags{
125 cosmos::WaitFlag::WAIT_FOR_EXITED,
126 cosmos::WaitFlag::WAIT_FOR_STOPPED});
127 } catch (const cosmos::ApiError &ex) {
128 if (ex.errnum() == cosmos::Errno::NO_CHILD) {
129 handleNoChildren();
130 return;
131 }
132
133 throw;
134 }
135
136 while (true) {
137 if (const auto decision = handleEvent(data); decision == Decision::RETRY) {
138 // retry if we can process the event the next time.
139 continue;
140 } else if (decision == Decision::STORE) {
141 const auto res = m_unknown_events.insert(
142 std::make_pair(data.child.pid, std::move(data)));
143 if (!res.second) {
144 // we only expect one unknown event to appear per PID (PTRACE_EVENT_STOP)
145 LOG_WARN("additional unknown trace event for PID " <<
146 cosmos::to_integral(data.child.pid));
147 }
148 } else if (decision == Decision::DROP) {
149 LOG_WARN("received unknown trace event " << format::event(data));
150 }
151
152 break;
153 }
154 }
155}
156
157Engine::Decision Engine::handleEvent(const cosmos::ChildState &data) {
158 if (auto it = m_tracees.find(data.child.pid); it != m_tracees.end()) {
159 Tracee &tracee = *it->second;
160 try {
161 tracee.processEvent(data);
162 } catch (const cosmos::ApiError &ex) {
163 auto pid = cosmos::to_integral(tracee.pid());
164 if (ex.errnum() == cosmos::Errno::SEARCH) {
165 /*
166 * this can happen when the process was killed in the
167 * meantime, or in a multi-threaded process when
168 * another thread called execve() in parallel.
169 *
170 * we _should_ still receive an exit notification
171 * about the tracee, and the consumer can then detect
172 * that the system call was interrupted (actually if
173 * this is a system call exit event, then it wasn't
174 * interrupted, but we couldn't finish tracing it.
175 * Kind of a small loophole).
176 */
177 LOG_INFO("tracee " << pid << " disappeared");
178 } else {
179 // something more severe
180 LOG_ERROR("tracee " << pid << " handling process event failed: " << ex.what());
181 }
182 }
183
184 checkCleanupTracee(it);
185 checkUnknownEvents();
186 return Decision::DONE;
187 } else {
188 return checkUnknownTraceeEvent(data);
189 }
190}
191
192Engine::Decision Engine::checkUnknownTraceeEvent(const cosmos::ChildState &data) {
193
194 if (data.trapped() && data.signal->isPtraceEventStop()) {
195 const auto [_, event] = cosmos::ptrace::decode_event(*data.signal);
196 if (event == cosmos::ptrace::Event::EXEC) {
197 /*
198 * This means execve() happened in a multi-threaded
199 * process, but we're not tracing the main thread,
200 * only the exec()'ing thread.
201 *
202 * The exec()'ing thread now has become the main
203 * thread, changing PID personality. Try to recover.
204 */
205 cosmos::Tracee ptrace{data.child.pid};
206 const auto former_pid = ptrace.getPIDEventMsg();
207 LOG_DEBUG("PID " << cosmos::to_integral(former_pid) << " issued execve(), but main thread is not traced. Trying to update records.");
208 if (tryUpdateTraceePID(former_pid, data.child.pid)) {
209 return Decision::RETRY;
210 }
211
212 return Decision::DROP;
213 } else if (event == cosmos::ptrace::Event::STOP) {
214 /*
215 * this can actually happen when an auto-attached
216 * child tracee is created and scheduled before the
217 * creating parent has had a chance to reports its
218 * PTRACE_EVENT_CLONE & friend event.
219 *
220 * Without knowing the relation of parent/child it's
221 * plain confusing to forwarding this to clients of
222 * Engine. Store the event and forward it once we see
223 * the creation event.
224 */
225 LOG_DEBUG("PID " << cosmos::to_integral(data.child.pid)
226 << " likely auto-attached tracee for which we didn't see the CLONE/[V]FORK event yet, storing event for later.");
227 return Decision::STORE;
228 }
229 }
230
231 return Decision::DROP;
232}
233
234bool Engine::tryUpdateTraceePID(const cosmos::ProcessID old_pid, const cosmos::ProcessID new_pid) {
235 auto node = m_tracees.extract(old_pid);
236 if (node.empty())
237 return false;
238 // don't change the Tracee object's PID just yet. This will be done
239 // in Tracee::handleExecEvent()).
240 //
241 // At this stage we just want to be able to lookup the correct Tracee
242 // within Engine for now.
243 node.key() = new_pid;
244
245 m_tracees.insert(std::move(node));
246 return true;
247}
248
249void Engine::stop(const std::optional<cosmos::Signal> signal) {
250 for (auto it = m_tracees.begin(); it != m_tracees.end(); it++) {
251 auto &tracee = *it->second;
252 if (tracee.isChildProcess() && tracee.alive() && signal) {
253 cosmos::signal::send(tracee.pid(), *signal);
254 }
255 tracee.detach();
256
257 if (!tracee.alive()) {
258 m_tracees.erase(it);
259 }
260 }
261}
262
/**
 * Register a newly auto-attached child of `parent` or report vfork completion.
 *
 * For every event other than VFORK_DONE a new AutoAttachedTracee is created
 * for `pid`, configured with `pid`, `event` and `sc`, inserted into
 * `m_tracees` and announced via `m_consumer.newChildProcess()`. Afterwards
 * the new entry is passed to checkCleanupTracee().
 *
 * For VFORK_DONE `m_consumer.vforkComplete()` is invoked with the `m_tracees`
 * entry for `pid`, or with nullptr if there is none.
 *
 * NOTE(review): listing lines 280 and 301 are absent from this extract, so
 * this description may be incomplete - confirm against the original file.
 */
263void Engine::handleAutoAttach(Tracee &parent, const cosmos::ProcessID pid,
264 const cosmos::ptrace::Event event, const SystemCall &sc) {
265
266 LOG_DEBUG("auto-attach for " << cosmos::to_integral(pid));
267
268 if (event != cosmos::ptrace::Event::VFORK_DONE) {
// NOTE(review): operator[] would insert an empty entry if `parent` were not
// part of `m_tracees`; presumably the parent is always registered - confirm.
269 auto tracee = std::make_shared<AutoAttachedTracee>(
270 *this,
271 m_consumer,
272 m_tracees[parent.pid()]);
273
274 tracee->configure(pid, event, sc);
275
// insert() leaves an already existing entry for `pid` untouched; `it` then
// refers to that existing entry instead of the new `tracee`.
276 auto [it, _] = m_tracees.insert({pid, tracee});
277
278 EventConsumer::StatusFlags flags;
279 if (parent.hasClonedThread()) {
// NOTE(review): listing line 280 is missing here; presumably it sets a
// status flag in `flags` for the cloned-thread case - confirm.
281 }
282 m_consumer.newChildProcess(parent, *it->second, event, flags);
283
284 checkCleanupTracee(it);
285 } else {
286 // if the pid is no longer found then it either already died
287 // or it was detached from
288 if (auto it = m_tracees.find(pid); it != m_tracees.end()) {
289 m_consumer.vforkComplete(parent, it->second);
290 } else {
291 m_consumer.vforkComplete(parent, nullptr);
292 }
293 }
294
295 /*
296 * If we have any unknown events stored, we can now check whether they
297 * have a matching object by now. Don't do this right away, because
298 * `parent` is still busy processing its event. Only do this after
299 * that is finished. This is the purpose of this flag.
300 */
// NOTE(review): listing line 301 is missing here; presumably it records `pid`
// in `m_newly_attached_pid`, which checkUnknownEvents() consumes - confirm.
302}
303
304TraceePtr Engine::handleSubstitution(const cosmos::ProcessID old_pid) {
305 /*
306 * The following scenarios exist for multi-threaded processes:
307 *
308 * a) we are tracing all threads of the process. Some thread calls
309 * execve(). We'll see all other threads exiting out of the blue. The
310 * main thread's Tracee object will set the
311 * WAIT_FOR_EXECVE_REPLACEMENT flag. Once the PID personality change
312 * happens we'll recycle the main thread's Tracee object to be used
313 * for further tracing. The reason for this is that the Tracee object
314 * may be a ChildTracee with ownership of a SubProc that needs to live
315 * on.
316 *
317 * b) we are only tracing a thread other than the main thread or the
318 * execve() thread: it will just exit and tracing ends
319 *
320 * c) we are only tracing the execve() thread which is not the main
321 * thread. Upon PID substitution we'll suddenly get a ptrace()
322 * event for a PID we never attached to. Engine decodes the event in
323 * `checkUnknownTraceeEvent()` and will update the key in `m_tracees`,
324 * then feed the event to the Tracee object. In this case the only
325 * available Tracee object will be kept.
326 *
327 * d) we are tracing only the main thread but another thread calls
328 * execve(): the thread disappears and `wait()` suddenly fails
329 * with ENOCHILD.
330 *
331 * What does strace do in these cases?
332 *
333 * - in case c) it sees the ptrace event for the changed PID, then
334 * detaches from it stating it is an unknown PID. Then tracing ends.
335 * - in case d) it fails with ENOCHILD and tracing ends.
336 *
337 * Technically is is possible to deal with case c), which is complex
338 * but managable. It makes sense to continue tracing in this case.
339 *
340 * In case d) it could be argued that is also makes sense to continue
341 * tracing, since the main thread is traced but continues in another
342 * context. Technically it is not possible to do this, though, because
343 * we have no information at all about the execve() that is happening
344 * until we lose the tracee. Thus in this case we try to detach cleanly.
345 * Clients of libclues can identify the exit reason via
346 * Tracee::flags() by looking for Flag::WAIT_FOR_EXECVE_REPLACEMENT,
347 * which will still be set in this scenario.
348 */
349
350 auto it = m_tracees.find(old_pid);
351
352 if (it == m_tracees.end()) {
353 // this is likely case c), the `old_pid` was not traced by us.
354 return nullptr;
355 }
356
357 auto &old_tracee = *it->second;
358
359 try {
360 old_tracee.detach();
361 } catch (const cosmos::ApiError &err) {
362 // it seems the kernel implicitly detaches at this point
363 if (err.errnum() != cosmos::Errno::SEARCH) {
364 throw;
365 }
366 }
367
368 auto ret = it->second;
369
370 m_tracees.erase(it);
371
372 /* The man page says at this stage we should forget about any other
373 * treads of the process that still exist.
374 * Is this really necessary? It also says that it is guaranteed that
375 * only two threads will still exist now: the main and the execve()
376 * thread. So forgetting about any other threads would only be
377 * relevant for the non-execve() thread that still remains. We already
378 * deal successfully with all the constellations that can occur, so I
379 * don't think there's any explicit "forgetting" to be implemented
380 * here.
381 */
382
383 return ret;
384}
385
386} // end ns
TraceeMap m_tracees
Currently active tracees.
Definition Engine.hxx:218
Decision
Different decisions what to do with ptrace events.
Definition Engine.hxx:174
@ DONE
the event has been successfully processed.
Definition Engine.hxx:178
@ STORE
store the event for later.
Definition Engine.hxx:177
@ DROP
ignore/drop the event.
Definition Engine.hxx:176
@ RETRY
retry processing the event.
Definition Engine.hxx:175
Decision checkUnknownTraceeEvent(const cosmos::ChildState &data)
Check the given trace event if we can make sense of it.
Definition Engine.cxx:192
TraceePtr handleSubstitution(const cosmos::ProcessID old_pid)
Invoked by a Tracee when multi-threaded execve() leads to substitution of a PID by another.
Definition Engine.cxx:304
TraceePtr addTracee(const cosmos::ProcessID pid, const FollowChildren follow_children, const AttachThreads attach_threads, const cosmos::ProcessID sibling=cosmos::ProcessID::INVALID)
Add the given pid as tracee.
Definition Engine.cxx:38
EventMap m_unknown_events
Unknown ptrace events stored for later processing.
Definition Engine.hxx:220
cosmos::ProcessID m_newly_attached_pid
The PID of a newly auto-attached Tracee, if any.
Definition Engine.hxx:222
virtual ~Engine()
Tear down any tracees.
Definition Engine.cxx:18
void handleAutoAttach(Tracee &parent, const cosmos::ProcessID pid, const cosmos::ptrace::Event event, const SystemCall &sc)
Invoked by a Tracee once a new child process is auto-attached.
Definition Engine.cxx:263
void stop(const std::optional< cosmos::Signal > signal)
Stop tracing any active tracees.
Definition Engine.cxx:249
void trace()
Enter the tracing main loop and process tracing events.
Definition Engine.cxx:119
@ CLONED_THREAD
used in newChildProcess() to indicate that a new thread has been created.
Access to System Call Data.
Base class for traced processes.
Definition Tracee.hxx:39
bool hasClonedThread() const
Returns whether the current/last seen system call was a clone() for a thread.
Definition Tracee.cxx:237
@ WAIT_FOR_EXITED
we've already seen PTRACE_EVENT_EXIT but are still waiting for CLD_EXITED.
Definition Tracee.hxx:67
bool detach()
Attempt to detach the Tracee.
Definition Tracee.cxx:177
void processEvent(const cosmos::ChildState &data)
Process the given ptrace event.
Definition Tracee.cxx:837
@ DETACHED
we already detached from the tracee
Definition Tracee.hxx:57
@ DEAD
the tracee no longer exists.
Definition Tracee.hxx:56
virtual bool isChildProcess() const
Returns whether the tracee is a child process created by us.
Definition Tracee.hxx:245
cosmos::NamedBool< struct attach_threads_t, true > AttachThreads
A strong boolean type denoting whether to automatically attach all other threads of a process.
Definition types.hxx:27
cosmos::NamedBool< struct follow_children_t, true > FollowChildren
A strong boolean type denoting whether to automatically attach to newly created child processes.
Definition types.hxx:24