1 #include "metric.hpp"
2 
3 #include "metricblob.pb.h"
4 
5 #include "util.hpp"
6 
7 #include <sys/statvfs.h>
8 
9 #include <phosphor-logging/log.hpp>
10 
11 #include <cstdint>
12 #include <filesystem>
13 #include <sstream>
14 #include <string>
15 #include <string_view>
16 
17 namespace metric_blob
18 {
19 
20 using phosphor::logging::entry;
21 using phosphor::logging::log;
22 using level = phosphor::logging::level;
23 
24 BmcHealthSnapshot::BmcHealthSnapshot() :
25     done(false), stringId(0), ticksPerSec(0)
26 {}
27 
// Per-process record used by getProcStatList() to rank processes by the time
// they have consumed.
struct ProcStatEntry
{
    std::string cmdline; // result of getCmdLine() for the process
    std::string tcomm;   // tcomm field returned by getTcommUtimeStime()
    float utime;         // utime field returned by getTcommUtimeStime()
    float stime;         // stime field returned by getTcommUtimeStime()

    // Ordering used for sorting: the entry with the larger utime + stime
    // sorts first. Entries with equal totals are ordered by cmdline, and
    // then by tcomm, both ascending.
    bool operator<(const ProcStatEntry& other) const
    {
        const float total = utime + stime;
        const float otherTotal = other.utime + other.stime;
        if (total > otherTotal)
        {
            return true;
        }
        if (otherTotal > total)
        {
            return false;
        }

        const int cmdlineOrder = cmdline.compare(other.cmdline);
        if (cmdlineOrder != 0)
        {
            return cmdlineOrder < 0;
        }

        return tcomm.compare(other.tcomm) < 0;
    }
};
45 
46 bmcmetrics::metricproto::BmcProcStatMetric BmcHealthSnapshot::getProcStatList()
47 {
48     constexpr std::string_view procPath = "/proc/";
49 
50     bmcmetrics::metricproto::BmcProcStatMetric ret;
51     std::vector<ProcStatEntry> entries;
52 
53     for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
54     {
55         const std::string& path = procEntry.path();
56         int pid = -1;
57         if (isNumericPath(path, pid))
58         {
59             ProcStatEntry entry;
60 
61             try
62             {
63                 entry.cmdline = getCmdLine(pid);
64                 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
65                 entry.tcomm = t.tcomm;
66                 entry.utime = t.utime;
67                 entry.stime = t.stime;
68 
69                 entries.push_back(entry);
70             }
71             catch (const std::exception& e)
72             {
73                 log<level::ERR>("Could not obtain process stats");
74             }
75         }
76     }
77 
78     std::sort(entries.begin(), entries.end());
79 
80     bool isOthers = false;
81     ProcStatEntry others;
82     others.cmdline = "(Others)";
83     others.utime = others.stime = 0;
84 
85     // Only show this many processes and aggregate all remaining ones into
86     // "others" in order to keep the size of the snapshot reasonably small.
87     // With 10 process stat entries and 10 FD count entries, the size of the
88     // snapshot reaches around 1.5KiB. This is non-trivial, and we have to set
89     // the collection interval long enough so as not to over-stress the IPMI
90     // interface and the data collection service. The value of 10 is chosen
91     // empirically, it might be subject to adjustments when the system is
92     // launched later.
93     constexpr int topN = 10;
94 
95     for (size_t i = 0; i < entries.size(); ++i)
96     {
97         if (i >= topN)
98         {
99             isOthers = true;
100         }
101 
102         ProcStatEntry& entry = entries[i];
103 
104         if (isOthers)
105         {
106             others.utime += entry.utime;
107             others.stime += entry.stime;
108         }
109         else
110         {
111             bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s;
112             std::string fullCmdline = entry.cmdline;
113             if (entry.tcomm.size() > 0)
114             {
115                 fullCmdline += " " + entry.tcomm;
116             }
117             s.set_sidx_cmdline(getStringID(fullCmdline));
118             s.set_utime(entry.utime);
119             s.set_stime(entry.stime);
120             *(ret.add_stats()) = s;
121         }
122     }
123 
124     if (isOthers)
125     {
126         bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s;
127         s.set_sidx_cmdline(getStringID(others.cmdline));
128         s.set_utime(others.utime);
129         s.set_stime(others.stime);
130         *(ret.add_stats()) = s;
131     }
132 
133     return ret;
134 }
135 
// Returns the number of directory entries found in /proc/<pid>/fd.
// Errors raised by std::filesystem::directory_iterator (for example when the
// directory cannot be opened) propagate to the caller as exceptions.
int getFdCount(int pid)
{
    std::string fdDir("/proc/");
    fdDir += std::to_string(pid);
    fdDir += "/fd";

    int count = 0;
    const std::filesystem::directory_iterator end{};
    for (std::filesystem::directory_iterator it(fdDir); it != end; ++it)
    {
        ++count;
    }
    return count;
}
142 
// Per-process record used by getFdStatList() to rank processes by the number
// of entries in their /proc/<pid>/fd directory.
struct FdStatEntry
{
    int fdCount;         // result of getFdCount() for the process
    std::string cmdline; // result of getCmdLine() for the process
    std::string tcomm;   // tcomm field returned by getTcommUtimeStime()

    // Ordering used for sorting: the entry with the larger fdCount sorts
    // first. Entries with equal counts are ordered by cmdline, and then by
    // tcomm, both ascending.
    bool operator<(const FdStatEntry& other) const
    {
        if (fdCount > other.fdCount)
        {
            return true;
        }
        if (other.fdCount > fdCount)
        {
            return false;
        }

        const int cmdlineOrder = cmdline.compare(other.cmdline);
        if (cmdlineOrder != 0)
        {
            return cmdlineOrder < 0;
        }

        return tcomm.compare(other.tcomm) < 0;
    }
};
159 
160 bmcmetrics::metricproto::BmcFdStatMetric BmcHealthSnapshot::getFdStatList()
161 {
162     bmcmetrics::metricproto::BmcFdStatMetric ret;
163 
164     // Sort by fd count, no tie-breaking
165     std::vector<FdStatEntry> entries;
166 
167     const std::string_view procPath = "/proc/";
168     for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
169     {
170         const std::string& path = procEntry.path();
171         int pid = 0;
172         FdStatEntry entry;
173         if (isNumericPath(path, pid))
174         {
175             try
176             {
177                 entry.fdCount = getFdCount(pid);
178                 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
179                 entry.cmdline = getCmdLine(pid);
180                 entry.tcomm = t.tcomm;
181                 entries.push_back(entry);
182             }
183             catch (const std::exception& e)
184             {
185                 log<level::ERR>("Could not get file descriptor stats");
186             }
187         }
188     }
189 
190     std::sort(entries.begin(), entries.end());
191 
192     bool isOthers = false;
193 
194     // Only report the detailed fd count and cmdline for the top 10 entries,
195     // and collapse all others into "others".
196     constexpr int topN = 10;
197 
198     FdStatEntry others;
199     others.cmdline = "(Others)";
200     others.fdCount = 0;
201 
202     for (size_t i = 0; i < entries.size(); ++i)
203     {
204         if (i >= topN)
205         {
206             isOthers = true;
207         }
208 
209         const FdStatEntry& entry = entries[i];
210         if (isOthers)
211         {
212             others.fdCount += entry.fdCount;
213         }
214         else
215         {
216             bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s;
217             std::string fullCmdline = entry.cmdline;
218             if (entry.tcomm.size() > 0)
219             {
220                 fullCmdline += " " + entry.tcomm;
221             }
222             s.set_sidx_cmdline(getStringID(fullCmdline));
223             s.set_fd_count(entry.fdCount);
224             *(ret.add_stats()) = s;
225         }
226     }
227 
228     if (isOthers)
229     {
230         bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s;
231         s.set_sidx_cmdline(getStringID(others.cmdline));
232         s.set_fd_count(others.fdCount);
233         *(ret.add_stats()) = s;
234     }
235 
236     return ret;
237 }
238 
239 void BmcHealthSnapshot::serializeSnapshotToArray(
240     const bmcmetrics::metricproto::BmcMetricSnapshot& snapshot)
241 {
242     size_t size = snapshot.ByteSizeLong();
243     if (size > 0)
244     {
245         pbDump.resize(size);
246         if (!snapshot.SerializeToArray(pbDump.data(), size))
247         {
248             log<level::ERR>("Could not serialize protobuf to array");
249         }
250     }
251 }
252 
253 void BmcHealthSnapshot::doWork()
254 {
255     bmcmetrics::metricproto::BmcMetricSnapshot snapshot;
256 
257     // Memory info
258     std::string meminfoBuffer = readFileIntoString("/proc/meminfo");
259 
260     {
261         bmcmetrics::metricproto::BmcMemoryMetric m;
262 
263         std::string_view sv(meminfoBuffer.data());
264         // MemAvailable
265         int value;
266         bool ok = parseMeminfoValue(sv, "MemAvailable:", value);
267         if (ok)
268         {
269             m.set_mem_available(value);
270         }
271 
272         ok = parseMeminfoValue(sv, "Slab:", value);
273         if (ok)
274         {
275             m.set_slab(value);
276         }
277 
278         ok = parseMeminfoValue(sv, "KernelStack:", value);
279         if (ok)
280         {
281             m.set_kernel_stack(value);
282         }
283 
284         *(snapshot.mutable_memory_metric()) = m;
285     }
286 
287     // Uptime
288     std::string uptimeBuffer = readFileIntoString("/proc/uptime");
289     double uptime = 0, idleProcessTime = 0;
290     if (parseProcUptime(uptimeBuffer, uptime, idleProcessTime))
291     {
292         bmcmetrics::metricproto::BmcUptimeMetric m1;
293         m1.set_uptime(uptime);
294         m1.set_idle_process_time(idleProcessTime);
295         *(snapshot.mutable_uptime_metric()) = m1;
296     }
297     else
298     {
299         log<level::ERR>("Error parsing /proc/uptime");
300     }
301 
302     // Storage space
303     struct statvfs fiData;
304     if ((statvfs("/", &fiData)) < 0)
305     {
306         log<level::ERR>("Could not call statvfs");
307     }
308     else
309     {
310         uint64_t kib = (fiData.f_bsize * fiData.f_bfree) / 1024;
311         bmcmetrics::metricproto::BmcDiskSpaceMetric m2;
312         m2.set_rwfs_kib_available(static_cast<int>(kib));
313         *(snapshot.mutable_storage_space_metric()) = m2;
314     }
315 
316     // The next metrics require a sane ticks_per_sec value, typically 100 on
317     // the BMC. In the very rare circumstance when it's 0, exit early and return
318     // a partially complete snapshot (no process).
319     ticksPerSec = getTicksPerSec();
320 
321     // FD stat
322     *(snapshot.mutable_fdstat_metric()) = getFdStatList();
323 
324     if (ticksPerSec == 0)
325     {
326         log<level::ERR>("ticksPerSec is 0, skipping the process list metric");
327         serializeSnapshotToArray(snapshot);
328         done = true;
329         return;
330     }
331 
332     // Proc stat
333     *(snapshot.mutable_procstat_metric()) = getProcStatList();
334 
335     // String table
336     std::vector<std::string_view> strings(stringTable.size());
337     for (const auto& [s, i] : stringTable)
338     {
339         strings[i] = s;
340     }
341 
342     bmcmetrics::metricproto::BmcStringTable st;
343     for (size_t i = 0; i < strings.size(); ++i)
344     {
345         bmcmetrics::metricproto::BmcStringTable::StringEntry entry;
346         entry.set_value(strings[i].data());
347         *(st.add_entries()) = entry;
348     }
349     *(snapshot.mutable_string_table()) = st;
350 
351     // Save to buffer
352     serializeSnapshotToArray(snapshot);
353     done = true;
354 }
355 
356 // BmcBlobSessionStat (9) but passing meta as reference instead of pointer,
357 // since the metadata must not be null at this point.
358 bool BmcHealthSnapshot::stat(blobs::BlobMeta& meta)
359 {
360     if (!done)
361     {
362         // Bits 8~15 are blob-specific state flags.
363         // For this blob, bit 8 is set when metric collection is still in
364         // progress.
365         meta.blobState |= (1 << 8);
366     }
367     else
368     {
369         meta.blobState = 0;
370         meta.blobState = blobs::StateFlags::open_read;
371         meta.size = pbDump.size();
372     }
373     return true;
374 }
375 
376 std::string_view BmcHealthSnapshot::read(uint32_t offset,
377                                          uint32_t requestedSize)
378 {
379     uint32_t size = static_cast<uint32_t>(pbDump.size());
380     if (offset >= size)
381     {
382         return {};
383     }
384     return std::string_view(pbDump.data() + offset,
385                             std::min(requestedSize, size - offset));
386 }
387 
388 int BmcHealthSnapshot::getStringID(const std::string_view s)
389 {
390     int ret = 0;
391     auto itr = stringTable.find(s.data());
392     if (itr == stringTable.end())
393     {
394         stringTable[s.data()] = stringId;
395         ret = stringId;
396         ++stringId;
397     }
398     else
399     {
400         ret = itr->second;
401     }
402     return ret;
403 }
404 
405 } // namespace metric_blob