xref: /openbmc/google-misc/subprojects/metrics-ipmi-blobs/metric.cpp (revision 14fe6698f5bb26245bb807eb041eaa7a3c29ac39)
1 // Copyright 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 //      http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #include "metric.hpp"
16 
17 #include "metricblob.pb.h"
18 
19 #include "util.hpp"
20 
21 #include <sys/statvfs.h>
22 
23 #include <phosphor-logging/log.hpp>
24 
25 #include <cstdint>
26 #include <filesystem>
27 #include <sstream>
28 #include <string>
29 #include <string_view>
30 
31 namespace metric_blob
32 {
33 
34 using phosphor::logging::entry;
35 using phosphor::logging::log;
36 using level = phosphor::logging::level;
37 
38 BmcHealthSnapshot::BmcHealthSnapshot() :
39     done(false), stringId(0), ticksPerSec(0)
40 {}
41 
// One row of the process CPU-time table built by getProcStatList().
struct ProcStatEntry
{
    // Command line of the process, as returned by getCmdLine().
    std::string cmdline;
    // tcomm field reported by getTcommUtimeStime().
    std::string tcomm;
    // utime / stime reported by getTcommUtimeStime(). Default-initialized so
    // that a default-constructed entry never carries indeterminate values.
    float utime = 0.0f;
    float stime = 0.0f;

    // Processes with the longest utime + stime are ranked first.
    // Tie breaking is done with cmdline then tcomm.
    bool operator<(const ProcStatEntry& other) const
    {
        // Negating the totals turns the ascending tuple comparison into a
        // descending order on CPU time.
        const float negTime = -(utime + stime);
        const float negOtherTime = -(other.utime + other.stime);
        return std::tie(negTime, cmdline, tcomm) <
               std::tie(negOtherTime, other.cmdline, other.tcomm);
    }
};
59 
60 bmcmetrics::metricproto::BmcProcStatMetric BmcHealthSnapshot::getProcStatList()
61 {
62     constexpr std::string_view procPath = "/proc/";
63 
64     bmcmetrics::metricproto::BmcProcStatMetric ret;
65     std::vector<ProcStatEntry> entries;
66 
67     for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
68     {
69         const std::string& path = procEntry.path();
70         int pid = -1;
71         if (isNumericPath(path, pid))
72         {
73             ProcStatEntry entry;
74 
75             try
76             {
77                 entry.cmdline = getCmdLine(pid);
78                 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
79                 entry.tcomm = t.tcomm;
80                 entry.utime = t.utime;
81                 entry.stime = t.stime;
82 
83                 entries.push_back(entry);
84             }
85             catch (const std::exception& e)
86             {
87                 log<level::ERR>("Could not obtain process stats");
88             }
89         }
90     }
91 
92     std::sort(entries.begin(), entries.end());
93 
94     bool isOthers = false;
95     ProcStatEntry others;
96     others.cmdline = "(Others)";
97     others.utime = others.stime = 0;
98 
99     // Only show this many processes and aggregate all remaining ones into
100     // "others" in order to keep the size of the snapshot reasonably small.
101     // With 10 process stat entries and 10 FD count entries, the size of the
102     // snapshot reaches around 1.5KiB. This is non-trivial, and we have to set
103     // the collection interval long enough so as not to over-stress the IPMI
104     // interface and the data collection service. The value of 10 is chosen
105     // empirically, it might be subject to adjustments when the system is
106     // launched later.
107     constexpr int topN = 10;
108 
109     for (size_t i = 0; i < entries.size(); ++i)
110     {
111         if (i >= topN)
112         {
113             isOthers = true;
114         }
115 
116         ProcStatEntry& entry = entries[i];
117 
118         if (isOthers)
119         {
120             others.utime += entry.utime;
121             others.stime += entry.stime;
122         }
123         else
124         {
125             bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s;
126             std::string fullCmdline = entry.cmdline;
127             if (entry.tcomm.size() > 0)
128             {
129                 fullCmdline += " " + entry.tcomm;
130             }
131             s.set_sidx_cmdline(getStringID(fullCmdline));
132             s.set_utime(entry.utime);
133             s.set_stime(entry.stime);
134             *(ret.add_stats()) = s;
135         }
136     }
137 
138     if (isOthers)
139     {
140         bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s;
141         s.set_sidx_cmdline(getStringID(others.cmdline));
142         s.set_utime(others.utime);
143         s.set_stime(others.stime);
144         *(ret.add_stats()) = s;
145     }
146 
147     return ret;
148 }
149 
// Returns the number of directory entries found in /proc/<pid>/fd.
// Errors from std::filesystem::directory_iterator (for example when the
// directory cannot be opened) propagate to the caller as exceptions.
int getFdCount(int pid)
{
    std::string fdDir = "/proc/";
    fdDir += std::to_string(pid);
    fdDir += "/fd";

    int count = 0;
    const std::filesystem::directory_iterator end{};
    for (std::filesystem::directory_iterator it(fdDir); it != end; ++it)
    {
        ++count;
    }
    return count;
}
156 
// One row of the per-process file descriptor table built by getFdStatList().
struct FdStatEntry
{
    // Number of entries in the process's fd directory (see getFdCount()).
    // Default-initialized so that a default-constructed entry never carries
    // an indeterminate count.
    int fdCount = 0;
    // Command line of the process, as returned by getCmdLine().
    std::string cmdline;
    // tcomm field reported by getTcommUtimeStime().
    std::string tcomm;

    // Processes with the largest fdCount go first.
    // Tie-breaking using cmdline then tcomm.
    bool operator<(const FdStatEntry& other) const
    {
        // The fdCount operands are deliberately swapped between the two
        // tuples: that yields a descending order on fdCount (without having
        // to negate it) while cmdline and tcomm keep their ascending order.
        return std::tie(other.fdCount, cmdline, tcomm) <
               std::tie(fdCount, other.cmdline, other.tcomm);
    }
};
173 
174 bmcmetrics::metricproto::BmcFdStatMetric BmcHealthSnapshot::getFdStatList()
175 {
176     bmcmetrics::metricproto::BmcFdStatMetric ret;
177 
178     // Sort by fd count, no tie-breaking
179     std::vector<FdStatEntry> entries;
180 
181     const std::string_view procPath = "/proc/";
182     for (const auto& procEntry : std::filesystem::directory_iterator(procPath))
183     {
184         const std::string& path = procEntry.path();
185         int pid = 0;
186         FdStatEntry entry;
187         if (isNumericPath(path, pid))
188         {
189             try
190             {
191                 entry.fdCount = getFdCount(pid);
192                 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec);
193                 entry.cmdline = getCmdLine(pid);
194                 entry.tcomm = t.tcomm;
195                 entries.push_back(entry);
196             }
197             catch (const std::exception& e)
198             {
199                 log<level::ERR>("Could not get file descriptor stats");
200             }
201         }
202     }
203 
204     std::sort(entries.begin(), entries.end());
205 
206     bool isOthers = false;
207 
208     // Only report the detailed fd count and cmdline for the top 10 entries,
209     // and collapse all others into "others".
210     constexpr int topN = 10;
211 
212     FdStatEntry others;
213     others.cmdline = "(Others)";
214     others.fdCount = 0;
215 
216     for (size_t i = 0; i < entries.size(); ++i)
217     {
218         if (i >= topN)
219         {
220             isOthers = true;
221         }
222 
223         const FdStatEntry& entry = entries[i];
224         if (isOthers)
225         {
226             others.fdCount += entry.fdCount;
227         }
228         else
229         {
230             bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s;
231             std::string fullCmdline = entry.cmdline;
232             if (entry.tcomm.size() > 0)
233             {
234                 fullCmdline += " " + entry.tcomm;
235             }
236             s.set_sidx_cmdline(getStringID(fullCmdline));
237             s.set_fd_count(entry.fdCount);
238             *(ret.add_stats()) = s;
239         }
240     }
241 
242     if (isOthers)
243     {
244         bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s;
245         s.set_sidx_cmdline(getStringID(others.cmdline));
246         s.set_fd_count(others.fdCount);
247         *(ret.add_stats()) = s;
248     }
249 
250     return ret;
251 }
252 
253 void BmcHealthSnapshot::serializeSnapshotToArray(
254     const bmcmetrics::metricproto::BmcMetricSnapshot& snapshot)
255 {
256     size_t size = snapshot.ByteSizeLong();
257     if (size > 0)
258     {
259         pbDump.resize(size);
260         if (!snapshot.SerializeToArray(pbDump.data(), size))
261         {
262             log<level::ERR>("Could not serialize protobuf to array");
263         }
264     }
265 }
266 
267 void BmcHealthSnapshot::doWork()
268 {
269     bmcmetrics::metricproto::BmcMetricSnapshot snapshot;
270 
271     // Memory info
272     std::string meminfoBuffer = readFileIntoString("/proc/meminfo");
273 
274     {
275         bmcmetrics::metricproto::BmcMemoryMetric m;
276 
277         std::string_view sv(meminfoBuffer.data());
278         // MemAvailable
279         int value;
280         bool ok = parseMeminfoValue(sv, "MemAvailable:", value);
281         if (ok)
282         {
283             m.set_mem_available(value);
284         }
285 
286         ok = parseMeminfoValue(sv, "Slab:", value);
287         if (ok)
288         {
289             m.set_slab(value);
290         }
291 
292         ok = parseMeminfoValue(sv, "KernelStack:", value);
293         if (ok)
294         {
295             m.set_kernel_stack(value);
296         }
297 
298         *(snapshot.mutable_memory_metric()) = m;
299     }
300 
301     // Uptime
302     std::string uptimeBuffer = readFileIntoString("/proc/uptime");
303     double uptime = 0, idleProcessTime = 0;
304     if (parseProcUptime(uptimeBuffer, uptime, idleProcessTime))
305     {
306         bmcmetrics::metricproto::BmcUptimeMetric m1;
307         m1.set_uptime(uptime);
308         m1.set_idle_process_time(idleProcessTime);
309         *(snapshot.mutable_uptime_metric()) = m1;
310     }
311     else
312     {
313         log<level::ERR>("Error parsing /proc/uptime");
314     }
315 
316     // Storage space
317     struct statvfs fiData;
318     if ((statvfs("/", &fiData)) < 0)
319     {
320         log<level::ERR>("Could not call statvfs");
321     }
322     else
323     {
324         uint64_t kib = (fiData.f_bsize * fiData.f_bfree) / 1024;
325         bmcmetrics::metricproto::BmcDiskSpaceMetric m2;
326         m2.set_rwfs_kib_available(static_cast<int>(kib));
327         *(snapshot.mutable_storage_space_metric()) = m2;
328     }
329 
330     // The next metrics require a sane ticks_per_sec value, typically 100 on
331     // the BMC. In the very rare circumstance when it's 0, exit early and return
332     // a partially complete snapshot (no process).
333     ticksPerSec = getTicksPerSec();
334 
335     // FD stat
336     *(snapshot.mutable_fdstat_metric()) = getFdStatList();
337 
338     if (ticksPerSec == 0)
339     {
340         log<level::ERR>("ticksPerSec is 0, skipping the process list metric");
341         serializeSnapshotToArray(snapshot);
342         done = true;
343         return;
344     }
345 
346     // Proc stat
347     *(snapshot.mutable_procstat_metric()) = getProcStatList();
348 
349     // String table
350     std::vector<std::string_view> strings(stringTable.size());
351     for (const auto& [s, i] : stringTable)
352     {
353         strings[i] = s;
354     }
355 
356     bmcmetrics::metricproto::BmcStringTable st;
357     for (size_t i = 0; i < strings.size(); ++i)
358     {
359         bmcmetrics::metricproto::BmcStringTable::StringEntry entry;
360         entry.set_value(strings[i].data());
361         *(st.add_entries()) = entry;
362     }
363     *(snapshot.mutable_string_table()) = st;
364 
365     // Save to buffer
366     serializeSnapshotToArray(snapshot);
367     done = true;
368 }
369 
370 // BmcBlobSessionStat (9) but passing meta as reference instead of pointer,
371 // since the metadata must not be null at this point.
372 bool BmcHealthSnapshot::stat(blobs::BlobMeta& meta)
373 {
374     if (!done)
375     {
376         // Bits 8~15 are blob-specific state flags.
377         // For this blob, bit 8 is set when metric collection is still in
378         // progress.
379         meta.blobState |= (1 << 8);
380     }
381     else
382     {
383         meta.blobState = 0;
384         meta.blobState = blobs::StateFlags::open_read;
385         meta.size = pbDump.size();
386     }
387     return true;
388 }
389 
390 std::string_view BmcHealthSnapshot::read(uint32_t offset,
391                                          uint32_t requestedSize)
392 {
393     uint32_t size = static_cast<uint32_t>(pbDump.size());
394     if (offset >= size)
395     {
396         return {};
397     }
398     return std::string_view(pbDump.data() + offset,
399                             std::min(requestedSize, size - offset));
400 }
401 
402 int BmcHealthSnapshot::getStringID(const std::string_view s)
403 {
404     int ret = 0;
405     auto itr = stringTable.find(s.data());
406     if (itr == stringTable.end())
407     {
408         stringTable[s.data()] = stringId;
409         ret = stringId;
410         ++stringId;
411     }
412     else
413     {
414         ret = itr->second;
415     }
416     return ret;
417 }
418 
419 } // namespace metric_blob