1 #include "metric.hpp" 2 3 #include "metricblob.pb.h" 4 5 #include "util.hpp" 6 7 #include <sys/statvfs.h> 8 9 #include <phosphor-logging/log.hpp> 10 11 #include <cstdint> 12 #include <filesystem> 13 #include <sstream> 14 #include <string> 15 #include <string_view> 16 17 namespace metric_blob 18 { 19 20 using phosphor::logging::entry; 21 using phosphor::logging::log; 22 using level = phosphor::logging::level; 23 24 BmcHealthSnapshot::BmcHealthSnapshot() : 25 done(false), stringId(0), ticksPerSec(0) 26 {} 27 28 struct ProcStatEntry 29 { 30 std::string cmdline; 31 std::string tcomm; 32 float utime; 33 float stime; 34 35 // Processes with the longest utime + stime are ranked first. 36 // Tie breaking is done with cmdline then tcomm. 37 bool operator<(const ProcStatEntry& other) const 38 { 39 const float negTime = -(utime + stime); 40 const float negOtherTime = -(other.utime + other.stime); 41 return std::tie(negTime, cmdline, tcomm) < 42 std::tie(negOtherTime, other.cmdline, other.tcomm); 43 } 44 }; 45 46 bmcmetrics::metricproto::BmcProcStatMetric BmcHealthSnapshot::getProcStatList() 47 { 48 constexpr std::string_view procPath = "/proc/"; 49 50 bmcmetrics::metricproto::BmcProcStatMetric ret; 51 std::vector<ProcStatEntry> entries; 52 53 for (const auto& procEntry : std::filesystem::directory_iterator(procPath)) 54 { 55 const std::string& path = procEntry.path(); 56 int pid = -1; 57 if (isNumericPath(path, pid)) 58 { 59 ProcStatEntry entry; 60 61 try 62 { 63 entry.cmdline = getCmdLine(pid); 64 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec); 65 entry.tcomm = t.tcomm; 66 entry.utime = t.utime; 67 entry.stime = t.stime; 68 69 entries.push_back(entry); 70 } 71 catch (const std::exception& e) 72 { 73 log<level::ERR>("Could not obtain process stats"); 74 } 75 } 76 } 77 78 std::sort(entries.begin(), entries.end()); 79 80 bool isOthers = false; 81 ProcStatEntry others; 82 others.cmdline = "(Others)"; 83 others.utime = others.stime = 0; 84 85 // Only show this many processes and aggregate all remaining ones into 86 // "others" in order to keep the size of the snapshot reasonably small. 87 // With 10 process stat entries and 10 FD count entries, the size of the 88 // snapshot reaches around 1.5KiB. This is non-trivial, and we have to set 89 // the collection interval long enough so as not to over-stress the IPMI 90 // interface and the data collection service. The value of 10 is chosen 91 // empirically, it might be subject to adjustments when the system is 92 // launched later. 93 constexpr int topN = 10; 94 95 for (size_t i = 0; i < entries.size(); ++i) 96 { 97 if (i >= topN) 98 { 99 isOthers = true; 100 } 101 102 ProcStatEntry& entry = entries[i]; 103 104 if (isOthers) 105 { 106 others.utime += entry.utime; 107 others.stime += entry.stime; 108 } 109 else 110 { 111 bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s; 112 std::string fullCmdline = entry.cmdline; 113 if (entry.tcomm.size() > 0) 114 { 115 fullCmdline += " " + entry.tcomm; 116 } 117 s.set_sidx_cmdline(getStringID(fullCmdline)); 118 s.set_utime(entry.utime); 119 s.set_stime(entry.stime); 120 *(ret.add_stats()) = s; 121 } 122 } 123 124 if (isOthers) 125 { 126 bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s; 127 s.set_sidx_cmdline(getStringID(others.cmdline)); 128 s.set_utime(others.utime); 129 s.set_stime(others.stime); 130 *(ret.add_stats()) = s; 131 } 132 133 return ret; 134 } 135 136 int getFdCount(int pid) 137 { 138 const std::string& fdPath = "/proc/" + std::to_string(pid) + "/fd"; 139 return std::distance(std::filesystem::directory_iterator(fdPath), 140 std::filesystem::directory_iterator{}); 141 } 142 143 struct FdStatEntry 144 { 145 int fdCount; 146 std::string cmdline; 147 std::string tcomm; 148 149 // Processes with the largest fdCount goes first. 150 // Tie-breaking using cmdline then tcomm. 151 bool operator<(const FdStatEntry& other) const 152 { 153 const int negFdCount = -fdCount; 154 const int negOtherFdCount = -other.fdCount; 155 return std::tie(negFdCount, cmdline, tcomm) < 156 std::tie(negOtherFdCount, other.cmdline, other.tcomm); 157 } 158 }; 159 160 bmcmetrics::metricproto::BmcFdStatMetric BmcHealthSnapshot::getFdStatList() 161 { 162 bmcmetrics::metricproto::BmcFdStatMetric ret; 163 164 // Sort by fd count, no tie-breaking 165 std::vector<FdStatEntry> entries; 166 167 const std::string_view procPath = "/proc/"; 168 for (const auto& procEntry : std::filesystem::directory_iterator(procPath)) 169 { 170 const std::string& path = procEntry.path(); 171 int pid = 0; 172 FdStatEntry entry; 173 if (isNumericPath(path, pid)) 174 { 175 try 176 { 177 entry.fdCount = getFdCount(pid); 178 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec); 179 entry.cmdline = getCmdLine(pid); 180 entry.tcomm = t.tcomm; 181 entries.push_back(entry); 182 } 183 catch (const std::exception& e) 184 { 185 log<level::ERR>("Could not get file descriptor stats"); 186 } 187 } 188 } 189 190 std::sort(entries.begin(), entries.end()); 191 192 bool isOthers = false; 193 194 // Only report the detailed fd count and cmdline for the top 10 entries, 195 // and collapse all others into "others". 196 constexpr int topN = 10; 197 198 FdStatEntry others; 199 others.cmdline = "(Others)"; 200 others.fdCount = 0; 201 202 for (size_t i = 0; i < entries.size(); ++i) 203 { 204 if (i >= topN) 205 { 206 isOthers = true; 207 } 208 209 const FdStatEntry& entry = entries[i]; 210 if (isOthers) 211 { 212 others.fdCount += entry.fdCount; 213 } 214 else 215 { 216 bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s; 217 std::string fullCmdline = entry.cmdline; 218 if (entry.tcomm.size() > 0) 219 { 220 fullCmdline += " " + entry.tcomm; 221 } 222 s.set_sidx_cmdline(getStringID(fullCmdline)); 223 s.set_fd_count(entry.fdCount); 224 *(ret.add_stats()) = s; 225 } 226 } 227 228 if (isOthers) 229 { 230 bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s; 231 s.set_sidx_cmdline(getStringID(others.cmdline)); 232 s.set_fd_count(others.fdCount); 233 *(ret.add_stats()) = s; 234 } 235 236 return ret; 237 } 238 239 void BmcHealthSnapshot::serializeSnapshotToArray( 240 const bmcmetrics::metricproto::BmcMetricSnapshot& snapshot) 241 { 242 size_t size = snapshot.ByteSizeLong(); 243 if (size > 0) 244 { 245 pbDump.resize(size); 246 if (!snapshot.SerializeToArray(pbDump.data(), size)) 247 { 248 log<level::ERR>("Could not serialize protobuf to array"); 249 } 250 } 251 } 252 253 void BmcHealthSnapshot::doWork() 254 { 255 bmcmetrics::metricproto::BmcMetricSnapshot snapshot; 256 257 // Memory info 258 std::string meminfoBuffer = readFileIntoString("/proc/meminfo"); 259 260 { 261 bmcmetrics::metricproto::BmcMemoryMetric m; 262 263 std::string_view sv(meminfoBuffer.data()); 264 // MemAvailable 265 int value; 266 bool ok = parseMeminfoValue(sv, "MemAvailable:", value); 267 if (ok) 268 { 269 m.set_mem_available(value); 270 } 271 272 ok = parseMeminfoValue(sv, "Slab:", value); 273 if (ok) 274 { 275 m.set_slab(value); 276 } 277 278 ok = parseMeminfoValue(sv, "KernelStack:", value); 279 if (ok) 280 { 281 m.set_kernel_stack(value); 282 } 283 284 *(snapshot.mutable_memory_metric()) = m; 285 } 286 287 // Uptime 288 std::string uptimeBuffer = readFileIntoString("/proc/uptime"); 289 double uptime = 0, idleProcessTime = 0; 290 if (parseProcUptime(uptimeBuffer, uptime, idleProcessTime)) 291 { 292 bmcmetrics::metricproto::BmcUptimeMetric m1; 293 m1.set_uptime(uptime); 294 m1.set_idle_process_time(idleProcessTime); 295 *(snapshot.mutable_uptime_metric()) = m1; 296 } 297 else 298 { 299 log<level::ERR>("Error parsing /proc/uptime"); 300 } 301 302 // Storage space 303 struct statvfs fiData; 304 if ((statvfs("/", &fiData)) < 0) 305 { 306 log<level::ERR>("Could not call statvfs"); 307 } 308 else 309 { 310 uint64_t kib = (fiData.f_bsize * fiData.f_bfree) / 1024; 311 bmcmetrics::metricproto::BmcDiskSpaceMetric m2; 312 m2.set_rwfs_kib_available(static_cast<int>(kib)); 313 *(snapshot.mutable_storage_space_metric()) = m2; 314 } 315 316 // The next metrics require a sane ticks_per_sec value, typically 100 on 317 // the BMC. In the very rare circumstance when it's 0, exit early and return 318 // a partially complete snapshot (no process). 319 ticksPerSec = getTicksPerSec(); 320 321 // FD stat 322 *(snapshot.mutable_fdstat_metric()) = getFdStatList(); 323 324 if (ticksPerSec == 0) 325 { 326 log<level::ERR>("ticksPerSec is 0, skipping the process list metric"); 327 serializeSnapshotToArray(snapshot); 328 done = true; 329 return; 330 } 331 332 // Proc stat 333 *(snapshot.mutable_procstat_metric()) = getProcStatList(); 334 335 // String table 336 std::vector<std::string_view> strings(stringTable.size()); 337 for (const auto& [s, i] : stringTable) 338 { 339 strings[i] = s; 340 } 341 342 bmcmetrics::metricproto::BmcStringTable st; 343 for (size_t i = 0; i < strings.size(); ++i) 344 { 345 bmcmetrics::metricproto::BmcStringTable::StringEntry entry; 346 entry.set_value(strings[i].data()); 347 *(st.add_entries()) = entry; 348 } 349 *(snapshot.mutable_string_table()) = st; 350 351 // Save to buffer 352 serializeSnapshotToArray(snapshot); 353 done = true; 354 } 355 356 // BmcBlobSessionStat (9) but passing meta as reference instead of pointer, 357 // since the metadata must not be null at this point. 358 bool BmcHealthSnapshot::stat(blobs::BlobMeta& meta) 359 { 360 if (!done) 361 { 362 // Bits 8~15 are blob-specific state flags. 363 // For this blob, bit 8 is set when metric collection is still in 364 // progress. 365 meta.blobState |= (1 << 8); 366 } 367 else 368 { 369 meta.blobState = 0; 370 meta.blobState = blobs::StateFlags::open_read; 371 meta.size = pbDump.size(); 372 } 373 return true; 374 } 375 376 std::string_view BmcHealthSnapshot::read(uint32_t offset, 377 uint32_t requestedSize) 378 { 379 uint32_t size = static_cast<uint32_t>(pbDump.size()); 380 if (offset >= size) 381 { 382 return {}; 383 } 384 return std::string_view(pbDump.data() + offset, 385 std::min(requestedSize, size - offset)); 386 } 387 388 int BmcHealthSnapshot::getStringID(const std::string_view s) 389 { 390 int ret = 0; 391 auto itr = stringTable.find(s.data()); 392 if (itr == stringTable.end()) 393 { 394 stringTable[s.data()] = stringId; 395 ret = stringId; 396 ++stringId; 397 } 398 else 399 { 400 ret = itr->second; 401 } 402 return ret; 403 } 404 405 } // namespace metric_blob