1 // Copyright 2021 Google LLC 2 // 3 // Licensed under the Apache License, Version 2.0 (the "License"); 4 // you may not use this file except in compliance with the License. 5 // You may obtain a copy of the License at 6 // 7 // http://www.apache.org/licenses/LICENSE-2.0 8 // 9 // Unless required by applicable law or agreed to in writing, software 10 // distributed under the License is distributed on an "AS IS" BASIS, 11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 // See the License for the specific language governing permissions and 13 // limitations under the License. 14 15 #include "metric.hpp" 16 17 #include "metricblob.pb.h" 18 19 #include "util.hpp" 20 21 #include <sys/statvfs.h> 22 23 #include <phosphor-logging/log.hpp> 24 25 #include <cstdint> 26 #include <filesystem> 27 #include <sstream> 28 #include <string> 29 #include <string_view> 30 31 namespace metric_blob 32 { 33 34 using phosphor::logging::entry; 35 using phosphor::logging::log; 36 using level = phosphor::logging::level; 37 38 BmcHealthSnapshot::BmcHealthSnapshot() : 39 done(false), stringId(0), ticksPerSec(0) 40 {} 41 42 struct ProcStatEntry 43 { 44 std::string cmdline; 45 std::string tcomm; 46 float utime; 47 float stime; 48 49 // Processes with the longest utime + stime are ranked first. 50 // Tie breaking is done with cmdline then tcomm. 51 bool operator<(const ProcStatEntry& other) const 52 { 53 const float negTime = -(utime + stime); 54 const float negOtherTime = -(other.utime + other.stime); 55 return std::tie(negTime, cmdline, tcomm) < 56 std::tie(negOtherTime, other.cmdline, other.tcomm); 57 } 58 }; 59 60 bmcmetrics::metricproto::BmcProcStatMetric BmcHealthSnapshot::getProcStatList() 61 { 62 constexpr std::string_view procPath = "/proc/"; 63 64 bmcmetrics::metricproto::BmcProcStatMetric ret; 65 std::vector<ProcStatEntry> entries; 66 67 for (const auto& procEntry : std::filesystem::directory_iterator(procPath)) 68 { 69 const std::string& path = procEntry.path(); 70 int pid = -1; 71 if (isNumericPath(path, pid)) 72 { 73 ProcStatEntry entry; 74 75 try 76 { 77 entry.cmdline = getCmdLine(pid); 78 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec); 79 entry.tcomm = t.tcomm; 80 entry.utime = t.utime; 81 entry.stime = t.stime; 82 83 entries.push_back(entry); 84 } 85 catch (const std::exception& e) 86 { 87 log<level::ERR>("Could not obtain process stats"); 88 } 89 } 90 } 91 92 std::sort(entries.begin(), entries.end()); 93 94 bool isOthers = false; 95 ProcStatEntry others; 96 others.cmdline = "(Others)"; 97 others.utime = others.stime = 0; 98 99 // Only show this many processes and aggregate all remaining ones into 100 // "others" in order to keep the size of the snapshot reasonably small. 101 // With 10 process stat entries and 10 FD count entries, the size of the 102 // snapshot reaches around 1.5KiB. This is non-trivial, and we have to set 103 // the collection interval long enough so as not to over-stress the IPMI 104 // interface and the data collection service. The value of 10 is chosen 105 // empirically, it might be subject to adjustments when the system is 106 // launched later. 107 constexpr int topN = 10; 108 109 for (size_t i = 0; i < entries.size(); ++i) 110 { 111 if (i >= topN) 112 { 113 isOthers = true; 114 } 115 116 ProcStatEntry& entry = entries[i]; 117 118 if (isOthers) 119 { 120 others.utime += entry.utime; 121 others.stime += entry.stime; 122 } 123 else 124 { 125 bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s; 126 std::string fullCmdline = entry.cmdline; 127 if (entry.tcomm.size() > 0) 128 { 129 fullCmdline += " " + entry.tcomm; 130 } 131 s.set_sidx_cmdline(getStringID(fullCmdline)); 132 s.set_utime(entry.utime); 133 s.set_stime(entry.stime); 134 *(ret.add_stats()) = s; 135 } 136 } 137 138 if (isOthers) 139 { 140 bmcmetrics::metricproto::BmcProcStatMetric::BmcProcStat s; 141 s.set_sidx_cmdline(getStringID(others.cmdline)); 142 s.set_utime(others.utime); 143 s.set_stime(others.stime); 144 *(ret.add_stats()) = s; 145 } 146 147 return ret; 148 } 149 150 int getFdCount(int pid) 151 { 152 const std::string& fdPath = "/proc/" + std::to_string(pid) + "/fd"; 153 return std::distance(std::filesystem::directory_iterator(fdPath), 154 std::filesystem::directory_iterator{}); 155 } 156 157 struct FdStatEntry 158 { 159 int fdCount; 160 std::string cmdline; 161 std::string tcomm; 162 163 // Processes with the largest fdCount goes first. 164 // Tie-breaking using cmdline then tcomm. 165 bool operator<(const FdStatEntry& other) const 166 { 167 const int negFdCount = -fdCount; 168 const int negOtherFdCount = -other.fdCount; 169 return std::tie(negFdCount, cmdline, tcomm) < 170 std::tie(negOtherFdCount, other.cmdline, other.tcomm); 171 } 172 }; 173 174 bmcmetrics::metricproto::BmcFdStatMetric BmcHealthSnapshot::getFdStatList() 175 { 176 bmcmetrics::metricproto::BmcFdStatMetric ret; 177 178 // Sort by fd count, no tie-breaking 179 std::vector<FdStatEntry> entries; 180 181 const std::string_view procPath = "/proc/"; 182 for (const auto& procEntry : std::filesystem::directory_iterator(procPath)) 183 { 184 const std::string& path = procEntry.path(); 185 int pid = 0; 186 FdStatEntry entry; 187 if (isNumericPath(path, pid)) 188 { 189 try 190 { 191 entry.fdCount = getFdCount(pid); 192 TcommUtimeStime t = getTcommUtimeStime(pid, ticksPerSec); 193 entry.cmdline = getCmdLine(pid); 194 entry.tcomm = t.tcomm; 195 entries.push_back(entry); 196 } 197 catch (const std::exception& e) 198 { 199 log<level::ERR>("Could not get file descriptor stats"); 200 } 201 } 202 } 203 204 std::sort(entries.begin(), entries.end()); 205 206 bool isOthers = false; 207 208 // Only report the detailed fd count and cmdline for the top 10 entries, 209 // and collapse all others into "others". 210 constexpr int topN = 10; 211 212 FdStatEntry others; 213 others.cmdline = "(Others)"; 214 others.fdCount = 0; 215 216 for (size_t i = 0; i < entries.size(); ++i) 217 { 218 if (i >= topN) 219 { 220 isOthers = true; 221 } 222 223 const FdStatEntry& entry = entries[i]; 224 if (isOthers) 225 { 226 others.fdCount += entry.fdCount; 227 } 228 else 229 { 230 bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s; 231 std::string fullCmdline = entry.cmdline; 232 if (entry.tcomm.size() > 0) 233 { 234 fullCmdline += " " + entry.tcomm; 235 } 236 s.set_sidx_cmdline(getStringID(fullCmdline)); 237 s.set_fd_count(entry.fdCount); 238 *(ret.add_stats()) = s; 239 } 240 } 241 242 if (isOthers) 243 { 244 bmcmetrics::metricproto::BmcFdStatMetric::BmcFdStat s; 245 s.set_sidx_cmdline(getStringID(others.cmdline)); 246 s.set_fd_count(others.fdCount); 247 *(ret.add_stats()) = s; 248 } 249 250 return ret; 251 } 252 253 void BmcHealthSnapshot::serializeSnapshotToArray( 254 const bmcmetrics::metricproto::BmcMetricSnapshot& snapshot) 255 { 256 size_t size = snapshot.ByteSizeLong(); 257 if (size > 0) 258 { 259 pbDump.resize(size); 260 if (!snapshot.SerializeToArray(pbDump.data(), size)) 261 { 262 log<level::ERR>("Could not serialize protobuf to array"); 263 } 264 } 265 } 266 267 void BmcHealthSnapshot::doWork() 268 { 269 bmcmetrics::metricproto::BmcMetricSnapshot snapshot; 270 271 // Memory info 272 std::string meminfoBuffer = readFileIntoString("/proc/meminfo"); 273 274 { 275 bmcmetrics::metricproto::BmcMemoryMetric m; 276 277 std::string_view sv(meminfoBuffer.data()); 278 // MemAvailable 279 int value; 280 bool ok = parseMeminfoValue(sv, "MemAvailable:", value); 281 if (ok) 282 { 283 m.set_mem_available(value); 284 } 285 286 ok = parseMeminfoValue(sv, "Slab:", value); 287 if (ok) 288 { 289 m.set_slab(value); 290 } 291 292 ok = parseMeminfoValue(sv, "KernelStack:", value); 293 if (ok) 294 { 295 m.set_kernel_stack(value); 296 } 297 298 *(snapshot.mutable_memory_metric()) = m; 299 } 300 301 // Uptime 302 std::string uptimeBuffer = readFileIntoString("/proc/uptime"); 303 double uptime = 0, idleProcessTime = 0; 304 if (parseProcUptime(uptimeBuffer, uptime, idleProcessTime)) 305 { 306 bmcmetrics::metricproto::BmcUptimeMetric m1; 307 m1.set_uptime(uptime); 308 m1.set_idle_process_time(idleProcessTime); 309 *(snapshot.mutable_uptime_metric()) = m1; 310 } 311 else 312 { 313 log<level::ERR>("Error parsing /proc/uptime"); 314 } 315 316 // Storage space 317 struct statvfs fiData; 318 if ((statvfs("/", &fiData)) < 0) 319 { 320 log<level::ERR>("Could not call statvfs"); 321 } 322 else 323 { 324 uint64_t kib = (fiData.f_bsize * fiData.f_bfree) / 1024; 325 bmcmetrics::metricproto::BmcDiskSpaceMetric m2; 326 m2.set_rwfs_kib_available(static_cast<int>(kib)); 327 *(snapshot.mutable_storage_space_metric()) = m2; 328 } 329 330 // The next metrics require a sane ticks_per_sec value, typically 100 on 331 // the BMC. In the very rare circumstance when it's 0, exit early and return 332 // a partially complete snapshot (no process). 333 ticksPerSec = getTicksPerSec(); 334 335 // FD stat 336 *(snapshot.mutable_fdstat_metric()) = getFdStatList(); 337 338 if (ticksPerSec == 0) 339 { 340 log<level::ERR>("ticksPerSec is 0, skipping the process list metric"); 341 serializeSnapshotToArray(snapshot); 342 done = true; 343 return; 344 } 345 346 // Proc stat 347 *(snapshot.mutable_procstat_metric()) = getProcStatList(); 348 349 // String table 350 std::vector<std::string_view> strings(stringTable.size()); 351 for (const auto& [s, i] : stringTable) 352 { 353 strings[i] = s; 354 } 355 356 bmcmetrics::metricproto::BmcStringTable st; 357 for (size_t i = 0; i < strings.size(); ++i) 358 { 359 bmcmetrics::metricproto::BmcStringTable::StringEntry entry; 360 entry.set_value(strings[i].data()); 361 *(st.add_entries()) = entry; 362 } 363 *(snapshot.mutable_string_table()) = st; 364 365 // Save to buffer 366 serializeSnapshotToArray(snapshot); 367 done = true; 368 } 369 370 // BmcBlobSessionStat (9) but passing meta as reference instead of pointer, 371 // since the metadata must not be null at this point. 372 bool BmcHealthSnapshot::stat(blobs::BlobMeta& meta) 373 { 374 if (!done) 375 { 376 // Bits 8~15 are blob-specific state flags. 377 // For this blob, bit 8 is set when metric collection is still in 378 // progress. 379 meta.blobState |= (1 << 8); 380 } 381 else 382 { 383 meta.blobState = 0; 384 meta.blobState = blobs::StateFlags::open_read; 385 meta.size = pbDump.size(); 386 } 387 return true; 388 } 389 390 std::string_view BmcHealthSnapshot::read(uint32_t offset, 391 uint32_t requestedSize) 392 { 393 uint32_t size = static_cast<uint32_t>(pbDump.size()); 394 if (offset >= size) 395 { 396 return {}; 397 } 398 return std::string_view(pbDump.data() + offset, 399 std::min(requestedSize, size - offset)); 400 } 401 402 int BmcHealthSnapshot::getStringID(const std::string_view s) 403 { 404 int ret = 0; 405 auto itr = stringTable.find(s.data()); 406 if (itr == stringTable.end()) 407 { 408 stringTable[s.data()] = stringId; 409 ret = stringId; 410 ++stringId; 411 } 412 else 413 { 414 ret = itr->second; 415 } 416 return ret; 417 } 418 419 } // namespace metric_blob