feat: store shard status in Redis
This commit is contained in:
@@ -21,17 +21,16 @@ public class Misc
|
||||
private readonly IDiscordCache _cache;
|
||||
private readonly CpuStatService _cpu;
|
||||
private readonly IMetrics _metrics;
|
||||
private readonly ModelRepository _repo;
|
||||
private readonly ShardInfoService _shards;
|
||||
private readonly ModelRepository _repo;
|
||||
|
||||
public Misc(BotConfig botConfig, IMetrics metrics, CpuStatService cpu, ShardInfoService shards,
|
||||
ModelRepository repo, IDiscordCache cache)
|
||||
public Misc(BotConfig botConfig, IMetrics metrics, CpuStatService cpu, ModelRepository repo, ShardInfoService shards, IDiscordCache cache)
|
||||
{
|
||||
_botConfig = botConfig;
|
||||
_metrics = metrics;
|
||||
_cpu = cpu;
|
||||
_shards = shards;
|
||||
_repo = repo;
|
||||
_shards = shards;
|
||||
_cache = cache;
|
||||
}
|
||||
|
||||
@@ -64,6 +63,8 @@ public class Misc
|
||||
|
||||
var embed = new EmbedBuilder();
|
||||
|
||||
// todo: these will be inaccurate when the bot is actually multi-process
|
||||
|
||||
var messagesReceived = _metrics.Snapshot.GetForContext("Bot").Meters
|
||||
.FirstOrDefault(m => m.MultidimensionalName == BotMetrics.MessagesReceived.Name)?.Value;
|
||||
if (messagesReceived != null)
|
||||
@@ -85,38 +86,52 @@ public class Misc
|
||||
$"{commandsRun.OneMinuteRate * 60:F1}/m ({commandsRun.FifteenMinuteRate * 60:F1}/m over 15m)",
|
||||
true));
|
||||
|
||||
var isCluster = _botConfig.Cluster != null && _botConfig.Cluster.TotalShards != ctx.Cluster.Shards.Count;
|
||||
|
||||
var counts = await _repo.GetStats();
|
||||
var shards = await _shards.GetShards();
|
||||
|
||||
var shardId = ctx.ShardId;
|
||||
var shardTotal = ctx.Cluster.Shards.Count;
|
||||
var shardUpTotal = _shards.Shards.Where(x => x.Connected).Count();
|
||||
var shardInfo = _shards.GetShardInfo(ctx.ShardId);
|
||||
var shardInfo = shards.Where(s => s.ShardId == ctx.ShardId).First();
|
||||
|
||||
// todo: if we're running multiple processes, it is not useful to get the CPU/RAM usage of just the current one
|
||||
var process = Process.GetCurrentProcess();
|
||||
var memoryUsage = process.WorkingSet64;
|
||||
|
||||
var now = SystemClock.Instance.GetCurrentInstant();
|
||||
var shardUptime = now - shardInfo.LastConnectionTime;
|
||||
var now = SystemClock.Instance.GetCurrentInstant().ToUnixTimeSeconds();
|
||||
var shardUptime = Duration.FromSeconds(now - shardInfo.LastConnection);
|
||||
|
||||
var shardTotal = shards.Count();
|
||||
int shardClusterTotal = ctx.Cluster.Shards.Count;
|
||||
var shardUpTotal = shards.Where(x => x.Up && now - x.LastConnection > 60).Count();
|
||||
|
||||
embed
|
||||
.Field(new Embed.Field("Current shard",
|
||||
$"Shard #{shardId} (of {shardTotal} total, {shardUpTotal} are up)", true))
|
||||
$"Shard #{ctx.ShardId} (of {shardTotal} total,"
|
||||
+ (isCluster ? $" {shardClusterTotal} in this cluster," : "") + $" {shardUpTotal} are up)"
|
||||
, true))
|
||||
.Field(new Embed.Field("Shard uptime",
|
||||
$"{shardUptime.FormatDuration()} ({shardInfo.DisconnectionCount} disconnections)", true))
|
||||
.Field(new Embed.Field("CPU usage", $"{_cpu.LastCpuMeasure:P1}", true))
|
||||
.Field(new Embed.Field("Memory usage", $"{memoryUsage / 1024 / 1024} MiB", true))
|
||||
.Field(new Embed.Field("Latency",
|
||||
$"API: {apiLatency.TotalMilliseconds:F0} ms, shard: {shardInfo.ShardLatency.Milliseconds} ms",
|
||||
true))
|
||||
.Field(new Embed.Field("Total numbers", $" {counts.SystemCount:N0} systems,"
|
||||
+ $" {counts.MemberCount:N0} members,"
|
||||
+ $" {counts.GroupCount:N0} groups,"
|
||||
+ $" {counts.SwitchCount:N0} switches,"
|
||||
+ $" {counts.MessageCount:N0} messages"))
|
||||
.Timestamp(process.StartTime.ToString("O"))
|
||||
.Footer(new Embed.EmbedFooter(
|
||||
$"PluralKit {BuildInfoService.Version} • https://github.com/xSke/PluralKit • Last restarted: "));
|
||||
;
|
||||
$"API: {apiLatency.TotalMilliseconds:F0} ms, shard: {shardInfo.Latency} ms",
|
||||
true));
|
||||
|
||||
embed.Field(new("Total numbers", $" {counts.SystemCount:N0} systems,"
|
||||
+ $" {counts.MemberCount:N0} members,"
|
||||
+ $" {counts.GroupCount:N0} groups,"
|
||||
+ $" {counts.SwitchCount:N0} switches,"
|
||||
+ $" {counts.MessageCount:N0} messages"));
|
||||
|
||||
embed
|
||||
.Footer(new(String.Join(" \u2022 ", new[] {
|
||||
$"PluralKit {BuildInfoService.Version}",
|
||||
(isCluster ? $"Cluster {_botConfig.Cluster.NodeIndex}" : ""),
|
||||
"https://github.com/xSke/PluralKit",
|
||||
"Last restarted:",
|
||||
})))
|
||||
.Timestamp(process.StartTime.ToString("O"));
|
||||
|
||||
await ctx.Rest.EditMessage(msg.ChannelId, msg.Id,
|
||||
new MessageEditRequest { Content = "", Embed = embed.Build() });
|
||||
}
|
||||
|
||||
@@ -54,8 +54,8 @@ public class Init
|
||||
logger.Information("Connecting to database");
|
||||
await services.Resolve<IDatabase>().ApplyMigrations();
|
||||
|
||||
// if we're running single-process, clear any existing shard status from the database
|
||||
await services.Resolve<ModelRepository>().ClearShardStatus();
|
||||
// Clear shard status from Redis
|
||||
await redis.Connection.GetDatabase().KeyDeleteAsync("pluralkit:shardstatus");
|
||||
}
|
||||
|
||||
// Init the bot instance itself, register handlers and such to the client before beginning to connect
|
||||
|
||||
@@ -22,8 +22,15 @@
|
||||
</PropertyGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<PackageReference Include="Google.Protobuf" Version="3.13.0"/>
|
||||
<PackageReference Include="Grpc.Net.ClientFactory" Version="2.32.0" />
|
||||
<PackageReference Include="Grpc.Tools" Version="2.37.0" PrivateAssets="All"/>
|
||||
<PackageReference Include="Humanizer.Core" Version="2.8.26"/>
|
||||
<PackageReference Include="Sentry" Version="3.11.1"/>
|
||||
<PackageReference Include="SixLabors.ImageSharp" Version="1.0.2"/>
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Protobuf Include="../proto/discord.proto" GrpcServices="Client" Link="Protos/discord.proto"/>
|
||||
</ItemGroup>
|
||||
</Project>
|
||||
|
||||
@@ -1,11 +1,12 @@
|
||||
using System.Net.WebSockets;
|
||||
|
||||
using App.Metrics;
|
||||
using Google.Protobuf;
|
||||
|
||||
using Myriad.Gateway;
|
||||
|
||||
using NodaTime;
|
||||
using NodaTime.Extensions;
|
||||
|
||||
using StackExchange.Redis;
|
||||
|
||||
using PluralKit.Core;
|
||||
|
||||
@@ -13,30 +14,20 @@ using Serilog;
|
||||
|
||||
namespace PluralKit.Bot;
|
||||
|
||||
// TODO: how much of this do we need now that we have logging in the shard library?
|
||||
// A lot could probably be cleaned up...
|
||||
public class ShardInfoService
|
||||
{
|
||||
private readonly Cluster _client;
|
||||
|
||||
private readonly IDatabase _db;
|
||||
private readonly ILogger _logger;
|
||||
|
||||
private readonly IMetrics _metrics;
|
||||
private readonly ModelRepository _repo;
|
||||
private readonly Cluster _client;
|
||||
private readonly RedisService _redis;
|
||||
private readonly Dictionary<int, ShardInfo> _shardInfo = new();
|
||||
|
||||
public ShardInfoService(ILogger logger, Cluster client, IMetrics metrics, IDatabase db, ModelRepository repo)
|
||||
public ShardInfoService(ILogger logger, Cluster client, RedisService redis)
|
||||
{
|
||||
_client = client;
|
||||
_metrics = metrics;
|
||||
_db = db;
|
||||
_repo = repo;
|
||||
_logger = logger.ForContext<ShardInfoService>();
|
||||
_client = client;
|
||||
_redis = redis;
|
||||
}
|
||||
|
||||
public ICollection<ShardInfo> Shards => _shardInfo.Values;
|
||||
|
||||
public void Init()
|
||||
{
|
||||
// We initialize this before any shards are actually created and connected
|
||||
@@ -44,109 +35,109 @@ public class ShardInfoService
|
||||
_client.ShardCreated += InitializeShard;
|
||||
}
|
||||
|
||||
private void ReportShardStatus()
|
||||
public async Task<IEnumerable<ShardState>> GetShards()
|
||||
{
|
||||
foreach (var (id, shard) in _shardInfo)
|
||||
_metrics.Measure.Gauge.SetValue(BotMetrics.ShardLatency, new MetricTags("shard", id.ToString()),
|
||||
shard.ShardLatency.TotalMilliseconds);
|
||||
_metrics.Measure.Gauge.SetValue(BotMetrics.ShardsConnected, _shardInfo.Count(s => s.Value.Connected));
|
||||
var db = _redis.Connection.GetDatabase();
|
||||
var redisInfo = await db.HashGetAllAsync("pluralkit:shardstatus");
|
||||
return redisInfo.Select(x => Proto.Unmarshal<ShardState>(x.Value));
|
||||
}
|
||||
|
||||
private void InitializeShard(Shard shard)
|
||||
{
|
||||
// Get or insert info in the client dict
|
||||
if (_shardInfo.TryGetValue(shard.ShardId, out var info))
|
||||
_ = Inner();
|
||||
|
||||
async Task Inner()
|
||||
{
|
||||
var db = _redis.Connection.GetDatabase();
|
||||
var redisInfo = await db.HashGetAsync("pluralkit::shardstatus", shard.ShardId);
|
||||
|
||||
// Skip adding listeners if we've seen this shard & already added listeners to it
|
||||
if (info.HasAttachedListeners)
|
||||
if (redisInfo.HasValue)
|
||||
return;
|
||||
|
||||
// latency = 1 (not 0) because otherwise shard 0 would serialize to an empty array, thanks protobuf
|
||||
var state = new ShardState() { ShardId = shard.ShardId, Up = false, Latency = 1 };
|
||||
|
||||
// Register listeners for new shard
|
||||
shard.Resumed += () => ReadyOrResumed(shard);
|
||||
shard.Ready += () => ReadyOrResumed(shard);
|
||||
shard.SocketClosed += (closeStatus, message) => SocketClosed(shard, closeStatus, message);
|
||||
shard.HeartbeatReceived += latency => Heartbeated(shard, latency);
|
||||
|
||||
// Register that we've seen it
|
||||
await db.HashSetAsync("pluralkit:shardstatus", state.HashWrapper());
|
||||
}
|
||||
else
|
||||
{
|
||||
_shardInfo[shard.ShardId] = info = new ShardInfo();
|
||||
}
|
||||
|
||||
// Call our own SocketOpened listener manually (and then attach the listener properly)
|
||||
|
||||
// Register listeners for new shards
|
||||
shard.Resumed += () => ReadyOrResumed(shard);
|
||||
shard.Ready += () => ReadyOrResumed(shard);
|
||||
shard.SocketClosed += (closeStatus, message) => SocketClosed(shard, closeStatus, message);
|
||||
shard.HeartbeatReceived += latency => Heartbeated(shard, latency);
|
||||
|
||||
// Register that we've seen it
|
||||
info.HasAttachedListeners = true;
|
||||
}
|
||||
|
||||
private ShardInfo TryGetShard(Shard shard)
|
||||
private async Task<ShardState?> TryGetShard(Shard shard)
|
||||
{
|
||||
// If we haven't seen this shard before, add it to the dict!
|
||||
// I don't think this will ever occur since the shard number is constant up-front and we handle those
|
||||
// in the RefreshShardList handler above but you never know, I guess~
|
||||
if (!_shardInfo.TryGetValue(shard.ShardId, out var info))
|
||||
_shardInfo[shard.ShardId] = info = new ShardInfo();
|
||||
return info;
|
||||
var db = _redis.Connection.GetDatabase();
|
||||
var redisInfo = await db.HashGetAsync("pluralkit:shardstatus", shard.ShardId);
|
||||
if (redisInfo.HasValue)
|
||||
return Proto.Unmarshal<ShardState>(redisInfo);
|
||||
return null;
|
||||
}
|
||||
|
||||
private void ReadyOrResumed(Shard shard)
|
||||
{
|
||||
var info = TryGetShard(shard);
|
||||
info.LastConnectionTime = SystemClock.Instance.GetCurrentInstant();
|
||||
info.Connected = true;
|
||||
ReportShardStatus();
|
||||
|
||||
_ = ExecuteWithDatabase(async c =>
|
||||
_ = DoAsync(async () =>
|
||||
{
|
||||
await _repo.SetShardStatus(c, shard.ShardId, PKShardInfo.ShardStatus.Up);
|
||||
await _repo.RegisterShardConnection(c, shard.ShardId);
|
||||
var info = await TryGetShard(shard);
|
||||
|
||||
info.LastConnection = (int)SystemClock.Instance.GetCurrentInstant().ToUnixTimeSeconds();
|
||||
info.Up = true;
|
||||
|
||||
var db = _redis.Connection.GetDatabase();
|
||||
await db.HashSetAsync("pluralkit:shardstatus", info.HashWrapper());
|
||||
});
|
||||
}
|
||||
|
||||
private void SocketClosed(Shard shard, WebSocketCloseStatus? closeStatus, string message)
|
||||
{
|
||||
var info = TryGetShard(shard);
|
||||
info.DisconnectionCount++;
|
||||
info.Connected = false;
|
||||
ReportShardStatus();
|
||||
_ = DoAsync(async () =>
|
||||
{
|
||||
var info = await TryGetShard(shard);
|
||||
|
||||
_ = ExecuteWithDatabase(c =>
|
||||
_repo.SetShardStatus(c, shard.ShardId, PKShardInfo.ShardStatus.Down));
|
||||
info.DisconnectionCount++;
|
||||
info.Up = false;
|
||||
|
||||
var db = _redis.Connection.GetDatabase();
|
||||
await db.HashSetAsync("pluralkit:shardstatus", info.HashWrapper());
|
||||
});
|
||||
}
|
||||
|
||||
private void Heartbeated(Shard shard, TimeSpan latency)
|
||||
{
|
||||
var info = TryGetShard(shard);
|
||||
info.LastHeartbeatTime = SystemClock.Instance.GetCurrentInstant();
|
||||
info.Connected = true;
|
||||
info.ShardLatency = latency.ToDuration();
|
||||
_ = DoAsync(async () =>
|
||||
{
|
||||
var info = await TryGetShard(shard);
|
||||
|
||||
_ = ExecuteWithDatabase(c =>
|
||||
_repo.RegisterShardHeartbeat(c, shard.ShardId, latency.ToDuration()));
|
||||
info.LastHeartbeat = (int)SystemClock.Instance.GetCurrentInstant().ToUnixTimeSeconds();
|
||||
info.Up = true;
|
||||
info.Latency = (int)latency.TotalMilliseconds;
|
||||
|
||||
var db = _redis.Connection.GetDatabase();
|
||||
await db.HashSetAsync("pluralkit:shardstatus", info.HashWrapper());
|
||||
});
|
||||
}
|
||||
|
||||
private async Task ExecuteWithDatabase(Func<IPKConnection, Task> fn)
|
||||
private async Task DoAsync(Func<Task> fn)
|
||||
{
|
||||
// wrapper function to log errors, because call sites discard the returned Task (`_ = ...`) and would otherwise never observe exceptions
|
||||
try
|
||||
{
|
||||
await using var conn = await _db.Obtain();
|
||||
await fn(conn);
|
||||
await fn();
|
||||
}
|
||||
catch (Exception e)
|
||||
{
|
||||
_logger.Error(e, "Error persisting shard status");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public ShardInfo GetShardInfo(int shardId) => _shardInfo[shardId];
|
||||
|
||||
public class ShardInfo
|
||||
{
|
||||
public bool Connected;
|
||||
public int DisconnectionCount;
|
||||
public bool HasAttachedListeners;
|
||||
public Instant LastConnectionTime;
|
||||
public Instant LastHeartbeatTime;
|
||||
public Duration ShardLatency;
|
||||
}
|
||||
public static class RedisExt
|
||||
{
|
||||
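// convenience method: wraps a ShardState as a single hash entry (field = shard ID, value = serialized state) for HashSetAsync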
|
||||
public static HashEntry[] HashWrapper(this ShardState state)
|
||||
=> new[] { new HashEntry(state.ShardId, state.ToByteArray()) };
|
||||
}
|
||||
@@ -2,6 +2,32 @@
|
||||
"version": 1,
|
||||
"dependencies": {
|
||||
"net6.0": {
|
||||
"Google.Protobuf": {
|
||||
"type": "Direct",
|
||||
"requested": "[3.13.0, )",
|
||||
"resolved": "3.13.0",
|
||||
"contentHash": "/6VgKCh0P59x/rYsBkCvkUanF0TeUYzwV9hzLIWgt23QRBaKHoxaaMkidEWhKibLR88c3PVCXyyrx9Xlb+Ne6w==",
|
||||
"dependencies": {
|
||||
"System.Memory": "4.5.2",
|
||||
"System.Runtime.CompilerServices.Unsafe": "4.5.2"
|
||||
}
|
||||
},
|
||||
"Grpc.Net.ClientFactory": {
|
||||
"type": "Direct",
|
||||
"requested": "[2.32.0, )",
|
||||
"resolved": "2.32.0",
|
||||
"contentHash": "ixqSWxPK49P+5z6M2dDBHca0k+sXFe2KHHTJK3P+YXp6QOTHv5CHxNdaW8GrFF34Eh1FJ56Q2ADe383+FEAp6Q==",
|
||||
"dependencies": {
|
||||
"Grpc.Net.Client": "2.32.0",
|
||||
"Microsoft.Extensions.Http": "3.0.3"
|
||||
}
|
||||
},
|
||||
"Grpc.Tools": {
|
||||
"type": "Direct",
|
||||
"requested": "[2.37.0, )",
|
||||
"resolved": "2.37.0",
|
||||
"contentHash": "cud/urkbw3QoQ8+kNeCy2YI0sHrh7td/1cZkVbH6hDLIXX7zzmJbV/KjYSiqiYtflQf+S5mJPLzDQWScN/QdDg=="
|
||||
},
|
||||
"Humanizer.Core": {
|
||||
"type": "Direct",
|
||||
"requested": "[2.8.26, )",
|
||||
@@ -135,6 +161,32 @@
|
||||
"System.Diagnostics.DiagnosticSource": "4.5.1"
|
||||
}
|
||||
},
|
||||
"Grpc.Core.Api": {
|
||||
"type": "Transitive",
|
||||
"resolved": "2.32.0",
|
||||
"contentHash": "t9H6P/oYA4ZQI4fWq4eEwq2GmMNqmOSRfz5+YIat7pQuFmz1hRC2Vq/fL9ZVV1mjd5kHqBlhupMdlsBOsaxeEw==",
|
||||
"dependencies": {
|
||||
"System.Memory": "4.5.3"
|
||||
}
|
||||
},
|
||||
"Grpc.Net.Client": {
|
||||
"type": "Transitive",
|
||||
"resolved": "2.32.0",
|
||||
"contentHash": "T4lKl51ahaSprLcgoZvgn8zYwh834DpaPnrDs6jBRdipL2NHIAC0rPeE7UyzDp/lzv4Xll2tw1u65Fg9ckvErg==",
|
||||
"dependencies": {
|
||||
"Grpc.Net.Common": "2.32.0",
|
||||
"Microsoft.Extensions.Logging.Abstractions": "3.0.3",
|
||||
"System.Diagnostics.DiagnosticSource": "4.5.1"
|
||||
}
|
||||
},
|
||||
"Grpc.Net.Common": {
|
||||
"type": "Transitive",
|
||||
"resolved": "2.32.0",
|
||||
"contentHash": "vDsgy6fs+DlsylppjK9FBGTMMUe8vfAmaURV7ZTurM27itr8qBwymgqmwnVB2hcP1q35NqKx2NvPGe5S2IEnDw==",
|
||||
"dependencies": {
|
||||
"Grpc.Core.Api": "2.32.0"
|
||||
}
|
||||
},
|
||||
"IPNetwork2": {
|
||||
"type": "Transitive",
|
||||
"resolved": "2.5.381",
|
||||
@@ -262,6 +314,16 @@
|
||||
"resolved": "3.1.10",
|
||||
"contentHash": "TzHIUBWnzsViPS/20DnC6wf5kXdRAUZlIYwTYOT9S6heuOA4Re//UmHWsDR3PusAzly5dkdDW0RV0dDZ2vEebQ=="
|
||||
},
|
||||
"Microsoft.Extensions.Http": {
|
||||
"type": "Transitive",
|
||||
"resolved": "3.0.3",
|
||||
"contentHash": "dcyB8szIcSynjVZRuFgqkZpPgTc5zeRSj1HMXSmNqWbHYKiPYJl8ZQgBHz6wmZNSUUNGpCs5uxUg8DZHHDC1Ew==",
|
||||
"dependencies": {
|
||||
"Microsoft.Extensions.DependencyInjection.Abstractions": "3.0.3",
|
||||
"Microsoft.Extensions.Logging": "3.0.3",
|
||||
"Microsoft.Extensions.Options": "3.0.3"
|
||||
}
|
||||
},
|
||||
"Microsoft.Extensions.Logging": {
|
||||
"type": "Transitive",
|
||||
"resolved": "3.1.10",
|
||||
@@ -941,6 +1003,11 @@
|
||||
"System.Threading": "4.3.0"
|
||||
}
|
||||
},
|
||||
"System.Memory": {
|
||||
"type": "Transitive",
|
||||
"resolved": "4.5.3",
|
||||
"contentHash": "3oDzvc/zzetpTKWMShs1AADwZjQ/36HnsufHRPcOjyRAAMLDlu2iD33MBI2opxnezcVUtXyqDXXjoFMOU9c7SA=="
|
||||
},
|
||||
"System.Net.Http": {
|
||||
"type": "Transitive",
|
||||
"resolved": "4.3.0",
|
||||
@@ -1469,6 +1536,7 @@
|
||||
"Autofac.Extensions.DependencyInjection": "7.1.0",
|
||||
"Dapper": "2.0.35",
|
||||
"Dapper.Contrib": "2.0.35",
|
||||
"Google.Protobuf": "3.13.0",
|
||||
"Microsoft.Extensions.Caching.Memory": "3.1.10",
|
||||
"Microsoft.Extensions.Configuration": "3.1.10",
|
||||
"Microsoft.Extensions.Configuration.Binder": "3.1.10",
|
||||
|
||||
Reference in New Issue
Block a user