0x01 The limax deadlock log
2018-07-02 18:36:34.611 FATAL <LockEnvironmentScheduler.28> Deadlock detected, cycle-size=2
"limax.zdb.procedure.73" id=73 owns=java.util.concurrent.locks.ReentrantReadWriteLock@5a775712[Write locks = 0, Read locks = 6]
at sun.misc.Unsafe.park(Native Method)
at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireInterruptibly(AbstractQueuedSynchronizer.java:897)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireInterruptibly(AbstractQueuedSynchronizer.java:1222)
at java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lockInterruptibly(ReentrantReadWriteLock.java:998)
at limax.util.LockEnvironment$1.lock(LockEnvironment.java:135)
at limax.zdb.Lockey.wLock(Lockey.java:65)
at limax.zdb.TTable.get(TTable.java:101)
at table._Tables_$Syssetting.select(_Tables_.java:1624)
at table.Syssetting.select(Syssetting.java:16)
at game.gameserver.logic.basic.setting.SysSettingOnlineHandler.lambda$onEvent$0(SysSettingOnlineHandler.java:14)
at game.gameserver.logic.basic.setting.SysSettingOnlineHandler$$Lambda$888/800919343.process(Unknown Source)
at limax.zdb.ProcedureImpl.call(ProcedureImpl.java:16)
at limax.zdb.Transaction.perform(Transaction.java:263)
at limax.zdb.ProcedureImpl$ProcedureFuture.run(ProcedureImpl.java:222)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at limax.util.TimeoutExecutor$TimeoutCallable.call(TimeoutExecutor.java:72)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
at limax.util.ConcurrentEnvironment$Worker.run(ConcurrentEnvironment.java:174)
"limax.zdb.procedure.57" id=57 owns=java.util.concurrent.locks.ReentrantReadWriteLock@50dc22af[Write locks = 1, Read locks = 0]
at java.lang.ClassLoader.findBootstrapClass(Native Method)
at java.lang.ClassLoader.findBootstrapClassOrNull(ClassLoader.java:1015)
at java.lang.ClassLoader.loadClass(ClassLoader.java:413)
at java.lang.ClassLoader.loadClass(ClassLoader.java:411)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
at java.lang.ClassLoader.loadClass(ClassLoader.java:411)
at org.springframework.boot.loader.LaunchedURLClassLoader.loadClass(LaunchedURLClassLoader.java:93)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at sun.reflect.generics.factory.CoreReflectionFactory.makeNamedType(CoreReflectionFactory.java:114)
at sun.reflect.generics.visitor.Reifier.visitClassTypeSignature(Reifier.java:125)
at sun.reflect.generics.tree.ClassTypeSignature.accept(ClassTypeSignature.java:49)
at sun.reflect.generics.visitor.Reifier.reifyTypeArguments(Reifier.java:68)
at sun.reflect.generics.visitor.Reifier.visitClassTypeSignature(Reifier.java:138)
at sun.reflect.generics.tree.ClassTypeSignature.accept(ClassTypeSignature.java:49)
at sun.reflect.generics.repository.FieldRepository.getGenericType(FieldRepository.java:85)
at java.lang.reflect.Field.getGenericType(Field.java:247)
at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.getBoundFields(ReflectiveTypeAdapterFactory.java:158)
at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.create(ReflectiveTypeAdapterFactory.java:100)
at com.google.gson.Gson.getAdapter(Gson.java:423)
at com.google.gson.internal.bind.MapTypeAdapterFactory.create(MapTypeAdapterFactory.java:126)
at com.google.gson.Gson.getAdapter(Gson.java:423)
at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.createBoundField(ReflectiveTypeAdapterFactory.java:115)
at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.getBoundFields(ReflectiveTypeAdapterFactory.java:164)
at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.create(ReflectiveTypeAdapterFactory.java:100)
at com.google.gson.Gson.getAdapter(Gson.java:423)
at com.google.gson.Gson.fromJson(Gson.java:886)
at com.google.gson.Gson.fromJson(Gson.java:852)
at com.google.gson.Gson.fromJson(Gson.java:801)
at com.google.gson.Gson.fromJson(Gson.java:773)
at table._Tables_$Role2notifyinfo.unmarshalValue(_Tables_.java:1951)
at table._Tables_$Role2notifyinfo.unmarshalValue(_Tables_.java:1920)
at limax.zdb.TStorage.find(TStorage.java:194)
at limax.zdb.TTable._find(TTable.java:239)
at limax.zdb.TTable.get(TTable.java:105)
at table._Tables_$Role2notifyinfo.select(_Tables_.java:1979)
at table.Role2notifyinfo.select(Role2notifyinfo.java:16)
at game.gameserver.logic.basic.notify.NotifyOnlineHandler.lambda$onEvent$2(NotifyOnlineHandler.java:17)
at game.gameserver.logic.basic.notify.NotifyOnlineHandler$$Lambda$884/26357896.process(Unknown Source)
at limax.zdb.ProcedureImpl.call(ProcedureImpl.java:16)
at limax.zdb.Transaction.perform(Transaction.java:263)
at limax.zdb.ProcedureImpl$ProcedureFuture.run(ProcedureImpl.java:222)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at limax.util.TimeoutExecutor$TimeoutCallable.call(TimeoutExecutor.java:72)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
at limax.util.ConcurrentEnvironment$Worker.run(ConcurrentEnvironment.java:174)
2018-07-02 18:36:34.611 WARN <limax.zdb.procedure.73> Procedure execute
java.lang.IllegalMonitorStateException
at java.util.concurrent.locks.ReentrantReadWriteLock$Sync.tryRelease(ReentrantReadWriteLock.java:371)
at java.util.concurrent.locks.AbstractQueuedSynchronizer.release(AbstractQueuedSynchronizer.java:1261)
at java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.unlock(ReentrantReadWriteLock.java:1131)
at limax.util.LockEnvironment$1.unlock(LockEnvironment.java:152)
at limax.zdb.Lockey.wUnlock(Lockey.java:44)
at limax.zdb.TTable.get(TTable.java:121)
at table._Tables_$Syssetting.select(_Tables_.java:1624)
at table.Syssetting.select(Syssetting.java:16)
at game.gameserver.logic.basic.setting.SysSettingOnlineHandler.lambda$onEvent$0(SysSettingOnlineHandler.java:14)
at limax.zdb.ProcedureImpl.call(ProcedureImpl.java:16)
at limax.zdb.Transaction.perform(Transaction.java:263)
at limax.zdb.ProcedureImpl$ProcedureFuture.run(ProcedureImpl.java:222)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at limax.util.TimeoutExecutor$TimeoutCallable.call(TimeoutExecutor.java:72)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
This deadlock is strange: limax can only detect deadlocks among Lockeys, yet the stack trace captured when the deadlock was reported is sitting inside ClassLoader.
0x02 Lock leak
Before the change:
protected V get(K key, boolean wlock) {
Objects.requireNonNull(key, "key is null");
Transaction current = Transaction.current();
Lockey lockey = Lockeys.getLockey(lockId, key);
if (wlock)
current.wAddLockey(lockey);
else
current.rAddLockey(lockey);
countGet.incrementAndGet();
TRecord<K, V> rCached = current.getCachedTRecord(this, key);
if (rCached != null)
return rCached.getValue();
Lockey cacheLockey = Lockeys.getLockey(0, lockey);
cacheLockey.wLock();
try {
TRecord<K, V> r = cache.get(key);
if (null == r) {
countGetMiss.incrementAndGet();
V value = _find(key);
if (null == value) {
countGetStorageMiss.incrementAndGet();
return null;
}
r = new TRecord<K, V>(this, value, lockey, TRecord.State.INDB_GET);
cache.addNoLog(key, r);
}
current.addCachedTRecord(this, r);
return r.getValue();
} finally {
cacheLockey.wUnlock();
}
}
// Transaction
void rAddLockey(Lockey lockey) {
if (locks.containsKey(lockey))
return;
lockey.rLock();
locks.put(lockey, new LockeyHolder(lockey, false));
}
After the change:
// The select flow:
// 1. First obtain/construct a Lockey A from the Transaction
// 2. If Lockey A was newly created, take its read lock
// 3. Create a new Lockey B and take its write lock
// 4. Load the Record from TTableCache
// 5. Release Lockey B's write lock
// 6. Release Lockey A's read lock
protected V get(K key, boolean wlock) {
Objects.requireNonNull(key, "key is null");
Transaction current = Transaction.current();
Lockey lockey = Lockeys.getLockey(lockId, key);
Lockey cacheLockey = null;
boolean rLockSuccess = true;
try {
if (wlock)
current.wAddLockey(lockey);
else {
// Taking the read lock may throw XDeadlock, in which case the finally block
// must not call rUnlock. XDeadlock itself must not be swallowed either: the
// transaction has to roll back, so it is rethrown.
try {
lockey.rLock();
} catch (XDeadlock e) {
rLockSuccess = false;
throw e;
}
}
countGet.incrementAndGet();
TRecord<K, V> rCached = current.getCachedTRecord(this, key);
if (rCached != null) {
if (wlock) {
return rCached.getValue();
} else {
return doCopy(rCached.getValue());
}
}
cacheLockey = Lockeys.getLockey(0, lockey);
cacheLockey.wLock();
TRecord<K, V> r = cache.get(key);
if (null == r) {
countGetMiss.incrementAndGet();
V value = _find(key);
if (null == value) {
countGetStorageMiss.incrementAndGet();
return null;
}
r = new TRecord<K, V>(this, value, lockey, TRecord.State.INDB_GET);
cache.addNoLog(key, r);
}
current.addCachedTRecord(this, r);
if (wlock) {
return r.getValue();
} else {
return doCopy(r.getValue());
}
} finally {
if (cacheLockey != null) {
cacheLockey.wUnlock();
}
if (!wlock && rLockSuccess) {
lockey.rUnlock();
}
}
}
Code diff: the main changes are that a read now returns a copy of the value, and the read lock is taken by calling rLock directly instead of going through rAddLockey, with the release moved into the finally block. Presumably the copy is needed because the read lock is now released inside get rather than held for the rest of the transaction, so handing out the live record value would no longer be safe.
The problem this causes: if cacheLockey.wUnlock() throws an exception, the lockey.rUnlock() below it is never reached, so the read lock on that Lockey can never be released. Call it a lock leak. Its direct consequence is that the system-wide checkpoint can no longer proceed, and memory eventually blows up.
The fix: the lock leak itself is easy to solve, just add another try...finally inside the finally block.
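A minimal sketch of that nested finally, reusing the names from the listing above (this only shows the shape of the fix, not the actual patched limax source):

} finally {
    try {
        if (cacheLockey != null) {
            cacheLockey.wUnlock();
        }
    } finally {
        if (!wlock && rLockSuccess) {
            lockey.rUnlock();
        }
    }
}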
0x03 Root cause of the deadlock
In the end the lock leak was triggered by the reported deadlock, so the deadlock itself still has to be investigated. Here is the deadlock detection code.
DeadlockDetector(Map<Thread, Object> waiting, Map<Thread, Map<Object, Integer>> holding) {
this.waiting = waiting.entrySet().stream().collect(
Collectors.groupingBy(e -> e.getValue(), Collectors.mapping(e -> e.getKey(), Collectors.toSet())));
this.holding = holding.entrySet().stream()
.collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList<>(e.getValue().keySet())));
this.listing = this.holding.entrySet().stream().filter(e -> waiting.containsKey(e.getKey()))
.sorted((a, b) -> a.getValue().size() - b.getValue().size()).map(e -> e.getKey())
.collect(Collectors.toCollection(LinkedHashSet<Thread>::new));
}
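// Reading the constructor above:
// waiting : lock ref -> set of threads currently waiting on that lock (the input map, inverted)
// holding : thread   -> list of lock refs that thread currently holds
// listing : threads that hold locks while also waiting, i.e. the deadlock candidates,
//           ordered by how few locks they hold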
// No doubt about it, this shockwave of stream code dealt massive damage to me.
// A victim is a thread that appears both in waiting and in holding.
private Collection<Pair<Thread, Object>> test(Thread victim) {
Map<Thread, Pair<Thread, Object>> path = new HashMap<>();
Collection<Thread> tset0 = Arrays.asList(victim);
do {
Collection<Thread> tmp = tset0;
tset0 = null;
for (Thread t0 : tmp) {
Collection<Object> lset = holding.get(t0);
if (lset != null)
for (Object l : lset) {
Collection<Thread> tset1 = waiting.get(l);
if (tset1 != null)
for (Thread t1 : tset1)
// record the wait-for edge: t1 is waiting on lock l, which t0 holds
if (path.putIfAbsent(t1, new Pair<>(t0, l)) == null) {
if (t1 == victim) {
// the wait-for chain has looped back to the thread under test
Deque<Pair<Thread, Object>> list = new ArrayDeque<>();
do {
// walk back along the wait-for edges to recover the cycle
Pair<Thread, Object> it = path.get(t1);
list.addFirst(it);
t1 = it.getKey();
} while (t1 != victim);
return list;
}
if (tset0 == null)
tset0 = new ArrayList<>();
tset0.add(t1);
}
}
}
} while (tset0 != null);
return null;
}
public DetectableLock create(Lock lock, Object ref) {
return new DetectableLock() {
@Override
public void lock() throws InterruptedException {
waiting.put(Thread.currentThread(), ref);
try {
lock.lockInterruptibly();
addHolder(ref);
} finally {
waiting.remove(Thread.currentThread());
}
}
...
}
}
From the analysis above, a select inside get involves two Lockeys: one takes a read lock and the other a write lock. For tables sharing the same lockId every thread can acquire the read lock while only one thread at a time acquires the write lock, and the two locks are always taken in the same fixed order, so the locking alone cannot satisfy the conditions for a deadlock. On top of that, the stack shows the thread inside ClassLoader.findBootstrapClass, which means it cannot possibly have been in a waiting state, so why did it show up in the waiting list? After some thought, the problem appears to lie in the window between waiting.put and waiting.remove: a deadlock detection run fired inside that window, and since nothing makes the detection mutually exclusive with updates to waiting, the snapshots of waiting and holding built in the constructor ended up temporally inconsistent, which is what produced this false deadlock report.
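To make that window concrete, here is a hypothetical, self-contained repro (the class name, the maps and the spinning "detector" below are illustrative only, not the real limax bookkeeping): a worker announces itself in waiting, acquires an uncontended lock immediately, and a snapshot taken before waiting.remove sees it as waiting on a lock it already holds.

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.locks.ReentrantLock;

public class FalseWaiterRepro {
    static final Map<Thread, Object> waiting = new ConcurrentHashMap<>();
    static final Map<Thread, Object> holding = new ConcurrentHashMap<>();

    public static void main(String[] args) throws Exception {
        ReentrantLock lock = new ReentrantLock();
        Thread worker = new Thread(() -> {
            for (int i = 0; i < 1_000_000; i++) {
                waiting.put(Thread.currentThread(), lock); // announce intent to lock
                lock.lock();                               // uncontended, returns at once
                holding.put(Thread.currentThread(), lock); // now in holding *and* still in waiting
                waiting.remove(Thread.currentThread());    // the inconsistent window closes here
                lock.unlock();
                holding.remove(Thread.currentThread());
            }
        });
        worker.start();
        // "Detector": snapshots both maps, with nothing making it mutually
        // exclusive against the worker's updates, just like DeadlockDetector.
        while (worker.isAlive()) {
            Object waitedOn = waiting.get(worker);
            Object held = holding.get(worker);
            if (waitedOn != null && waitedOn == held) {
                System.out.println("false positive: worker appears to wait on a lock it already holds");
                break;
            }
        }
        worker.join();
    }
}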
The fix: send Boss Zeng a chicken drumstick and ask him to think it over. The next day, before the drumstick had even arrived, Boss Zeng came back with a solution; the code follows.
DeadlockDetector(Map<Thread, Object> waiting, Map<Thread, Map<Object, Integer>> holding) {
this.waiting = waiting.entrySet().stream().filter(e -> e.getKey().getState() == Thread.State.WAITING)
.collect(Collectors.groupingBy(e -> e.getValue(),
Collectors.mapping(e -> e.getKey(), Collectors.toSet())));
this.holding = holding.entrySet().stream()
.collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList<>(e.getValue().keySet())));
this.listing = this.holding.entrySet().stream()
.filter(e -> waiting.containsKey(e.getKey()) && e.getKey().getState() == Thread.State.WAITING)
.sorted((a, b) -> a.getValue().size() - b.getValue().size()).map(e -> e.getKey())
.collect(Collectors.toCollection(LinkedHashSet<Thread>::new));
}
The change is to validate the thread state when the waiting and holding snapshots are built: a thread whose state is not WAITING is no longer added to the waiting map or to the listing set, so it simply never takes part in the deadlock detection.
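Why the state filter is enough, in a minimal sketch (the class and variable names here are made up for illustration): a thread genuinely parked inside lockInterruptibly() reports Thread.State.WAITING, while a thread that has already acquired the lock and moved on to run user code, like the ClassLoader/Gson work in the trace above, reports RUNNABLE, so the stale waiting entry gets filtered out of the snapshot.

import java.util.concurrent.locks.ReentrantLock;

public class StateFilterSketch {
    public static void main(String[] args) throws Exception {
        ReentrantLock lock = new ReentrantLock();
        lock.lock(); // main holds the lock, so the thread below has to park
        Thread parked = new Thread(() -> {
            try {
                lock.lockInterruptibly();
                lock.unlock();
            } catch (InterruptedException ignored) {
            }
        });
        parked.start();
        Thread.sleep(100);                                      // give it time to reach the park
        System.out.println(parked.getState());                  // WAITING: parked inside AQS
        System.out.println(Thread.currentThread().getState());  // RUNNABLE: actually executing
        lock.unlock();
        parked.join();
    }
}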