一次死锁引发的思索

Posted by Mr.Cai on July 3, 2018

0x01 limax死锁日志

2018-07-02 18:36:34.611 FATAL <LockEnvironmentScheduler.28> Deadlock detected, cycle-size=2
"limax.zdb.procedure.73" id=73  owns=java.util.concurrent.locks.ReentrantReadWriteLock@5a775712[Write locks = 0, Read locks = 6]
	at sun.misc.Unsafe.park(Native Method)
	at java.util.concurrent.locks.LockSupport.park(LockSupport.java:175)
	at java.util.concurrent.locks.AbstractQueuedSynchronizer.parkAndCheckInterrupt(AbstractQueuedSynchronizer.java:836)
	at java.util.concurrent.locks.AbstractQueuedSynchronizer.doAcquireInterruptibly(AbstractQueuedSynchronizer.java:897)
	at java.util.concurrent.locks.AbstractQueuedSynchronizer.acquireInterruptibly(AbstractQueuedSynchronizer.java:1222)
	at java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.lockInterruptibly(ReentrantReadWriteLock.java:998)
	at limax.util.LockEnvironment$1.lock(LockEnvironment.java:135)
	at limax.zdb.Lockey.wLock(Lockey.java:65)
	at limax.zdb.TTable.get(TTable.java:101)
	at table._Tables_$Syssetting.select(_Tables_.java:1624)
	at table.Syssetting.select(Syssetting.java:16)
	at game.gameserver.logic.basic.setting.SysSettingOnlineHandler.lambda$onEvent$0(SysSettingOnlineHandler.java:14)
	at game.gameserver.logic.basic.setting.SysSettingOnlineHandler$$Lambda$888/800919343.process(Unknown Source)
	at limax.zdb.ProcedureImpl.call(ProcedureImpl.java:16)
	at limax.zdb.Transaction.perform(Transaction.java:263)
	at limax.zdb.ProcedureImpl$ProcedureFuture.run(ProcedureImpl.java:222)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at limax.util.TimeoutExecutor$TimeoutCallable.call(TimeoutExecutor.java:72)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
	at limax.util.ConcurrentEnvironment$Worker.run(ConcurrentEnvironment.java:174)
"limax.zdb.procedure.57" id=57  owns=java.util.concurrent.locks.ReentrantReadWriteLock@50dc22af[Write locks = 1, Read locks = 0]
	at java.lang.ClassLoader.findBootstrapClass(Native Method)
	at java.lang.ClassLoader.findBootstrapClassOrNull(ClassLoader.java:1015)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:413)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:411)
	at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:411)
	at org.springframework.boot.loader.LaunchedURLClassLoader.loadClass(LaunchedURLClassLoader.java:93)
	at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
	at java.lang.Class.forName0(Native Method)
	at java.lang.Class.forName(Class.java:348)
	at sun.reflect.generics.factory.CoreReflectionFactory.makeNamedType(CoreReflectionFactory.java:114)
	at sun.reflect.generics.visitor.Reifier.visitClassTypeSignature(Reifier.java:125)
	at sun.reflect.generics.tree.ClassTypeSignature.accept(ClassTypeSignature.java:49)
	at sun.reflect.generics.visitor.Reifier.reifyTypeArguments(Reifier.java:68)
	at sun.reflect.generics.visitor.Reifier.visitClassTypeSignature(Reifier.java:138)
	at sun.reflect.generics.tree.ClassTypeSignature.accept(ClassTypeSignature.java:49)
	at sun.reflect.generics.repository.FieldRepository.getGenericType(FieldRepository.java:85)
	at java.lang.reflect.Field.getGenericType(Field.java:247)
	at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.getBoundFields(ReflectiveTypeAdapterFactory.java:158)
	at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.create(ReflectiveTypeAdapterFactory.java:100)
	at com.google.gson.Gson.getAdapter(Gson.java:423)
	at com.google.gson.internal.bind.MapTypeAdapterFactory.create(MapTypeAdapterFactory.java:126)
	at com.google.gson.Gson.getAdapter(Gson.java:423)
	at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.createBoundField(ReflectiveTypeAdapterFactory.java:115)
	at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.getBoundFields(ReflectiveTypeAdapterFactory.java:164)
	at com.google.gson.internal.bind.ReflectiveTypeAdapterFactory.create(ReflectiveTypeAdapterFactory.java:100)
	at com.google.gson.Gson.getAdapter(Gson.java:423)
	at com.google.gson.Gson.fromJson(Gson.java:886)
	at com.google.gson.Gson.fromJson(Gson.java:852)
	at com.google.gson.Gson.fromJson(Gson.java:801)
	at com.google.gson.Gson.fromJson(Gson.java:773)
	at table._Tables_$Role2notifyinfo.unmarshalValue(_Tables_.java:1951)
	at table._Tables_$Role2notifyinfo.unmarshalValue(_Tables_.java:1920)
	at limax.zdb.TStorage.find(TStorage.java:194)
	at limax.zdb.TTable._find(TTable.java:239)
	at limax.zdb.TTable.get(TTable.java:105)
	at table._Tables_$Role2notifyinfo.select(_Tables_.java:1979)
	at table.Role2notifyinfo.select(Role2notifyinfo.java:16)
	at game.gameserver.logic.basic.notify.NotifyOnlineHandler.lambda$onEvent$2(NotifyOnlineHandler.java:17)
	at game.gameserver.logic.basic.notify.NotifyOnlineHandler$$Lambda$884/26357896.process(Unknown Source)
	at limax.zdb.ProcedureImpl.call(ProcedureImpl.java:16)
	at limax.zdb.Transaction.perform(Transaction.java:263)
	at limax.zdb.ProcedureImpl$ProcedureFuture.run(ProcedureImpl.java:222)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at limax.util.TimeoutExecutor$TimeoutCallable.call(TimeoutExecutor.java:72)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
	at limax.util.ConcurrentEnvironment$Worker.run(ConcurrentEnvironment.java:174)
2018-07-02 18:36:34.611 WARN  <limax.zdb.procedure.73> Procedure execute
java.lang.IllegalMonitorStateException
	at java.util.concurrent.locks.ReentrantReadWriteLock$Sync.tryRelease(ReentrantReadWriteLock.java:371)
	at java.util.concurrent.locks.AbstractQueuedSynchronizer.release(AbstractQueuedSynchronizer.java:1261)
	at java.util.concurrent.locks.ReentrantReadWriteLock$WriteLock.unlock(ReentrantReadWriteLock.java:1131)
	at limax.util.LockEnvironment$1.unlock(LockEnvironment.java:152)
	at limax.zdb.Lockey.wUnlock(Lockey.java:44)
	at limax.zdb.TTable.get(TTable.java:121)
	at table._Tables_$Syssetting.select(_Tables_.java:1624)
	at table.Syssetting.select(Syssetting.java:16)
	at game.gameserver.logic.basic.setting.SysSettingOnlineHandler.lambda$onEvent$0(SysSettingOnlineHandler.java:14)
	at limax.zdb.ProcedureImpl.call(ProcedureImpl.java:16)
	at limax.zdb.Transaction.perform(Transaction.java:263)
	at limax.zdb.ProcedureImpl$ProcedureFuture.run(ProcedureImpl.java:222)
	at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
	at limax.util.TimeoutExecutor$TimeoutCallable.call(TimeoutExecutor.java:72)
	at java.util.concurrent.FutureTask.run(FutureTask.java:266)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)

这个死锁很诡异,limax的死锁只能监控Lockey的死锁,但是这里面可以看到,发生死锁时的堆栈在ClassLoader里。

0x02 锁泄露

修改前

protected V get(K key, boolean wlock) {
	Objects.requireNonNull(key, "key is null");
	Transaction current = Transaction.current();
	Lockey lockey = Lockeys.getLockey(lockId, key);
	if (wlock)
		current.wAddLockey(lockey);
	else
		current.rAddLockey(lockey);
	countGet.incrementAndGet();
	TRecord<K, V> rCached = current.getCachedTRecord(this, key);
	if (rCached != null)
		return rCached.getValue();
	Lockey cacheLockey = Lockeys.getLockey(0, lockey);
	cacheLockey.wLock();
	try {
		TRecord<K, V> r = cache.get(key);
		if (null == r) {
			countGetMiss.incrementAndGet();
			V value = _find(key);
			if (null == value) {
				countGetStorageMiss.incrementAndGet();
				return null;
			}
			r = new TRecord<K, V>(this, value, lockey, TRecord.State.INDB_GET);
			cache.addNoLog(key, r);
		}
		current.addCachedTRecord(this, r);
		return r.getValue();
	} finally {
		cacheLockey.wUnlock();
	}
}

// Transaction
void rAddLockey(Lockey lockey) {
	if (locks.containsKey(lockey))
		return;
	lockey.rLock();
	locks.put(lockey, new LockeyHolder(lockey, false));
}

修改后

 // select过程
 // 1、先从Transaction中获取/构造一个Lockey A
 // 2、如果是新创建的Lockey A则加读锁
 // 3、新创建一个Lockey B并且拿到写锁
 // 4、从TTableCache加载Record
 // 5、释放Lockey B写锁
 // 6、释放Lockey A读锁
protected V get(K key, boolean wlock) {
	Objects.requireNonNull(key, "key is null");
	Transaction current = Transaction.current();
	Lockey lockey = Lockeys.getLockey(lockId, key);
	Lockey cacheLockey = null;

	boolean rLockSuccess = true;
	try {
		if (wlock)
			current.wAddLockey(lockey);
		else {
			// 拿读锁的时候,可能会抛出XDeadlock,这样 finally里 就不能执行rUnlock,Xdeadlock
			// 这个不能吞掉,需要回滚,所以再次抛出
			try {
				lockey.rLock();
			} catch (XDeadlock e) {
				rLockSuccess = false;
				throw e;
			}
		}
		countGet.incrementAndGet();
		TRecord<K, V> rCached = current.getCachedTRecord(this, key);
		if (rCached != null) {
			if (wlock) {
				return rCached.getValue();
			} else {
				return doCopy(rCached.getValue());
			}
		}
		cacheLockey = Lockeys.getLockey(0, lockey);
		cacheLockey.wLock();
		TRecord<K, V> r = cache.get(key);
		if (null == r) {
			countGetMiss.incrementAndGet();
			V value = _find(key);
			if (null == value) {
				countGetStorageMiss.incrementAndGet();
				return null;
			}
			r = new TRecord<K, V>(this, value, lockey, TRecord.State.INDB_GET);
			cache.addNoLog(key, r);
		}
		current.addCachedTRecord(this, r);
		if (wlock) {
			return r.getValue();
		} else {
			return doCopy(r.getValue());
		}
	} finally {
		if (cacheLockey != null) {
			cacheLockey.wUnlock();
		}
		if (!wlock && rLockSuccess) {
			lockey.rUnlock();
		}
	}
}

代码差异: 可以看到主要的修改就是对拿读锁的时候,进行了一次副本拷贝,并且将拿读锁,从rAddLockey改为对读锁rLock的直接锁定然后在finally里去释放读锁。
导致问题: 加入在cacheLockey.wUnlock()的时候抛出异常,那么下面的lockey.rUnlock()就没有办法走到,这会造成这个Lockey的读锁无法释放,姑且称之锁泄露,锁泄露的直接后果就是造成整个系统的checkpoint无法进行,进而内存爆掉。
解决方法: 锁泄露很好解决,finally代码块里再加个try...finally...嘛。

0x03 死锁原因

归根结底是死锁导致的锁泄露,所以死锁原因还是要查一查的,上死锁检测代码。


DeadlockDetector(Map<Thread, Object> waiting, Map<Thread, Map<Object, Integer>> holding) {
	this.waiting = waiting.entrySet().stream().collect(
			Collectors.groupingBy(e -> e.getValue(), Collectors.mapping(e -> e.getKey(), Collectors.toSet())));
	this.holding = holding.entrySet().stream()
			.collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList<>(e.getValue().keySet())));
	this.listing = this.holding.entrySet().stream().filter(e -> waiting.containsKey(e.getKey()))
			.sorted((a, b) -> a.getValue().size() - b.getValue().size()).map(e -> e.getKey())
			.collect(Collectors.toCollection(LinkedHashSet<Thread>::new));
}

 // 毫无姨问,这种冲击波代码对我造成的伤害是巨大的。
 // victim的条件是既在waiting里,又在holding里的线程集合。
private Collection<Pair<Thread, Object>> test(Thread victim) {
	Map<Thread, Pair<Thread, Object>> path = new HashMap<>();
	Collection<Thread> tset0 = Arrays.asList(victim);
	do {
		Collection<Thread> tmp = tset0;
		tset0 = null;
		for (Thread t0 : tmp) {
			Collection<Object> lset = holding.get(t0);
			if (lset != null)
				for (Object l : lset) {
					Collection<Thread> tset1 = waiting.get(l);
					if (tset1 != null)
						for (Thread t1 : tset1)
							// 记录线程的等待关系
							if (path.putIfAbsent(t1, new Pair<>(t0, l)) == null) {
								if (t1 == victim) {
									// 找到了当前检测的线程的等待关系
									Deque<Pair<Thread, Object>> list = new ArrayDeque<>();
									do {
										// 沿着等待关系图一路走下去,找到一个环
										Pair<Thread, Object> it = path.get(t1);
										list.addFirst(it);
										t1 = it.getKey();
									} while (t1 != victim);
									return list;
								}
								if (tset0 == null)
									tset0 = new ArrayList<>();
								tset0.add(t1);
							}
				}
		}
	} while (tset0 != null);
	return null;
}

public DetectableLock create(Lock lock, Object ref) {
	return new DetectableLock() {
		@Override
		public void lock() throws InterruptedException {
			waiting.put(Thread.currentThread(), ref);
			try {
				lock.lockInterruptibly();
				addHolder(ref);
			} finally {
				waiting.remove(Thread.currentThread());
			}
		}
		...
	}
}

根据上面的分析,我们能看到select的时候在get里面会有两个Lockey,一个拿读锁,一个拿写锁,对于lockId相同的表,读锁都会拿到,但是写锁只会有一个线程拿到,而且读锁和写锁的加锁顺序是固定的,并不能构成死锁条件,而且堆栈显示在ClassLoader.findBootstrapClass中,这就说明这个线程不可能在waiting状态,那为什么它会在waiting列表中?思考之后发现问题可能出现在在waiting.put之后到waiting.remove之间,存在一个竞争窗口,这个时间发起了一次死锁检测,因为死锁检测和waiting更新并没有互斥的关系,导致waiting和holding的副本在初始化的时候时序出现问题,最后导致了这次死锁误报的惨案。

解决办法:给曾总送一个鸡腿,让他思考一下。第二天,鸡腿还没送到,曾总给出解决方案,代码如下。

DeadlockDetector(Map<Thread, Object> waiting, Map<Thread, Map<Object, Integer>> holding) {
	this.waiting = waiting.entrySet().stream().filter(e -> e.getKey().getState() == Thread.State.WAITING)
			.collect(Collectors.groupingBy(e -> e.getValue(),
					Collectors.mapping(e -> e.getKey(), Collectors.toSet())));
	this.holding = holding.entrySet().stream()
			.collect(Collectors.toMap(e -> e.getKey(), e -> new ArrayList<>(e.getValue().keySet())));
	this.listing = this.holding.entrySet().stream()
			.filter(e -> waiting.containsKey(e.getKey()) && e.getKey().getState() == Thread.State.WAITING)
			.sorted((a, b) -> a.getValue().size() - b.getValue().size()).map(e -> e.getKey())
			.collect(Collectors.toCollection(LinkedHashSet<Thread>::new));
}

代码的修改主要就是增加了初始化waiting和holding副本时,对线程状态的校验,如果未处在线程状态未处在WAITING,则不会将该线程添加到waiting和listing队列中,自然也就不会参与到死锁检测的过程中。