entry #1
written by kimapr
submitted at
0 likes
guesses
- kimapr (by oleander)
- oleander (by Makefile_dot_in)
comments 0
battleship.py ASCII text
from learner import Learner
import torch
import random

# train the gamer

W = 10
H = 10

class HalfBattleship:
    def __init__(self):
        self.W = W
        self.H = H
        self.board = [None for i in range(W*H)]
        self.hits = [None for i in range(W*H)]
        self.pieces = []
        # place five ships (lengths 2..6) at random, without overlaps
        for l in [2,3,4,5,6]:
            while True:
                h = random.randint(0, 1) == 1
                if h:
                    x = random.randrange(0, self.W - l + 1)
                    y = random.randrange(0, self.H)
                    row = [(x+dx,y) for dx in range(0, l)]
                else:
                    x = random.randrange(0, self.W)
                    y = random.randrange(0, self.H - l + 1)
                    row = [(x,y+dy) for dy in range(0, l)]
                if len([v for v in filter(lambda o: self.get(o[0], o[1])[0] is not None, row)]) == 0:
                    for x, y in row:
                        self.board[x + y*self.W] = True
                    self.pieces.append((x,y,h))
                    break

    def get(self, x, y):
        return (self.board[x + y*self.W], self.hits[x + y*self.W])

    def state(self):
        # two inputs per cell: (revealed hit, revealed miss)
        return [item for it in [
            [
                1 if x is not None and x else 0,
                1 if x is not None and not x else 0
            ] for x in self.hits
        ] for item in it]

    def actions(self):
        # one entry per cell; None once the cell has already been fired on
        return [(x,y) if self.get(x,y)[1] is None else None for y in range(self.H) for x in range(self.W)]

    def get_str(self, x, y):
        piece, hit = self.get(x, y)
        if hit is not None:
            return '#' if hit else '.'
        return 'o' if piece is not None else ' '

    def print(self, other):
        if self.W != other.W or self.H != other.H:
            raise ValueError('dimensions unequal')
        print('/'+''.join(['-' for i in range(0,self.W+self.W-1)])+'\\ /'+
              ''.join(['-' for i in range(0,other.W+other.W-1)])+'\\')
        for y in range(self.H):
            print("|" + (' '.join([self.get_str(x,y) for x in range(self.W)])) + "| |" +
                  (' '.join([other.get_str(x,y) for x in range(other.W)]))+'|')
        print('\\'+''.join(['-' for i in range(0,self.W+self.W-1)])+'/ \\'+
              ''.join(['-' for i in range(0,other.W+other.W-1)])+'/')

    def play(self, x, y):
        piece, hit = self.get(x, y)
        if hit is not None:
            return None
        hit = True if piece else False
        self.hits[x + y*self.W] = hit
        # the game ends once every ship cell has been hit
        if len([v for v in filter(lambda v: v[0] is not None and v[1] is None,
                [self.get(x,y) for y in range(self.H) for x in range(self.W)])]) == 0:
            return (True, True)
        return (False, hit)

if __name__ == '__main__':
    state = None
    try:
        state = torch.load("model-bship.pt", weights_only = True)
    except:
        pass

    players = [1,2]
    games = [HalfBattleship() for pl in players]
    states = [None for pl in players]
    acts = [None for pl in players]
    next_states = [None for pl in players]
    agent = Learner(len(games[0].actions()), len(games[0].state()), state = state, EPS_DECAY=10000)

    while True:
        for i, pl in enumerate(players):
            print()
            game = games[i]
            states[i] = game.state()
            next_states[i] = None
            state = states[i]
            act = None
            actions = game.actions()
            # punish shots at already-revealed cells until a legal one is chosen
            while act is None:
                act = agent.decide(state)
                if actions[act] is None:
                    print('badact', act)
                    agent.learn(state, act, state, -1)
                    act = None
            acts[i] = act
            term, winner = game.play(actions[act][0], actions[act][1])
            print("state:",(term,winner))
            next_states[i] = game.state()
            print("learn", 1 if winner else -0.1)
            agent.learn(states[i], acts[i], next_states[i], 1 if winner else 0)
            other_reward = 0
            if term and winner:
                print()
                print("@@@@")
                print("@@@@")
                print("@@@@ WINNER :"+str(pl)+" @@@@")
                print("@@@@")
                print("@@@@")
                print()
            coverage = len([v for v in filter(lambda v: v is not None, game.hits)]) / len(game.hits)
            print(str(round(coverage*100))+"% covered")
            games[0].print(games[1])
            torch.save(agent.get_model_state(), "model-bship.pt")
            if term or coverage > 0.6:
                games = [HalfBattleship() for pl in players]
                states = [None for pl in players]
                acts = [None for pl in players]
                next_states = [None for pl in players]
                break
gamer.py ASCII text
from learner import Learner
from battleship import HalfBattleship
import torch
import random

state = None
try:
    state = torch.load("model-bship.pt", weights_only = True)
except:
    pass

game = HalfBattleship()
agent = Learner(len(game.actions()), len(game.state()), state = state, layers = [100,100])

# announce the agent's own ship placements: orientation, row letter, column digit
for x, y, h in game.pieces:
    print(('-' if h else '|')+("ABCDEFGHIJ")[y]+("0123456789")[x])

gaming = True
while gaming:
    act = None
    actions = game.actions()
    act = agent.decide(game.state(), final=True)
    while actions[act] is None:
        act = random.randrange(0, len(actions))
    # print the shot as row letter + column digit, then wait for the verdict
    print(("ABCDEFGHIJ")[actions[act][1]]+("0123456789")[actions[act][0]])
    while True:
        try:
            line = input()
        except:
            gaming = False
            break
        if line == "WIN":
            gaming = False
            break
        if line == "MISS" or line == "HIT":
            game.hits[actions[act][0] + actions[act][1]*game.W] = line == "HIT"
            break
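gamer.py speaks a simple line protocol over stdin/stdout: it first prints one placement line per ship (orientation character, row letter, column digit), then repeatedly prints a shot such as "B3" and waits for "HIT", "MISS" or "WIN" on standard input. A minimal scripted opponent, sketched under the assumption that gamer.py and battleship.py sit in the working directory (this driver is illustrative, not part of the entry):

import subprocess
from battleship import HalfBattleship

board = HalfBattleship()                      # hidden board the agent fires at
proc = subprocess.Popen(
    ["python3", "-u", "gamer.py"],
    stdin=subprocess.PIPE, stdout=subprocess.PIPE, text=True,
)

for _ in range(5):                            # one placement line per ship (lengths 2..6)
    print("agent ship:", proc.stdout.readline().strip())

rows = "ABCDEFGHIJ"
while True:
    guess = proc.stdout.readline().strip()    # e.g. "B3": row letter + column digit
    x, y = int(guess[1]), rows.index(guess[0])
    term, hit = board.play(x, y)
    if term:                                  # every ship cell has been hit
        proc.stdin.write("WIN\n")
        proc.stdin.flush()
        break
    proc.stdin.write("HIT\n" if hit else "MISS\n")
    proc.stdin.flush()
proc.wait()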
learner.py ASCII text
# i dont know the math i stole it from https://docs.pytorch.org/tutorials/intermediate/reinforcement_q_learning.html

import math
import random
from collections import namedtuple, deque
from itertools import count

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

device = torch.device(
    "cuda" if torch.cuda.is_available() else
    "mps" if torch.backends.mps.is_available() else
    "cpu"
)

Transition = namedtuple('Transition', ('state', 'action', 'next_state', 'reward'))

class ReplayMemory(object):
    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, *args):
        self.memory.append(Transition(*args))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

class DQN(nn.Module):
    def __init__(self, n_observations, n_actions, layers):
        super(DQN, self).__init__()
        self.layers = []
        self.layers.append(nn.Linear(n_observations, layers[0]))
        for i, lay in enumerate(layers[:-1]):
            self.layers.append(nn.Linear(lay, layers[i+1]))
        self.layers.append(nn.Linear(layers[len(layers) - 1], n_actions))
        # interleave a ReLU after every linear layer except the last
        self.layers = nn.Sequential(*[item for it in [[l, nn.ReLU()] for l in self.layers] for item in it][:-1])

    def forward(self, x):
        return self.layers(x)

class Learner:
    def __init__(self, n_actions, n_observations, state = None, layers = [128, 128],
            BATCH_SIZE = 128,
            GAMMA = 0.99,
            EPS_START = 0.9,
            EPS_END = 0.05,
            EPS_DECAY = 1000,
            TAU = 0.005,
            LR = 1e-4
            ):
        self.BATCH_SIZE = BATCH_SIZE
        self.GAMMA = GAMMA
        self.EPS_START = EPS_START
        self.EPS_END = EPS_END
        self.EPS_DECAY = EPS_DECAY
        self.TAU = TAU
        self.LR = LR
        self.n_observations = n_observations
        self.n_actions = n_actions
        if state is not None and 'layers' in state:
            layers = state['layers']
        self.layers = layers
        self.policy_net = DQN(n_observations, n_actions, layers).to(device)
        self.steps_done = 0
        if state is not None:
            self.policy_net.load_state_dict(state['model'])
            self.steps_done = state['steps_done']
        self.target_net = DQN(n_observations, n_actions, layers).to(device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.state = None
        self.optimizer = optim.AdamW(self.policy_net.parameters(), lr=self.LR, amsgrad=True)
        self.memory = ReplayMemory(10000)

    def get_model_state(self):
        return {
            'model': self.policy_net.state_dict(),
            'steps_done': self.steps_done,
            'layers': self.layers
        }

    def select_action(self, state, final=False):
        sample = random.random()
        if final:
            eps_threshold = 0
        else:
            eps_threshold = self.EPS_END + (self.EPS_START - self.EPS_END) * \
                math.exp(-1. * self.steps_done / self.EPS_DECAY)
        self.steps_done += 1
        if sample > eps_threshold:
            with torch.no_grad():
                return self.policy_net(state).max(1).indices.view(1, 1)
        else:
            return torch.tensor([[random.randrange(0, self.n_actions)]], device=device, dtype=torch.long)

    def optimize_model(self):
        memory = self.memory
        optimizer = self.optimizer
        policy_net = self.policy_net
        if len(memory) < self.BATCH_SIZE:
            return
        transitions = memory.sample(self.BATCH_SIZE)
        batch = Transition(*zip(*transitions))
        non_final_mask = torch.tensor(tuple(map(lambda s: s is not None, batch.next_state)),
                                      device=device, dtype=torch.bool)
        non_final_next_states = torch.cat([s for s in batch.next_state if s is not None])
        state_batch = torch.cat(batch.state)
        action_batch = torch.cat(batch.action)
        reward_batch = torch.cat(batch.reward)
        state_action_values = policy_net(state_batch).gather(1, action_batch)
        next_state_values = torch.zeros(self.BATCH_SIZE, device=device)
        with torch.no_grad():
            next_state_values[non_final_mask] = self.target_net(non_final_next_states).max(1).values
        expected_state_action_values = (next_state_values * self.GAMMA) + reward_batch
        criterion = nn.SmoothL1Loss()
        loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_value_(policy_net.parameters(), 100)
        optimizer.step()

    def decide(self, state, final=False):
        state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        action = self.select_action(state, final=final)
        return action

    def learn(self, state, action, observation, reward):
        state = torch.tensor(state, dtype=torch.float32, device=device).unsqueeze(0)
        if not observation:
            next_state = None
        else:
            next_state = torch.tensor(observation, dtype=torch.float32, device=device).unsqueeze(0)
        reward = torch.tensor([reward], device=device)
        self.memory.push(state, action, next_state, reward)
        self.optimize_model()
        # soft update of the target network's weights
        target_net_state_dict = self.target_net.state_dict()
        policy_net_state_dict = self.policy_net.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*self.TAU + target_net_state_dict[key]*(1-self.TAU)
        self.target_net.load_state_dict(target_net_state_dict)
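Learner wraps the tutorial's DQN loop behind two calls: decide() takes a flat list of floats and returns the chosen action index as a 1x1 tensor, and learn() stores one transition, runs an optimisation step, and soft-updates the target network. A minimal usage sketch on a made-up two-action toy problem (the environment and the file name model-toy.pt are invented for illustration, not part of the entry):

from learner import Learner
import torch
import random

agent = Learner(n_actions=2, n_observations=4, layers=[16, 16])
state = [0.0, 1.0, 0.0, 0.0]
for step in range(20):
    act = agent.decide(state)                 # 1x1 long tensor holding the action index
    next_state = [random.random() for _ in range(4)]
    reward = 1 if int(act) == 1 else 0        # toy reward: action 1 is always better
    agent.learn(state, act, next_state, reward)
    state = next_state
torch.save(agent.get_model_state(), "model-toy.pt")   # same format the game scripts save and load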
model-ttt.pt data
ttt.py ASCII text
from learner import Learner
import torch

# it's for initial test

def checkrow(row):
    if len(row) == 0:
        return None
    if row[0] is None:
        return None
    for x in row:
        if x != row[0]:
            return None
    return row[0]

class TicTacToe:
    def __init__(self, W, H, L):
        self.W = W
        self.H = H
        self.L = L
        self.board = [None for i in range(W*H)]

    def get(self, x, y):
        return self.board[x + y*self.W]

    def state(self, pl):
        # two inputs per cell: (own mark, opponent's mark)
        return [item for it in [
            [
                1 if x == pl else 0,
                1 if x is not None and x != pl else 0
            ] for x in self.board
        ] for item in it]

    def actions(self):
        return [(x,y) if self.get(x,y) is None else None for y in range(self.H) for x in range(self.W)]

    def print(self):
        print("===")
        for y in range(self.H):
            print(" " + (' '.join([str(self.get(x,y)) if self.get(x,y) is not None else '.' for x in range(self.W)])))
        print("===")

    def play(self, x, y, pl):
        v = self.get(x, y)
        if v is not None:
            return None
        self.board[x + y*self.W] = pl
        # rows
        for x, y in [(x,y) for y in range(self.H) for x in range(self.W - self.L + 1)]:
            row = [self.get(x+d, y) for d in range(self.L)]
            row = checkrow(row)
            if row is not None:
                return (True, row)
        # cols
        for x, y in [(x,y) for y in range(self.H - self.L + 1) for x in range(self.W)]:
            col = [self.get(x, y+d) for d in range(self.L)]
            col = checkrow(col)
            if col is not None:
                return (True, col)
        # diags
        for x, y in [(x,y) for y in range(self.H - self.L + 1) for x in range(self.W - self.L + 1)]:
            dia1 = [self.get(x+d, y+d) for d in range(self.L)]
            dia2 = [self.get(x+self.L-1-d, y+d) for d in range(self.L)]
            dia1 = checkrow(dia1)
            dia2 = checkrow(dia2)
            if dia1 is not None:
                return (True, dia1)
            if dia2 is not None:
                return (True, dia2)
        if len([i for i in filter(lambda x: x is not None, self.actions())]) == 0:
            return (True, None)
        return (False, None)

W = 3
H = 3
L = 3
game = TicTacToe(W,H,L)

state = None
try:
    state = torch.load("model-ttt.pt", weights_only = True)
except:
    pass

agent = Learner(len(game.actions()), len(game.state(1)), state = state)

players = [1,2]
states = [None for pl in players]
acts = [None for pl in players]
next_states = [None for pl in players]

while True:
    for i, pl in enumerate(players):
        print()
        states[i] = game.state(pl)
        next_states[i] = None
        state = states[i]
        act = None
        actions = game.actions()
        while act is None:
            act = agent.decide(state)
            if actions[act] is None:
                print('badact', act)
                agent.learn(state, act, None, -0.5)
                act = None
        acts[i] = act
        term, winner = game.play(actions[act][0], actions[act][1], pl)
        print("state:",(term,winner))
        next_states[i] = game.state(pl)
        other_reward = 0
        if term and winner == pl:
            print()
            print("@@@@")
            print("@@@@")
            print("@@@@ WINNER :"+str(pl)+" @@@@")
            print("@@@@")
            print("@@@@")
            print()
            other_reward = -1
            print('learn', 1)
            agent.learn(states[i], acts[i], next_states[i], 1)
        elif term:
            print('learn', 0)
            agent.learn(states[i], acts[i], next_states[i], 0)
        for j, pln in filter(lambda p: p[1] != pl, enumerate(players)):
            if acts[j] is not None:
                print('learn', other_reward)
                agent.learn(states[j], acts[j], next_states[j], other_reward)
        game.print()
        if term:
            states = [None for pl in players]
            acts = [None for pl in players]
            next_states = [None for pl in players]
            game = TicTacToe(W,H,L)
            torch.save(agent.get_model_state(), "model-ttt.pt")
            break