#lang racket
(require plot pict)
;; Draw a random label from a discrete distribution: walk the cumulative
;; probabilities and escape with the first label whose bucket contains the
;; sampled point.
(define (random-distribution labels probs)
  (let/ec return
    (for/fold ([point (random)]) ([label labels] [prob probs])
      (if (<= point prob)
          (return label)
          (- point prob)))))
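;; Illustrative sanity checks (a minimal sketch, not used by the rest of the
;; file): sampling always returns one of the supplied labels, and a label
;; carrying all of the probability mass is always chosen.
(module+ test
  (require rackunit)
  (check-not-false (memq (random-distribution '(a b c) '(0.2 0.3 0.5)) '(a b c)))
  (check-equal? (random-distribution '(x y) '(1.0 0.0)) 'x))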
;; Shift a score vector so its maximum entry is 0; this keeps exp from
;; overflowing when scores are turned into weights.
(define (threshold-vector! v)
  (define threshold (apply max (vector->list v)))
  (for ([x v] [i (in-naturals)]) (vector-set! v i (- x threshold))))
;; Sum of exp over a vector of log-weights.
(define (vector-sum-lg v)
  (for/fold ([sum 0]) ([x v]) (+ sum (exp x))))
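;; Illustrative check of the two helpers above: after thresholding the
;; maximum entry is 0, and vector-sum-lg sums the exponentials of the
;; shifted entries.
(module+ test
  (require rackunit)
  (define test-vec (vector 1.0 3.0 2.0))
  (threshold-vector! test-vec)
  (check-equal? test-vec (vector -2.0 0.0 -1.0))
  (check-= (vector-sum-lg test-vec) (+ (exp -2.0) 1.0 (exp -1.0)) 1e-9))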
;;; AGENTS AND BANDITS
;; We implement both regular *and* blind no-regret dynamics.
;; Agents use regular no-regret dynamics, so they choose an action,
;; observe what would have happened had they chosen differently, and
;; adapt over time.
(define *rate* 0.1)
(struct agent (actions utilities scores))
(define/contract (make-agent actions utility)
  (-> (listof symbol?) (-> (listof symbol?) number?) agent?)
  (agent actions utility (for/vector ([action actions]) 0.0)))
;; Softmax over the agent's scores.
(define (agent-probabilities ag)
  (match-define (agent actions _ scores) ag)
  (define sum (vector-sum-lg scores))
  (for/vector ([score scores]) (/ (exp score) sum)))
(define (agent-choose ag)
  (match-define (agent actions _ scores) ag)
  (random-distribution actions (agent-probabilities ag)))
(define (agent-update! ag rate i done)
  (match-define (agent actions utility scores) ag)
  ;; Counterfactual utilities: what player i would have earned had they
  ;; played each of their actions, holding everyone else's choices fixed.
  (define utilities
    (for/vector ([action actions])
      (utility (list-set done i action))))
  (for ([score scores] [delta utilities] [j (in-naturals)])
    (vector-set! scores j (+ score (* rate delta))))
  (threshold-vector! scores))
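;; Illustrative example of the agent update (a toy sketch; the utility here
;; is a stand-in, not one of the game utilities defined below): a single
;; player rewarded only for playing 'C should end up putting nearly all of
;; its probability on 'C.
(module+ test
  (require rackunit)
  (define toy-agent
    (make-agent '(C D) (λ (done) (if (equal? (first done) 'C) 1.0 0.0))))
  (check-not-false (memq (agent-choose toy-agent) '(C D)))
  (for ([_ (in-range 100)])
    (agent-update! toy-agent *rate* 0 '(D)))
  (check-true (> (vector-ref (agent-probabilities toy-agent) 0) 0.9)))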
;; Bandits use blind no-regret dynamics, so they choose an action,
;; observe only the reward for that action, and adapt over time. The
;; algorithm is from "Prediction, Learning, and Games" by Nicolò
;; Cesa-Bianchi and Gábor Lugosi.
(struct bandit (beta eta gamma labels weights probs choice) #:mutable)
(define (make-bandit n labels)
  (define N (length labels))
  (define delta (max (/ N (exp (/ n (* 2 N)))) 0.05))
  (define beta (sqrt (* (/ 1. (* n N)) (log (/ N delta)))))
  (define gamma (/ (* 4 N beta) (+ 3 beta)))
  (define eta (/ gamma (* 2 N)))
  (eprintf "δ = ~a, β = ~a, γ = ~a, η = ~a\n" delta beta gamma eta)
  ;; Constructor arguments follow the struct's field order: beta, eta, gamma.
  ;; Weights are kept in log space; the probabilities start out uniform.
  (bandit beta eta gamma labels
          (build-vector N (const 1)) (build-vector N (const (/ 1. N))) #f))
(define (bandit-choose bandit)
  (define choice
    (random-distribution (bandit-labels bandit) (bandit-probs bandit)))
  (set-bandit-choice! bandit choice)
  choice)
(define (bandit-update! b reward)
  (match-define (bandit beta eta gamma labels weights probs choice) b)
  ;; Importance-weighted gain estimate: the chosen arm gets (reward + β)/p,
  ;; every other arm gets β/p.
  (for ([label labels] [prob probs] [weight weights] [i (in-naturals)])
    (define gain
      (if (equal? label choice) (/ (+ reward beta) prob) (/ beta prob)))
    (vector-set! weights i (+ weight (* eta gain))))
  (threshold-vector! weights)
  ;; New probabilities: a (1 - γ) share of the exponential weights plus a
  ;; γ share of the uniform distribution.
  (define sum (vector-sum-lg weights))
  (for ([w weights] [i (in-naturals)])
    (vector-set! probs i
                 (+ (* (- 1 gamma) (/ (exp w) sum)) (/ gamma (length labels))))))
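;; Illustrative example of the bandit on a fixed toy problem: one arm always
;; pays 1, the other always pays 0. After enough rounds the bandit should put
;; most of its probability on the paying arm; the check is deliberately loose
;; because the dynamics are randomized.
(module+ test
  (require rackunit)
  (define trials 1000)
  (define toy-bandit (make-bandit trials '(good bad)))
  (for ([_ (in-range trials)])
    (define arm (bandit-choose toy-bandit))
    (bandit-update! toy-bandit (if (equal? arm 'good) 1.0 0.0)))
  (check-true (> (vector-ref (bandit-probs toy-bandit) 0) 0.5)))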
;;; Games store per-player action lists and per-player utility functions
(struct game (actions utilities))
;; make-game turns a joint utility (choices -> list of payoffs) into one
;; utility per player, each mapping the full list of choices to that
;; player's own payoff.
(define (make-game actions utility)
  (game
   actions
   (for/list ([action actions] [i (in-naturals)])
     (compose (curryr list-ref i) (curry apply utility)))))
(define pd-actions '((C D) (C D)))
(define/match (pd-utility a b)
  [('C 'C) '(5 5)]
  [('D 'C) '(6 0)]
  [('C 'D) '(0 6)]
  [('D 'D) '(1 1)])
(define pd-game (make-game pd-actions pd-utility))
(define/match (pd-utility2 a b)
  [('C 'C) '(5 5)]
  [('D 'C) '(2 0)]
  [('C 'D) '(0 2)]
  [('D 'D) '(1 1)])
(define pd-game2 (make-game pd-actions pd-utility2))
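;; Illustrative check of the per-player utilities built by make-game: each
;; one takes the full list of choices and projects out that player's payoff.
(module+ test
  (require rackunit)
  (define u1 (first (game-utilities pd-game)))
  (define u2 (second (game-utilities pd-game)))
  (check-equal? (u1 '(D C)) 6)
  (check-equal? (u2 '(D C)) 0)
  (check-equal? (u1 '(C C)) 5))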
;;; GAMES, NORMAL AND MORAL
;; Normal games are run by instantiating one bandit per player to choose
;; actions directly. These then converge to an equilibrium of some sort
;; (for no-regret play, a coarse correlated equilibrium).
(define (run-game gm #:steps [steps 250])
  (match-define (game actions utilities) gm)
  (define bandits (map (curry make-bandit steps) actions))
  (for/list ([step (in-range steps)])
    (define choices (map bandit-choose bandits))
    (define rewards (map (λ (f) (f choices)) utilities))
    (for-each bandit-update! bandits rewards)
    ;; Wrap each choice in a list so the per-step shape matches the
    ;; (choice index) pairs produced by run-moral-game below.
    (map list choices)))
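;; Example run (an illustrative sketch, not required by the rest of the
;; file): play the Prisoner's Dilemma and tally the joint actions over the
;; last 100 rounds. Since defection strictly dominates in pd-utility, the
;; bandits should spend most of the late rounds on (D D).
(module+ main
  (define history (run-game pd-game #:steps 500))
  (define counts
    (for/fold ([h (hash)]) ([step (list-tail history 400)])
      (hash-update h (map first step) add1 0)))
  (printf "pd-game, last 100 rounds: ~a\n" counts))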
;; A moral game has multiple plausible utilities, including a
;; distinguished "original" utility. It instantiates several agents,
;; one per plausible utility, for each player, and also a bandit to
;; choose among these players.
;; Alternative per-player utilities for the moral Prisoner's Dilemma: one
;; extra pair of utilities, taken from pd-utility2.
(define pd-util+
  (list (list (curry apply (compose first pd-utility2))
              (curry apply (compose second pd-utility2)))))
(define (run-moral-game gm utils+ #:steps [steps 250])
  (match-define (game actions utilities) gm)
  (define agents ; One agent per player per plausible utility function
    (map (curry map make-agent actions)
         (apply map list (cons utilities utils+))))
  (define bandits (map (curry make-bandit steps) agents))
  (for/list ([step (in-range steps)])
    (define subagents (map bandit-choose bandits))
    (define idxs
      (for/list ([bandit bandits] [subagent subagents])
        (index-of (bandit-labels bandit) subagent)))
    (define choices (map agent-choose subagents))
    (define rewards (map (λ (f) (f choices)) utilities))
    (for-each bandit-update! bandits rewards)
    ;; Update each chosen sub-agent with an importance-weighted learning
    ;; rate, since it only gets to act when its bandit selects it.
    (for ([bandit bandits] [subagent subagents] [i (in-naturals)])
      (define prob
        (vector-ref (bandit-probs bandit)
                    (index-of (bandit-labels bandit) subagent)))
      (agent-update! subagent (/ *rate* prob) i choices))
    (map list choices idxs)))
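;; Example run (illustrative): the moral Prisoner's Dilemma, where each
;; player's bandit picks between a sub-agent using the original payoffs
;; (index 0) and one using pd-utility2 (index 1). Joint actions over the
;; last 100 rounds are tallied the same way as above.
(module+ main
  (define moral-history (run-moral-game pd-game pd-util+ #:steps 500))
  (define moral-counts
    (for/fold ([h (hash)]) ([step (list-tail moral-history 400)])
      (hash-update h (map first step) add1 0)))
  (printf "moral pd-game, last 100 rounds: ~a\n" moral-counts))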
;; This simple function plots the behavior of the agents
(define (plot-results gm dones tmpl)
  (define actionss
    (for/list ([_ (first dones)] [i (in-naturals)])
      (sort
       (remove-duplicates (append-map (curryr list-ref i) dones))
       string