#lang racket
(require plot pict)

;; Sample a label from the discrete distribution given by `probs`. An
;; escape continuation exits the fold as soon as the accumulated mass
;; covers the sample point; the trailing `last` guards against
;; floating-point round-off leaving a tiny uncovered remainder.
(define (random-distribution labels probs)
  (let/ec return
    (for/fold ([point (random)]) ([label labels] [prob probs])
      (if (<= point prob) (return label) (- point prob)))
    (last labels)))

;; Shift a score vector so its maximum is 0. Scores live in log-space,
;; so this keeps `exp` from overflowing without changing the induced
;; softmax distribution.
(define (threshold-vector! v)
  (define threshold (apply max (vector->list v)))
  (for ([x (in-vector v)] [i (in-naturals)])
    (vector-set! v i (- x threshold))))

;; Sum of exponentials of a log-space vector: the softmax normalizer.
(define (vector-sum-lg v)
  (for/fold ([sum 0]) ([x (in-vector v)])
    (+ sum (exp x))))

;;; AGENTS AND BANDITS

;; We implement both regular *and* blind no-regret dynamics.

;; Agents use regular no-regret dynamics: they choose an action,
;; observe what would have happened had they chosen differently, and
;; adapt over time.

(define *rate* 0.1)

(struct agent (actions utility scores))

(define/contract (make-agent actions utility)
  (-> (listof symbol?) (-> (listof symbol?) number?) agent?)
  (agent actions utility (for/vector ([action actions]) 0.0)))

;; Softmax (multiplicative-weights) distribution over the agent's
;; actions, computed from its log-space scores.
(define (agent-probabilities ag)
  (match-define (agent _ _ scores) ag)
  (define sum (vector-sum-lg scores))
  (for/vector ([score (in-vector scores)]) (/ (exp score) sum)))

(define (agent-choose ag)
  (match-define (agent actions _ _) ag)
  (random-distribution actions (agent-probabilities ag)))

;; Full-information update: score every action by the utility the agent
;; *would have* received had it played that action in slot `i` of the
;; joint action profile `done`.
(define (agent-update! ag rate i done)
  (match-define (agent actions utility scores) ag)
  (define utilities
    (for/vector ([action actions]) (utility (list-set done i action))))
  (for ([score (in-vector scores)] [delta (in-vector utilities)]
        [j (in-naturals)])
    (vector-set! scores j (+ score (* rate delta))))
  (threshold-vector! scores))

;; Bandits use blind no-regret dynamics: they choose an action, observe
;; only the reward for that action, and adapt over time. The algorithm
;; is the Exp3.P forecaster from "Prediction, Learning, and Games" by
;; Nicolo Cesa-Bianchi and Gabor Lugosi.

(struct bandit (beta eta gamma labels weights probs choice) #:mutable)

;; `n` is the expected number of rounds, used to tune the parameters.
(define (make-bandit n labels)
  (define N (length labels))
  (define delta (max (/ N (exp (/ n (* 2 N)))) 0.05))
  (define beta (sqrt (* (/ 1. (* n N)) (log (/ N delta)))))
  (define gamma (/ (* 4 N beta) (+ 3 beta)))
  (define eta (/ gamma (* 2 N)))
  (eprintf "δ = ~a, β = ~a, γ = ~a, η = ~a\n" delta beta gamma eta)
  (bandit beta eta gamma labels
          (build-vector N (const 1))
          (build-vector N (const (/ 1. N)))
          #f))

(define (bandit-choose bandit)
  (define choice
    (random-distribution (bandit-labels bandit) (bandit-probs bandit)))
  (set-bandit-choice! bandit choice)
  choice)

;; Exp3.P update. The (reward + β)/p estimator is an optimistic
;; importance-weighted gain; weights live in log-space, so we add
;; η·gain rather than multiplying by exp(η·gain), then rescale with
;; threshold-vector! to avoid overflow. Probabilities mix the softmax
;; of the weights with γ of uniform exploration.
(define (bandit-update! b reward)
  (match-define (bandit beta eta gamma labels weights probs choice) b)
  (for ([label labels] [prob (in-vector probs)]
        [weight (in-vector weights)] [i (in-naturals)])
    (define gain
      (if (equal? label choice)
          (/ (+ reward beta) prob)
          (/ beta prob)))
    (vector-set! weights i (+ weight (* eta gain))))
  (threshold-vector! weights)
  (define sum (vector-sum-lg weights))
  (for ([w (in-vector weights)] [i (in-naturals)])
    (vector-set! probs i
                 (+ (* (- 1 gamma) (/ (exp w) sum))
                    (/ gamma (length labels))))))
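;; A quick smoke test of the bandit machinery. This block is a sketch,
;; not part of the original file: the two-armed Bernoulli reward scheme
;; below is a made-up example. After enough rounds the bandit should
;; put most of its probability mass on the better arm.
(module+ test
  (define b (make-bandit 2000 '(left right)))
  (for ([_ (in-range 2000)])
    (define arm (bandit-choose b))
    ;; 'right pays off with probability 0.9, 'left with probability 0.1.
    (define p (if (equal? arm 'right) 0.9 0.1))
    (bandit-update! b (if (< (random) p) 1.0 0.0)))
  (eprintf "smoke-test probabilities: ~a\n" (bandit-probs b)))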
;;; Games store a set of actions and per-player utilities

(struct game (actions utilities))

;; `utility` maps a joint action profile to a list of payoffs, one per
;; player; make-game splits it into one scalar utility per player.
(define (make-game actions utility)
  (game actions
        (for/list ([action actions] [i (in-naturals)])
          (compose (curryr list-ref i) (curry apply utility)))))

(define pd-actions '((C D) (C D)))

;; The classic prisoner's dilemma: defection dominates, but mutual
;; cooperation pays more than mutual defection.
(define/match (pd-utility a b)
  [('C 'C) '(5 5)]
  [('D 'C) '(6 0)]
  [('C 'D) '(0 6)]
  [('D 'D) '(1 1)])

(define pd-game (make-game pd-actions pd-utility))

;; A variant with a much weaker temptation to defect.
(define/match (pd-utility2 a b)
  [('C 'C) '(5 5)]
  [('D 'C) '(2 0)]
  [('C 'D) '(0 2)]
  [('D 'D) '(1 1)])

(define pd-game2 (make-game pd-actions pd-utility2))

;;; GAMES, NORMAL AND MORAL

;; Normal games are run by instantiating bandits to choose actions
;; directly. The empirical distribution of no-regret play converges to
;; the set of coarse correlated equilibria.

(define (run-game gm #:steps [steps 250])
  (match-define (game actions utilities) gm)
  (define bandits (map (curry make-bandit steps) actions))
  (for/list ([step (in-range steps)])
    (define choices (map bandit-choose bandits))
    (define rewards (map (λ (f) (f choices)) utilities))
    (for-each bandit-update! bandits rewards)
    ;; Wrap each choice in a singleton list to match the
    ;; (choice index) records produced by run-moral-game.
    (map list choices)))

;; A moral game has multiple plausible utilities, including a
;; distinguished "original" utility. It instantiates several agents per
;; player, one per plausible utility, and also a bandit per player to
;; choose among those agents.

;; One alternative utility profile for the prisoner's dilemma: both
;; players judged by pd-utility2's payoffs.
(define pd-util+
  (list (list (curry apply (compose first pd-utility2))
              (curry apply (compose second pd-utility2)))))

(define (run-moral-game gm utils+ #:steps [steps 250])
  (match-define (game actions utilities) gm)
  ;; One agent per plausible utility function, per player: transpose
  ;; the utility profiles so each player gets its own column, then pair
  ;; every utility in that column with the player's own actions.
  (define agents
    (for/list ([acts actions]
               [utils (apply map list (cons utilities utils+))])
      (map (curry make-agent acts) utils)))
  (define bandits (map (curry make-bandit steps) agents))
  (for/list ([step (in-range steps)])
    (define subagents (map bandit-choose bandits))
    (define idxs
      (for/list ([bandit bandits] [subagent subagents])
        (index-of (bandit-labels bandit) subagent)))
    (define choices (map agent-choose subagents))
    (define rewards (map (λ (f) (f choices)) utilities))
    (for-each bandit-update! bandits rewards)
    (for ([bandit bandits] [subagent subagents] [i (in-naturals)])
      ;; Importance-weight the agent's learning rate by the probability
      ;; that its bandit actually selected it this round.
      (define prob
        (vector-ref (bandit-probs bandit)
                    (index-of (bandit-labels bandit) subagent)))
      (agent-update! subagent (/ *rate* prob) i choices))
    (map list choices idxs)))

;; This simple function plots the behavior of the agents.
;; NOTE: the original source is truncated mid-definition here. The sort
;; comparator (which began "string") is reconstructed as string<? over
;; printed keys, and everything after `actionss` is lost; `gm` and
;; `tmpl` were presumably consumed by the missing plotting code.
(define (plot-results gm dones tmpl)
  (define actionss
    (for/list ([_ (first dones)] [i (in-naturals)])
      (sort (remove-duplicates (append-map (curryr list-ref i) dones))
            string<? #:key ~a)))
  actionss)
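;; Example usage (a sketch, not from the original file): run the plain
;; prisoner's dilemma and tally the joint action profiles. Since
;; defection strictly dominates, no-regret play should concentrate the
;; tally on (D D).
(module+ main
  (define history (run-game pd-game #:steps 250))
  (define tally
    (for/fold ([h (hash)]) ([done history])
      (hash-update h (map first done) add1 0)))
  (for ([(profile count) (in-hash tally)])
    (printf "~a: ~a\n" profile count))
  ;; The moral variant: each player also entertains the pd-utility2
  ;; payoffs, with a bandit arbitrating between its two "selves".
  (void (run-moral-game pd-game pd-util+ #:steps 250)))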