mratsim · mratsim · Dec 15, 2019 · Dec 15, 2019 · Dec 15, 2019 · Dec 15, 2019
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,73 @@
+# Travis config for Weave
+language: c
+
+cache:
+  directories:
+    - nim-devel
+    - nim-stable
+
+matrix:
+  include:
+    # Weave only works with Nim devel
+    # Build and test using both gcc and clang
+    # Build and test on both x86-64 and ARM64
+    - os: linux
+      env: CHANNEL=devel
+      compiler: gcc
+
+    - os: linux
+      arch: arm64
+      env: CHANNEL=devel
+      compiler: gcc
+
+    - os: linux
+      env: CHANNEL=devel
+      compiler: clang
+
+    # On OSX we only test against clang (gcc is mapped to clang by default)
+    - os: osx
+      env: CHANNEL=devel
+      compiler: clang
+  fast_finish: true
+
+before_install:
+  - |
+    if [ "${CHANNEL}" = stable ]; then
+      BRANCH="v$(curl https://nim-lang.org/channels/stable)"
+    else
+      BRANCH="${CHANNEL}"
+    fi
+
+install:
+  # Detect caching of Nim compiler
+  - |
+    if [ ! -x "nim-${CHANNEL}/bin/nim" ]; then
+      git clone -b "${BRANCH}" https://github.com/nim-lang/nim "nim-${CHANNEL}/"
+      pushd "nim-${CHANNEL}"
+      git clone --depth 1 https://github.com/nim-lang/csources csources/
+      pushd csources
+      sh build.sh
+      popd
+      rm -rf csources
+      bin/nim c koch
+      ./koch boot -d:release
+      ./koch tools
+    else
+      pushd "nim-${CHANNEL}"
+      git fetch origin "${BRANCH}"
+      if ! git merge FETCH_HEAD | grep "Already up.to.date"; then
+        bin/nim c koch
+        ./koch boot -d:release
+        ./koch tools
+      fi
+    fi
+    popd
+before_script:
+    - export PATH="nim-${CHANNEL}/bin${PATH:+:$PATH}"
+script:
+    - nimble refresh
+    - nimble install cligen
+    - nimble test
+branches:
+  except:
+    - gh-pages
diff --git a/LICENSE-APACHEv2 b/LICENSE-APACHEv2
@@ -1,4 +1,4 @@
-Picasso is licensed under the Apache License version 2
+Weave is licensed under the Apache License version 2
 Copyright (c) 2019 Mamy André-Ratsimbazafy
 ------------------------------------------------------
 

diff --git a/LICENSE-MIT b/LICENSE-MIT
@@ -1,4 +1,4 @@
-Picasso is licensed under the MIT License
+Weave is licensed under the MIT License
 Copyright (c) 2019 Mamy André-Ratsimbazafy
 -----------------------------------------------------
 

diff --git a/README.md b/README.md
@@ -1,13 +1,19 @@
 # Weave, a state-of-the-art multithreading runtime
+[![Build Status: Travis](https://img.shields.io/travis/com/mratsim/weave?label=Travis%20%28Linux%2FMac%20-%20x86_64%2FARM64%29)](https://travis-ci.com/mratsim/weave)
+[![License: Apache](https://img.shields.io/badge/License-Apache%202.0-blue.svg)](https://opensource.org/licenses/Apache-2.0)
+[![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](https://opensource.org/licenses/MIT)
+![Stability: experimental](https://img.shields.io/badge/stability-experimental-orange.svg)
 
 _"Good artists borrow, great artists steal."_ -- Pablo Picasso
 
-Weave (codenamed "Project Picasso") is a multithreading runtime for the [Nim programming language](https://nim-lang.org/)
+Weave (codenamed "Project Picasso") is a multithreading runtime for the [Nim programming language](https://nim-lang.org/).
+
+⚠️ At the moment, Weave only works on Linux and MacOS. The only missing part for Windows
+is wrapping [Synchronization Barriers](https://docs.microsoft.com/en-us/windows/win32/sync/synchronization-barriers) (which is much better than MacOS where you have to write the barrier from scratch).
 
 > ⚠️ Disclaimer:
 >
-> Weave currently has no test suite or continuous integration.
-> In particular, the synchronization primitives were not formally verified
+> The synchronization primitives were not formally verified
 > or model-checked to prove the absence of data races or deadlock/livelock,
 > nor were they passed under a data race detection tool.
 >
@@ -18,6 +24,201 @@ Weave (codenamed "Project Picasso") is a multithreading runtime for the [Nim pro
 > Weave does limit synchronization to only simple SPSC and MPSC channels which greatly reduces
 > the potential bug surface.
 
+## API
+
+### Task parallelism
+
+Weave provides a simple API based on spawn/sync which works like async/await for IO-based futures.
+
+The traditional parallel recursive fibonacci would be written like this:
+```Nim
+import weave
+
+proc fib(n: int): int =
+  # int64 on x86-64
+  if n < 2:
+    return n
+
+  let x = spawn fib(n-1)
+  let y = fib(n-2)
+
+  result = sync(x) + y
+
+proc main() =
+  var n = 20
+
+  init(Weave)
+  let f = fib(n)
+  exit(Weave)
+
+  echo f
+
+main()
+```
+
+### Data parallelism
+
+Weave provides nestable parallel for loops.
+
+A nested matrix transposition would be written like this:
+
+```Nim
+import weave
+
+func initialize(buffer: ptr UncheckedArray[float32], len: int) =
+  for i in 0 ..< len:
+    buffer[i] = i.float32
+
+proc transpose(M, N: int, bufIn, bufOut: ptr UncheckedArray[float32]) =
+  ## Transpose a MxN matrix into a NxM matrix with nested for loops
+
+  parallelFor j in 0 ..< N:
+    captures: {M, N, bufIn, bufOut}
+    parallelFor i in 0 ..< M:
+      captures: {j, M, N, bufIn, bufOut}
+      bufOut[j*M+i] = bufIn[i*N+j]
+
+proc main() =
+  ## Transposes an MxN matrix of float32 using the nested `parallelFor`
+  ## loops defined in `transpose`.
+  let M = 200
+  let N = 2000
+
+  let input = newSeq[float32](M*N)
+  # We can't work with seq directly as it's managed by GC, take a ptr to the buffer.
+  let bufIn = cast[ptr UncheckedArray[float32]](input[0].unsafeAddr)
+  bufIn.initialize(M*N)
+
+  var output = newSeq[float32](N*M)
+  let bufOut = cast[ptr UncheckedArray[float32]](output[0].addr)
+
+  # The runtime must be started before any parallel construct is used
+  # and stopped once the work is done.
+  init(Weave)
+  transpose(M, N, bufIn, bufOut)
+  exit(Weave)
+
+main()
+```
+
+### Strided loops
+
+You might want to use loops with a non-unit stride. This can be done with the following syntax.
+
+```Nim
+init(Weave)
+
+# expandMacros:
+parallelForStrided i in 0 ..< 100, stride = 30:
+  parallelForStrided j in 0 ..< 200, stride = 60:
+    captures: {i}
+    log("Matrix[%d, %d] (thread %d)\n", i, j, myID())
+
+exit(Weave)
+```
+
+### Complete list
+
+- `init(Weave)`, `exit(Weave)` to start and stop the runtime. Forgetting this will give you nil pointer exceptions on spawn.
+- `spawn fnCall(args)` which spawns a function that may run on another thread and gives you an awaitable Flowvar handle.
+- `sync(Flowvar)` will await a Flowvar and block until you receive a result.
+- `sync(Weave)` is a global barrier for the main thread on the main task. Allowing nestable barriers for any thread is work-in-progress.
+- `parallelFor`, `parallelForStrided`, `parallelForStaged`, `parallelForStagedStrided` are described above and in the experimental section.
+- `loadBalance(Weave)` gives the runtime the opportunity to distribute work. Insert this within long computations: due to Weave's design, it is the busy workers that are also in charge of load balancing. This is done automatically when using `parallelFor`.
+- `isSpawned` allows you to build speculative algorithms where a thread is spawned only if certain conditions are valid. See the `nqueens` benchmark for an example.
+- `getThreadId` returns a unique thread ID. The thread ID is in the range 0 ..< number of threads.
+
+The max number of threads can be configured by the environment variable WEAVE_NUM_THREADS
+and defaults to your number of logical cores (including HyperThreading).
+Weave uses Nim's `countProcessors()` in `std/cpuinfo`.
+
+## Table of Contents
+
+- [Weave, a state-of-the-art multithreading runtime](#weave-a-state-of-the-art-multithreading-runtime)
+  - [API](#api)
+    - [Task parallelism](#task-parallelism)
+    - [Data parallelism](#data-parallelism)
+    - [Strided loops](#strided-loops)
+    - [Complete list](#complete-list)
+  - [Table of Contents](#table-of-contents)
+  - [Experimental features](#experimental-features)
+    - [Parallel For Staged](#parallel-for-staged)
+    - [Lazy Allocation of Flowvars](#lazy-allocation-of-flowvars)
+    - [Backoff mechanism](#backoff-mechanism)
+  - [Limitations](#limitations)
+  - [Statistics](#statistics)
+  - [Tuning](#tuning)
+  - [Unique features](#unique-features)
+  - [Research](#research)
+  - [License](#license)
+
+## Experimental features
+
+### Parallel For Staged
+
+Weave provides a `parallelForStaged` construct with support for thread-local prologue and epilogue.
+
+A parallel sum would look like this:
+```Nim
+proc sumReduce(n: int): int =
+  let res = result.addr
+  parallelForStaged i in 0 .. n:
+    captures: {res}
+    prologue:
+      var localSum = 0
+    loop:
+      localSum += i
+    epilogue:
+      echo "Thread ", getThreadID(Weave), ": localsum = ", localSum
+      res[].atomicInc(localSum)
+
+  sync(Weave)
+
+init(Weave)
+let sum1M = sumReduce(1000000)
+echo "Sum reduce(0..1000000): ", sum1M
+doAssert sum1M == 500_000_500_000
+exit(Weave)
+```
+
+### Lazy Allocation of Flowvars
+
+Flowvars can be lazily allocated; this reduces overhead by at least 2x on very fine-grained tasks like Fibonacci or Depth-First-Search that may spawn trillions of tasks in less than
+a couple hundred milliseconds. This can be enabled with `-d:WV_LazyFlowvar`.
+
+⚠️ This only works for Flowvar of a size up to your machine word size (int64, float64, pointer on 64-bit machines)
+
+### Backoff mechanism
+
+A backoff mechanism is available for preview; it allows workers with no tasks to sleep instead of spinning aimlessly and burning CPU.
+
+This can be enabled with `-d:WV_EnableBackoff=on`.
+It will become the default in the future.
+
+⚠️ The backoff mechanism is currently prone to deadlocks where a worker sleeps
+and never replies anymore leaving the other workers hanging.
+
+## Limitations
+
+Weave cannot work with GC-ed types. Pass a pointer around or use Nim channels which are GC-aware.
+This might improve with Nim ARC/newruntime.
+
+## Statistics
+
+Curious minds can access the low-level runtime statistics with the flag `-d:WV_metrics`
+which will give you the information on number of tasks executed, steal requests sent, etc.
+
+Very curious minds can also enable high resolution timers with `-d:WV_metrics -d:WV_profile -d:CpuFreqMhz=3000` assuming you have a 3GHz CPU.
+
+The timers will give you in this order:
+```
+Time spent running tasks, Time spent recv/send steal requests, Time spent recv/send tasks, Time spent caching tasks, Time spent idle, Total
+```
+
+## Tuning
+
+A number of configuration options are available in [weave/config.nim](weave/config.nim).
+
+In particular:
+- `-d:WV_StealAdaptativeInterval=25` defines the number of steal requests after which thieves reevaluate their steal strategy (steal one task or steal half the victim's tasks). Default: 25
+- `-d:WV_StealEarly=0` allows workers to steal early, when only `WV_StealEarly` tasks are left in their queue. Default: don't steal early
+
+## Unique features
+
 Weave provides an unique scheduler with the following properties:
 - Message-Passing based:
   unlike alternative work-stealing schedulers, this means that Weave is usable
@@ -58,9 +259,19 @@ Weave provides an unique scheduler with the following properties:
 The "Project Picasso" RFC is available for discussion in [Nim RFC #160](https://github.com/nim-lang/RFCs/issues/160)
 or in the (potentially outdated) [picasso_RFC.md](Weave_RFC.md) file
 
+## Research
+
 Weave is based on the research by [Andreas Prell](https://github.com/aprell/).
 You can read his [PhD Thesis](https://epub.uni-bayreuth.de/2990) or access his [C implementation](https://github.com/aprell/tasking-2.0).
 
+Several enhancements were built into Weave, in particular:
+
+- Memory management was carefully studied to allow releasing memory to the OS
+  while still providing very high performance and solving the decades old cactus stack problem.
+  The solution, coupling a threadsafe memory pool with a lookaside buffer, is
+  inspired by Microsoft's Mimalloc and Snmalloc, a message-passing based allocator (also by Microsoft). Details are provided in the multiple Markdown files in the [memory folder](weave/memory).
+- The channels were reworked to not use locks. In particular the MPSC channel (Multi-Producer Single-Consumer) supports batching for both producers and consumers without any lock.
+
 ## License
 
 Licensed and distributed under either of

diff --git a/benchmarks/matrix_transposition/weave_transposes.nim b/benchmarks/matrix_transposition/weave_transposes.nim
@@ -250,7 +250,7 @@ template runBench(transposeName: typed, reorderCompute, isSequential: bool): unt
 # Interface
 # ---------------------------------------------------
 
-proc main(M = 400, N = 4000, nrounds = 10000, transposeStrat = TiledNested, reorderCompute=false) =
+proc main(M = 400, N = 4000, nrounds = 1000, transposeStrat = TiledNested, reorderCompute=false) =
   echo "Inverting the transpose order may favor one transposition heavily for non-tiled strategies"
 
   let isSequential = transposeStrat == Sequential