-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsearch.html
More file actions
373 lines (343 loc) · 250 KB
/
search.html
File metadata and controls
373 lines (343 loc) · 250 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="description" content="thoughts, musings, writings">
<meta name="author" content="Bharadwaj">
<title>Search - Bharadwaj's Blog</title>
<!-- RSS Feed -->
<link rel="alternate" type="application/rss+xml" href="https://bharath12345.github.io/feeds/all.atom.xml" title="Bharadwaj's Blog RSS Feed">
<!-- Bootstrap 5 CSS -->
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
<!-- Google Fonts -->
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600&display=swap" rel="stylesheet">
<!-- Custom CSS -->
<link rel="stylesheet" href="https://bharath12345.github.io/theme/css/custom.css">
<!-- Favicon -->
<link rel="shortcut icon" href="https://bharath12345.github.io/images/favicon.ico">
</head>
<body>
<!-- Navigation -->
<nav class="navbar navbar-expand-lg navbar-dark fixed-top" style="background-color: #483d8b;">
<div class="container-fluid">
<a class="navbar-brand" href="https://bharath12345.github.io/">Bharadwaj's Blog</a>
<button class="navbar-toggler" type="button" data-bs-toggle="collapse" data-bs-target="#navbarNav">
<span class="navbar-toggler-icon"></span>
</button>
<div class="collapse navbar-collapse" id="navbarNav">
<ul class="navbar-nav me-auto">
<li class="nav-item">
<a class="nav-link" href="https://bharath12345.github.io/categories.html">Categories</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://bharath12345.github.io/pages/toc/">Index</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://bharath12345.github.io/search.html">Search</a>
</li>
</ul>
<ul class="navbar-nav ms-auto">
<li class="nav-item">
<a class="nav-link" href="https://bharath12345.github.io/pages/about/movies/" title="About me">
<svg width="16" height="16" fill="currentColor" viewBox="0 0 16 16" style="vertical-align: -0.125em;">
<path d="M8 8a3 3 0 1 0 0-6 3 3 0 0 0 0 6zm2-3a2 2 0 1 1-4 0 2 2 0 0 1 4 0zm4 8c0 1-1 1-1 1H3s-1 0-1-1 1-4 6-4 6 3 6 4zm-1-.004c-.001-.246-.154-.986-.832-1.664C11.516 10.68 10.289 10 8 10c-2.29 0-3.516.68-4.168 1.332-.678.678-.83 1.418-.832 1.664h10z"/>
</svg>
<span class="d-lg-none">About me</span>
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="https://github.com/bharath12345" title="Projects">
<svg width="16" height="16" fill="currentColor" viewBox="0 0 16 16" style="vertical-align: -0.125em;">
<path d="M8 0C3.58 0 0 3.58 0 8c0 3.54 2.29 6.53 5.47 7.59.4.07.55-.17.55-.38 0-.19-.01-.82-.01-1.49-2.01.37-2.53-.49-2.69-.94-.09-.23-.48-.94-.82-1.13-.28-.15-.68-.52-.01-.53.63-.01 1.08.58 1.23.82.72 1.21 1.87.87 2.33.66.07-.52.28-.87.51-1.07-1.78-.2-3.64-.89-3.64-3.95 0-.87.31-1.59.82-2.15-.08-.2-.36-1.02.08-2.12 0 0 .67-.21 2.2.82.64-.18 1.32-.27 2-.27.68 0 1.36.09 2 .27 1.53-1.04 2.2-.82 2.2-.82.44 1.1.16 1.92.08 2.12.51.56.82 1.27.82 2.15 0 3.07-1.87 3.75-3.65 3.95.29.25.54.73.54 1.48 0 1.07-.01 1.93-.01 2.2 0 .21.15.46.55.38A8.012 8.012 0 0 0 16 8c0-4.42-3.58-8-8-8z"/>
</svg>
<span class="d-lg-none">GitHub Projects</span>
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="http://www.linkedin.com/in/bharadwajn" title="CV">
<svg width="16" height="16" fill="currentColor" viewBox="0 0 16 16" style="vertical-align: -0.125em;">
<path d="M0 1.146C0 .513.526 0 1.175 0h13.65C15.474 0 16 .513 16 1.146v13.708c0 .633-.526 1.146-1.175 1.146H1.175C.526 16 0 15.487 0 14.854V1.146zm4.943 12.248V6.169H2.542v7.225h2.401zm-1.2-8.212c.837 0 1.358-.554 1.358-1.248-.015-.709-.52-1.248-1.342-1.248-.822 0-1.359.54-1.359 1.248 0 .694.521 1.248 1.327 1.248h.016zm4.908 8.212V9.359c0-.216.016-.432.08-.586.173-.431.568-.878 1.232-.878.869 0 1.216.662 1.216 1.634v3.865h2.401V9.25c0-2.22-1.184-3.252-2.764-3.252-1.274 0-1.845.7-2.165 1.193v.025h-.016a5.54 5.54 0 0 1 .016-.025V6.169h-2.4c.03.678 0 7.225 0 7.225h2.4z"/>
</svg>
<span class="d-lg-none">CV</span>
</a>
</li>
<li class="nav-item">
<a class="nav-link" href="http://www.youtube.com/user/bharadwajnarasimha" title="Videos">
<svg width="16" height="16" fill="currentColor" viewBox="0 0 16 16" style="vertical-align: -0.125em;">
<path d="M8.051 1.999h.089c.822.003 4.987.033 6.11.335a2.01 2.01 0 0 1 1.415 1.42c.101.38.172.883.22 1.402l.01.104.022.26.008.104c.065.914.073 1.77.074 1.957v.075c-.001.194-.01 1.108-.082 2.06l-.008.105-.009.104c-.05.572-.124 1.14-.235 1.558a2.007 2.007 0 0 1-1.415 1.42c-1.16.312-5.569.334-6.18.335h-.142c-.309 0-1.587-.006-2.927-.052l-.17-.006-.087-.004-.171-.007-.171-.007c-1.11-.049-2.167-.128-2.654-.26a2.007 2.007 0 0 1-1.415-1.419c-.111-.417-.185-.986-.235-1.558L.09 9.82l-.008-.104A31.4 31.4 0 0 1 0 7.68v-.123c.002-.215.01-.958.064-1.778l.007-.103.003-.052.008-.104.022-.26.01-.104c.048-.519.119-1.023.22-1.402a2.007 2.007 0 0 1 1.415-1.42c.487-.13 1.544-.21 2.654-.26l.17-.007.172-.006.086-.003.171-.007A99.788 99.788 0 0 1 7.858 2h.193zM6.4 5.209v4.818l4.157-2.408L6.4 5.209z"/>
</svg>
<span class="d-lg-none">YouTube Videos</span>
</a>
</li>
</ul>
</div>
</div>
</nav>
<!-- Secondary Navigation (for About pages, etc.) -->
<!-- Main Content -->
<div class="container main-content">
<section class="content">
<h2>Search</h2>
<hr>
<div class="search-box mb-4">
<input type="text" id="search-input" class="form-control form-control-lg" placeholder="Search articles...">
</div>
<div id="search-results">
<p class="text-muted">Enter a search term to find articles.</p>
</div>
</section>
</div>
<!-- Footer -->
<footer class="footer mt-5 py-3 bg-light">
<div class="container text-center">
<p class="text-muted">Powered by Python, Pelican, Bootstrap and GitHub Pages!</p>
</div>
</footer>
<!-- Bootstrap 5 JS Bundle -->
<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
<!-- Lazy load images -->
<script>
document.addEventListener('DOMContentLoaded', function() {
// Add lazy loading to all images in article content
document.querySelectorAll('article img, .content img').forEach(function(img) {
if (!img.hasAttribute('loading')) {
img.setAttribute('loading', 'lazy');
}
});
});
</script>
<script type="text/javascript">
// Search index - all articles
var searchIndex = [
{
title: "Application Developer View: PostgreSQL vs. MySQL",
url: "https://bharath12345.github.io/posts/application-developer-view-postgresql-vs-mysql/",
date: "Wed 01 January 2020",
category: "posts",
content: "I reluctantly started to write this post some 6 months ago. As a application developer my knowledge of the internals of DBMS design was (and still is) very limited. It is one thing to work with a DBMS at development and quite another to keep it running as part of IT Operations. My motivation here is to share a few specific ideas with fellow application developers. The attempt is to do a value judgement of the two systems from a development standpoint and steer clear from a value judgement in the deployed scenario. After all DBMS systems are probably at the heart of more Aps vs. Ops debates than anything else. Table of Contents The \u0027Null\u0027 Problem Object Relational Database System! Choice Of Data Types and Storage Performance Philosophical difference that influences technology Epilogue Quoting from the references There is no reason at all to use MySQL: MariaDB, MySQL founder Michael Widenius Now, to say it simply (at the cost of barbs from some of my good friends who I know to be excellent operations engineers for MySQL) - PostgreSQL leads MySQL. And by some distance. Apart from reading about the internals and playing with both systems I felt a need speak to whomever I could in the developer community to ask for the reasons behind the choice of DBMS in their projects. In the last 6 months I could speak to just about eight such people in different projects. Almost all from medium to small companies doing web applications (but some of these projects were themselves quite large). After speaking to these people there is one thing that I cannot but share - the answer to Why MySQL from all who had chosen it was - \"Unfortunately, MySQL had already been chosen by the time I got involved\". Of the eight, six had been running projects for 2-3 years of which three had chosen MySQL. Rest had all opted for PostgreSQL. When I told a colleague of writing this article he smiled and asked a polite, Why? After all, the web is filled with such articles. Mostly written by expert database admins. There are fewer articles from the application programmer point-of-view. I can think of two reasons why there are not many programmers dissecting this - Developers find it difficult to talk on this topic in which the Operations folk have strong opinions. In many projects of the DevOps kind the decision to pick the database is the prerogative of the Operations folk than the Developer folk From a developer perspective, the PostgreSQL vs. MySQL debate is a non-starter. PostgreSQL wins. And wins quite early (you will know the why by the end of this post) But before delving deeper into the comparison its good to set the application context - Enterprise Applications. By this, I mean the application has more moving parts than a typical web-stack. The number of tables could stretch into hundreds. Data is collected from myriad sources in real-time Read-write ratio varies vastly across tables. Database needs to support 90% (and upwards) read-only tables and also tables with much higher write than read, say 60% (and upwards) Many thousand transactions per second Hundreds of stored procedures Automating migrations, upgrades and sharding Given that the topic is vast and both softwares are widely used its probably a good idea to start by pointing to some of the good references for comparison from the wild web - MySQL vs. PostgreSQL - recent and continuously updated. Readers would do well to read the articles in the links section (on last read, I did not find a single article talking glowingly about MySQL in comparison to PostgreSQL) Couple of very good articles comparing these two by Robert Haas Table Organization Vacuum vs. Purge PostgreSQL vs MySQL: Which is better? - This article is 10 years old. Still a good read MySQL Gotchas and PostgreSQL Gotchas. Just stare at the size of these two lists for some time even if you don\u0027t read them. They tell a story Comparing Reliability and Speed A Comparison of Enterprise Suitability - PostgreSQL is Suited Better - though MyISAM focused, this comparison is with enterprise products in purview and is 5 years old (2008). Since then, the gap between PostgreSQL and MySQL have only widened in favour of PostgreSQL despite InnoDB I plan and hope not to repeat anything that is already said in these articles. And agreeing with the many writers of these articles, I don\u0027t see any point doing performance benchmark comparisons between these two database systems. But I do want to point the interested readers to the political aspects in this comparison (I have quoted from this interview at the end of this article). MySQL has been acquired by Oracle. Its only natural to have concerns about the future roadmap of MySQL given these concerns which affect technology deeply... Moving on to the specifics... The \u0027Null\u0027 Problem The biggest accusation one can make against any RDBMS is that it is not careful with data integrity. MySQL is notorious for its inability to handle Null with many data types. Effort to accommodate query mistakes ruins MySQL. For example - MySQL will insert empty strings for text fields that have not-null constraint. This happens if you forgot to mention a field during the insert or if you somehow ended up inserting a blank value (\u0027\u0027) for a field. It goes ahead with the insert in both these cases. Irrespective of weather we use ORM or direct JDBC or some other kind of wrappers, there simply is no way to gracefully handle this problem. PostgreSQL won\u0027t do such a thing Non-null timestamps end up getting all zero value dates. If you push a NULL as date, it defaults to current time! With decimal numbers, if you are not careful with precision and scale, then, on an insert MySQL will change the data to fit the column constraints. Of course its necessary to be careful when playing with data but the problem here is a change in precision (column constraint) should in no way change the data as MySQL does. This kind of problem is just plain horror. Just refer to the MySQL gotchas site to get a clear understanding of this problem. Postgres does not alter data no matter what While writing functions, MySQL does not throw graceful exceptions for divide by zero. It just returns a plain NULL all the time! In MySQL set a text field length to X and insert a string which is 2X in length... MySQL will just promptly truncate the extra X. Now, for gods sake - the length X was a constraint. On trying to insert longer length strings, we expect MySQL to throw errors... not play with our data... MySQL has no idea about dates. Try inserting 31st Feb and it will promptly comply inserting crap MySQL will allow inserting of strings to decimal columns, sometimes storing it as 0 and sometimes as NULL These problems are by no means all that is there to be said about MySQL\u0027s SQL compliance. MySQL takes liberties to not abide by user supplied constraints in many more situations. And this aspects creates massive problems for developers on both correctness and performance fronts. Object Relational Database System! PostgreSQL calls itself Object Relational Database System. This is so because it brings with itself many new ideas that lend very well with the OOPS modelled world (that developers are so used to). And this paradigm fits the enterprise data models and requirements quite well. Let me state three specific features - Logical Partitioning Windowing Functions Table Inheritance Each of these features can be quite critical with the ever increasing data that needs to be handled in today\u0027s world. It takes some reading to understand each one but it is well worth the effort. On the other side I fail to find any feature that MySQL brings that may be absent from PostgreSQL (think about it - thats a very big assertion I make!). To illustrate the point further let me describe one of my favourite features - table inheritance with an example. The below statements create tables where the column name belongs to the base table (shape) and columns like edge, radius belong to the derived tables. This model closely resembles how data is modelled in OOPS. Running the above SQL statements, will result in following status in different tables - shape - 4 records square, circle, rectangle tables - 1 record each! SQL Statements - CREATE TABLE shape ( name varchar(50) ); CREATE TABLE square (edge int) INHERITS (shape); CREATE TABLE circle (radius int) INHERITS (shape); CREATE TABLE rectangle (w int, h int) INHERITS (shape); INSERT into shape (name) VALUES (\u0027random\u0027) INSERT into square (name, edge) VALUES (\u0027square\u0027, 10); INSERT into circle (name, radius) VALUES (\u0027circle\u0027, 10); INSERT into rectangle (name, w, h) VALUES (\u0027rectangle\u0027, 5, 10); Like \u0027INHERITS\u0027 there also is a \u0027NO INHERITS\u0027 to mixin different tables with precision. And more importantly, Postgres uses partitioning under the covers to enable inheritance. So, not only does inheritance give the programmer flexibility in data modelling lending but it also leads to lesser duplication, and thus helps improve performance! Without inheritance, the engineers will be forced to do multiple table joins and filters (many times going up to boolean value marker columns) - which sounds over-engineering for a OOP developer standpoint. Thinking about it, the non-object oriented SQL design adds to overhead to SQL optimiser, makes indexing overhead higher and many more such misses. Choice Of Data Types and Storage MySQL has far fewer data types than PostgreSQL. Adding new data types to MySQL is a non-trivial error-prone work even for experience professionals. Compared to this, PostgreSQL offers a proverbial goldmine of data-types for designers to choose from. Here are some aspects about data-types that really makes PostgreSQL standout vis-a-vis MySQL - Data types for Dates - A massive choice to choose from for specific usecases Data types for IPv4, IPv6, MAC, Inet address Data types for Arrays, JSON, UUID, XML with features like search within Arrays using indexes and where clauses Data types for floating point numerics - rounding errors can be eliminated to a much larger extent with the massive choice available in this area Infinity, -Infinity, NaN as values for numeric data types - in MySQL one has no way of modelling these. Modelling these as nulls often leads to programming complexity and errors ORM tools often convert \u0027String\u0027 datatype to nvarchar(max) which kills performance on MySQL. Inserting multibyte characters (say Japanese) into varchar fields completely corrupts data (no database exception thrown!). Sometimes it is not sufficient to just change the column type to nvarchar when trying to store multibyte characters. Even the insert statements need a prefix (application level code change if you are using JDBC). PostgreSQL uses default UTF8 encoding. There is no varchar/nvarchar problems. Everything simply works! Adding constraints to complex types likes dates is made extremely simple with embedded functions. No such thing possible in MySQL. Special keywords like \u0027today\u0027, \u0027tomorrow\u0027, \u0027yesterday\u0027, \u0027allballs\u0027 etc lend readability to the code All strings are default UTF-8 encoded Serial and other sequences - leads to very fast ID key finding and incrementing Data type for Money! Index even functions (no other DB does this) Automatic Data Compress by Default Why are data-types important? Modelling precisely leads to less data stored. When performance becomes important to squeeze out the max performance requires optimised storage... because finally, things in DB schema are going to end up in RAM caches and larger datatypes will mean more space being taken up on the RAM. Less conservatively used RAM cache will bring down the performance of the application more than anything else. Performance Comparing performance of PostgreSQL and MySQL (InnoDB) is a loaded question. The references I have spelt out earlier have links to many scholarly articles that articulate the subtle differences in the MVCC implementation of both. Both provide row locking, page locking, along with read/write lock separation. After digging into the details picking one of these two on the basis of performance comes back to the nature of the application that is being built. Designers should pay attention to three critical questions and answer them sufficiently before making a choice - Read/Write characteristics of the application Concurrent access characteristics of various tables Cost of dirty reads, non-repeatable reads, phantom reads etc These are not easy questions to answer. The performance area is complex enough and if concurrent writes requirements of an application are extreme then moving away totally from SQL to NoSQL is a better option than trying to split hairs over RDBMS engines. A move to NoSQL brings massive freedom to design around write and concurrent access problems (along with massive responsibility to handle things correctly!). So, choosing MySQL over PostgreSQL due to some notions of higher performance without concrete answers to the above posers, in all probability, will lead to a disaster-in-waiting. Philosophical difference that influences technology Some experts have pointed out a subtle but important philosophical difference between MySQL and PostgreSQL that impacts their core technological offering. MySQL is a product while PostgreSQL is a project. MySQL has been a product since its inception and sold multiple times over by different companies that have owned it. Due to the product definition and ownership, large scale code corrections have been fewer with MySQL. This philosophical difference is what is behind the fact that MySQL is still in v5.x while PostgreSQL in v9.x. This difference also leads to a design where MySQL separates the storage engine and SQL parsing as different (and many different storage engines can be chosen). While PostgreSQL integrates the whole stack top-to-bottom. The folks behind PostgreSQL are driven to bring the progress in database technology to the fingertips of developers and admins. Thats why PostgreSQL has made larger course corrections in its evolutions (lending to a bigger version number v9). Epilogue I have a hypothesis. MySQL is more popular in applications developed using Ruby, PHP, Perl or Python. Just like Microsoft\u0027s SQL-Server is the default database if you are a C# application. This is so because of the community and peer group effect. And also because there are many tools and expertise within the ecosystem if you choose a popular stack. But the most popular language to develop enterprise applications is Java. And I personally get more fond of Scala by every passing day. So the hypothesis is, for JVM developers MySQL does not lend well just because of the community/peer-group effect. So the choice needs to be based more on technological pro\u0027s and con\u0027s. Quoting from the references There is no reason at all to use MySQL: MariaDB, MySQL founder Michael Widenius What Oracle is doing wrong (visit the website to find the reference for each point) New \u2018enterprise\u2019 extensions in MySQL are closed source The bugs database is not public anymore The MySQL public repositories are not anymore actively updated. Security problems are not communicated nor addressed quickly (This is making Linux distributions very annoyed with Oracle) Instead of fixing bugs, Oracle is removing features: New code in MySQL 5.5 doesn\u2019t have test cases anymore. Some of the new code is surprisingly good by Oracle, but unfortunately the quality varies and a notable part needs to be rewritten before we can include it in MariaDB And, probably worst of all, it\u2019s impossible for the community to work with the MySQL developers at Oracle. Oracle doesn\u2019t accept patches There is no public roadmap There is no way to discuss with MySQL developers how to implement things or how the current code works"
}, {
title: "Functional Conference: Random Notes...",
url: "https://bharath12345.github.io/posts/functional-conference-random-notes/",
date: "Fri 17 October 2014",
category: "posts",
content: "The first \u0027Functional Conference\u0027 happened in Bangalore between Oct 9-11. I had been keenly looking forward to it. This is a quick post on the sessions I attended and the conference itself. As the lineup of speakers and topics shaped up in the buildup to the conference on their website, it heightened my expectations. As a younger engineer I have gone through the cycle of expecting too much from conferences and thus not being able to learn sufficiently from that which was on offer. Time has had a mellowing effect... I find it much better to keep an open mind and try to absorb all that is on offer. And then, a little later, retain only that which is useful/pertinent. With that mindset and approach I found \u0027Functional Conference\u0027 a very fulfilling technology experience - plenty of technical richness to absorb and sufficient ideas to retain for long. Day 1, Session 1: The Keynote, by Venkat Subramaniam Venkat is as fabulous a speaker/presenter as he is writer/thinker. The theme of his keynote was an elaboration on the idea of mainstream. Why did it take many centuries for heliocentricity to gain acceptance over the mainstream idea of geocentricity? Why did it take many centuries for well meaning doctors to accept the existence microbial germs as the cause of diseases over other widely held mainstream theories? Mainstream in the world of programming is OOP in the style of Java and C++. They may not be false idol\u0027s after all. However that non-mainstream is generally not even introduced in colleges and software engineers have proceeded to long careers without even basic understanding of other programming approaches is sad indeed. Venkat drew the attention of the audience that things were nevertheless changing. Maybe it took a long incubation for the geocentric idea to gain... but once the right ideas, even if non-mainstream, gain a foothold, there is no turning back. Maybe functional programming has had a 80 year incubation! After all it took 22 years for even OOPS to become mainstream. But things are changing (lambdas in java!) and will never be the same again! Two answers by Venkat in the post-session stuck a chord with me. The first was a question on his favourite language... after all he had written books and applications in so many! Venkat responded by saying he treated languages as tools, say like vehicles. we sometime use a car and sometimes a flight, don\u0027t we!? So no favorites. The second was actually a counter-question by Venkat - Do languages shape new thought or new thoughts shape languages? There is enough material in terms of academic research to tell us that this stream runs both ways! Day 1, Session 2: Haskell for everyday programmers, by Venkat Subramaniam I was split between going for the Haskell session or the parallel Elm session. Since my work has been more and more away from UI, I chose Haskell. However, later on, heard great feedback on the Elm session by other folks at the conf. Now waiting for the slides of that session to be up to check it out. The Haskell session was a runaway hit with Venkat giving a quick intro of the many aspects of the language using the ghci REPL. The key ideas learn/relearnt were: Polymorphic Types Functional Purity (no haskell speaker can never not mention this!): functions cannot have side effect. and purity always means thread-safety! Memoization: the massive performance gain that could comes ones way due to functional purity Order of program evaluation: Normative vs. Applicative Expressions vs. Statements: statements promote mutability that one cannot escape. expressions do the opposite Day 1, Session 3: Functional programming in large scale data processing, by Kishore Nallan My day job is programming in Scala to build a large scale data processing platform. So choosing this session from a fellow traveler was natural. Kishore described the journey at Indix to build a web-scale product catalog by crawling and indexing the internet. The story behind their adoption of the Lambda Architecture as propounded by Nathan Marz. Kishore spoke of the benefits of using a log-structured database as first port of store than a continuously mutating RDBMS or column store. Indix is a big Hadoop shop with continuous jobs to persist data, aggregate it and run both ritual/ad-hoc queries. It was a fascinating talk giving a peek into what must be a very exciting product to develop. Day 1, Session 4: Compile your own cloud with MirageOS, by Thomas Gazagnaire Unikernels are specialized OS kernels that are written in a high-level language and act as individual software components. A full application (or appliance) consists of a set of running unikernels working together as a distributed system. MirageOS is written in the OCaml (http://ocaml.org) language and emits unikernels that run on the Xen hypervisor. One may ask - whats the main advantage of unikernels? Unikernels win by allowing applications to access hardware resources directly without having to make repeated privilege transitions to move data between user space and kernel space. Unikernel OS are being attempted in more languages than just OCaml. There is HaLVM in Haskell, Ling in Erlang, OSv in Java and maybe more. This introduction to unikernels and perspective on Virtualization was superlative and I wish I could have absorbed more. Day 1, Session 5: Property based testing for functional domain models, by Debasish Ghosh I have been an avid reader of Debasish Ghosh\u0027s blog and books. They are rich both in theoretical arguments and practical advise. Was thus looking forward for this session. Debasis introduced ScalaCheck/QuickCheck to the audience. Since I have used ScalaCheck before the tool itself was not new. However, the theoretical underpinnings for property based testing were a big takeaway. To quote a few statements from the session that will stay with me - \"To test any sufficiently complex domain model with xUnit based testing will mean some corner cases will be missed\". And unit-testing, automated-testing is all about catching those corner cases \"Paramatricity tests more conditions than unit test suites ever will - Edward Knett\" The talk also included a intro to dependenty-types and parametric polymorphism. One key takeaway of attending conferences is coming to know of new books - and after this session \"Theorems for free!\" by Phil Wadler got added to my ToRead list. Straddling sessions - Session 6: Clojurescript \u0026 Om, and Code Jugalbandi. Session 7: Functional Groovy, and Learning from Haskell Eearlier in the day, Naresh Jain, the chief organiser of the conference, had advised the attendees to use what he called \"the law of two feet\" - the law asks the attendee you to get the most out of the conference by walking to the sessions even in-between, if required. Unable to decide which session to stay in for the last two sessions I decided to use this law! Om is a library for ClojureScript programmers. Vagmi provided a breezy intro to why ClojureScript makes React.js faster. And finally, why/how Om make ClojureScript faster by giving an example of DOM diffing and the showComponentUpdate() call Code Jugalbandi was a very interesting act between two programmers (playing the roles of Brahma and Krishna) to showcase interesting features like currying and pattern matching across languages. It was like a breath of fresh air to the otherwise usual way of sessions at a conference Groovy is a dynamic language on the JVM. Since I have never programmed in Groovy I was pleasantly surprised with its many capabilities in functional programming showcased by Naresha The Haskell experience session was filled with anecdotes that the speaker, Aditya Godbole, recounted from his workplace (and elsewhere) in trying to bring in healthy-code practice Day 2, Session 1: The role of Fear in language adoption, by Bruce Tate Bruce Tate\u0027s book \"Seven Languages in Seven Days\" was probably the first book I bought on my new Kindle. I was looking forward to hear from this Guru of Java and Ruby. The title added to the curiosity. Bruce kicked off the talk making some very thought provoking observations - Fear and Discovery are intertwined When fear lurks within the team, code commits go down! Bruce went on to draw a parallel between Geoff Moore\u0027s celebrated \u0027Technology Adoption Curve\u0027 and called for the audience to think of language adoption on similar lines. Languages die in the chasm... Just as frequency of technology waves frustrates businessmen, the frequency of language waves frustrates programmers (how many new languages did I come across just in 2014 so far - Wolfram, Swift, Hack... and I can go on...) Just like \u0027behaviour change\u0027 apps have a difficult time in adoption, significant syntax changed languages lose out in the mass programmer market. Java success due to its almost identical syntax to C is NOT incidental... And then there are language adoption curves and language paradigm adoption curves. The curves for Procedural, OOPS and Functional are much deeper, wider and steeper What are the fears for different consumers of languages: Paralysing Fear: Jobs, Cost Motivating Fear: Concurrency, Multi-core, Time-to-market Now here is a question to all career programmers - how many more years do we want to program in the C/C++ style syntax?? The next wave of language adoption will NOT be a big massive wave like Java. It will instead be a tsunami of many smaller waves composed of of the Scala\u0027s, Clojure\u0027s, Ruby\u0027s, Swift\u0027s, Ocaml\u0027s and many many more... Day 2, Session 2: Functional programming using Dyalog, by Morten Kromberg APL. I had never heard of it. The name of Ken Iverson sounded familiar and in league with Alonzo Church, Alan Turing, Haskell Curry et al. But nothing more... And what an eye-opener it was! If there was an award for the most mind-blowing session, then this one won the 1st, 2nd and 3rd places! The APL language is beyond words that this humble writer can conjure up. It takes the idea of functional programming to an all new level is all I can say (and yet its not logic programming like Prolog or likes). However the story of Dyalog as a company and its business came across as not any less astonishing. To be in software development business for over 40 years, going through the many industry upheavals (mainframes -\u003e unix/windows -\u003e cloud) and solving some of the most difficult problems in all streams of engineering and yet remaining totally unknown to most! Morten Kromberg, the CTO of Dyalog, had a interesting observation to share - \"All engineers take to APL easily except the software engineers\". Now that says it all about our computer science education system. Day 2, Session 3: Monads you already use, by Tejas Dinkar Next was a lightening talk by Tejas on giving a (yet another!) perspective on Monads. In a delightfully constructed talk Tejas presented the idea through a box analogy and thereby trying to simplify the understanding of monads for list, IO etc. There would not be a functional programmer on the planet who has not heard/read at least one video/blog on Monads without scratching his head in disbelief. It takes a certain bravado to attempt presenting the topic to a roomful of programmers at the functional conference! And Tejas did a splendid job of it. Day 2, Session 4: Purely functional data structures demystified, by Mohit Thatte Mohit\u0027s talk was based on Chris Okasaki\u0027s famed work on the topic \"Purely Functional Data Structures\". This is a deep topic. I have tried to read Okasaki\u0027s work and have found it hard to get past the first few chapters. It is in the same league as SICP. I was curious about how justice for such a voluminous work can be done in one session! And the first challenge was to try and formulate the problem definition - Mohit kept the audience glued as he demystified why it is tough to implement a correct, fully functional and performant implementation of common ADT\u0027s like Queue, List, Map etc. The next challenge was to explain the complex idea of structural sharing. The whole idea of persistence in data structures and performance is after all derived from structural sharing. One has to really try and attempt implementing common data-structures with structural-sharing to start comprehending the complexity that it introduces and the power behind the idea. That it becomes extremely complex to come up with even simple sounding list implementations when no side effects barrier is introduced is also an idea that needs real hands on work to grasp. Mohit\u0027s session was a fine effort to jog my memories of my few late night battle-losses with Okasaki and the illuminating world of persistent data-structures! Day 2, Session 5: Demystify functional jargons, by Mushtaq Ahmed Scala has multiple library APIs that are very well suited for certain design usecases. In this talk Mushtaq covered, with examples, the when-to, why-to and how-to use the Async, Await, Blocking/Future/Promise. In addition to these in-house Scala utilities he also demonstrated the usecase for Observables in the context of streams. He demonstrated an example of a simple application built using these to capture, filter and search on tweets. Day 2, Session 6: Object-functional programming: Beautiful unification or kitchen sink, by Rahul Goma Phulore This was a talk I was eagerly looking forward to. The reason was very specific - just days before this conference a couple of close friends (who were former colleagues) engaged me in long winding discussion on building large applications in Scala. Like all languages Scala has its pro\u0027s and con\u0027s. However opinions stand divided by a wide(ening?) chasm... I was looking forward to Rahul to give me some new perspectives. In a technically-engaging talk Rahul deconstructed the myth of Scala being a kitchen-sink. The very start of the talk was made intriguing when he proceeded to ask the audience 3 questions - (a) how many see the future in a purely OOPS world? (b) how many see the future in a purely functional world? (c) how many see the future in a hybrid of both? Almost no hands went up for question (a). A few enthusiastic hands went up for question (b). But almost ALL hands went up for question (c). That was significant food for thought in itself. Scala has been called the \"Grand unification of all programming languages\". It has also been called \"Vegetarian ham in chicken flavor\"... arguments like these have split the programming world into tribes of believers/unbelievers without significantly adding to the knowledge/understanding of either groups. Coming from a workplace where we use Scala predominantly I can testify this to be true by experience. But it pays great dividends to dive-in a little deep to understand just how Scala provides this grand-unification. Thats where real illumination is. How can functions be first-class objects? How does pattern-matching happen under the hood? The idea behind algebraic data types? How can using the mere term sealed lead to exhaustiveness checking leading to much higher type-checking kick-in at compile-time? How do mixins works? Rahul had it all covered. Day 2, Session 7: Methodologies, Mathematics, and the Metalinguistic Implications of Swift, by Daniel Steinberg How do we learn? Did we really learn programming when we first read a programming book? In my case, the first programming book I came across was, probably, \u0027C Programming\u0027 K\u0026R. And I did not learn a think even after months of reading and even typing the code in first few chapters! So, how do we learn? Lets leave programming for a moment. How did we learn math? How did we learn geometry? Were we able to see the problems and solutions? For example, did the area of a triangle just always mean, half multiplied by base multiplied by height, so much so that we proceeded to find the area of a triangle with sides {2,3,5}, OR we could see why/how the triangle\u0027s area was so? Daniel is the author of multiple books in the iOS Apps world. In earlier life, he had been a math teacher at high school. And he came across as a fabulous father to his young, learnful kids. Daniel urged us to think how we learn. And thereby also think about how we teach. We don\u0027t learn by knowing the rules. We don\u0027t learn when someone tells us a definition of something. We learn by realising things bit-by-bit. We learn by building small things and making them bigger. We learn by looking at things built by others. We learn slowly. We learn more by observation than by anything else. And Daniel taught this to a roomful of adults. By showing us Donald Duck cartoons and explaining us pythagoras theorem. I became a dad just few weeks ago - And I can only thank Daniel for this talk. I learnt immensely. Epilogue In between the sessions I had a chance to meet and discuss with Kishore Nallan, Debasis Ghosh, Venkat Subramaniam and many others. Except ThoughtWorks I don\u0027t think the conference had many representatives from the established big companies - which in itself was a boon as I got to hear and know so much happening in the startup/small-company world (that smaller companies are riding the wave of newer technologies must come as a no surprise). The value of a conference does not lie just in the sessions but also in realising a few things like these iin-between. I also ran into some of my old acquaintances and it was every second worth of my time to come to know of their new technical endeavours and the fate/trajectory of those that we shared long ago! Functional Conference was a wonderful conference. Am glad that I was there."
}, {
title: "Experiments with XML XPath libraries on JVM",
url: "https://bharath12345.github.io/posts/experiments-with-xml-xpath-libraries-on-jvm/",
date: "Sat 28 June 2014",
category: "posts",
content: "These days I mostly program in Scala. Few weeks ago I ran into a problem to search for data within fairly large XMLs. XPath and XQuery are the standard technologies to query XML\u0027s. JVM programmers have a choice of multiple libraries to choose from when it comes to XPath. One constraint in my problem was that the program to crunch these XML was a long-running one. So, apart from trying to make the search fast I had to make sure that the CPU/memory requirements were sane. On submitting a XPath search if a library forked many hundred threads, broke the XML into many hundred stubs thus consuming every single ounce of CPU/RAM at disposal on my machine, then it was simply a no-go. Even if such a library turned out to be an order of magnitude faster than the rest. Table of Contents Results Tabulated JVisualVM Graphs Code javax.xpath Saxon VTD Scala Epilogue A look at the XML-XPath JVM library landscape made me shortlist the following for a quick investigation - scala.xml - Scala\u0027s built-in parser javax.xml.xpath net.sf.saxon vtd-xml This post is a work-in-progress and I will refrain from drawing conclusions. As and when I find more, I shall add. Some passing reader may find the numbers helpful for some other cause in the wild. Now, the environment details - The approx size of XML\u0027s I used was ~ 70MB. That does not make it very large but the complexity of the structure can be the dark variable in XML processing. Even a 5MB XML with small elements, recursive lookups etc (those that people refer to as XML database) can be much harder to search within than a 500MB one which has a straight simple flow (say like Log4J Xml logs). The XML I used was neither as complex as a database or as simple as a log. It was more alike the configuration (more complex than tomcat web.xml but similar) XML files with fairly deep nesting All numbers are mean over run of 30 iterations. they should be treated as ballparks Tests were run on my 4core 8GB Mac OSX Mavericks Java version \"1.7.0_51\". Scala version \"2.11.0\" No cpu/memory hungry process running on the system while running the test. It was just a text editor, console, test application and operating system services after a fresh reboot Tests tried with 4 big buckets of Xmx setting - 512M, 1024M, 2048M, 4096M All numbers and screen captures are with jvisualvm. wanted to use jstat but got a little lazy One important consideration while choosing a XML library is the API. But that is project specific and I leave it out of this comparison. Results Tabulated Xmx512m \u00a0 Time Taken App CPU Usage GC CPU Usage App Heap Size Heap Used Eden collection count/time spent Old Gen collection count/time spent Eden pattern Survivor pattern Old Gen pattern scala.xml 240s 70-80% 20% 512M 250-300M 359/15.2s 303/3m18s either 0M or 170M not much usage between 170-340M javax.xml.xpath does not complete net.sf.saxon.xpath 67s 60-80% 20% 512M 250-300M 162/6.2s 123/39.3s 0-170M tall spikes consistent use of 57M * 2 stepwise between 0-340M vtd.xml 11s 26% 0.10% 500M 150-250M 13/138ms 9/262ms between 100-170M very less and infrequent between 80-240M Xmx1024m scala.xml 85s 70-80% 20% 1G 250-500M 299/36s 38/14s 0-340M tall spikes 100M consistent 80-600M neat triangles javax.xml.xpath 57s 50-70% 10-20% 1G 250-500M 197/14s 34/15s 0-340M tall spikes 100M consistent 200-600M neat triangles net.sf.saxon.xpath 49s 50-70% 10-20% 1G 250-500M 110/12s 34/15s 0-340M tall spikes 100M consistent 200-600M neat triangles vtd.xml 11s 30% 1-2% 300-800M 200-700M 11/66ms 6/204ms 200-300M 10M 400-600M Xmx2048m scala.xml 70s 70-80% 10-20% 2G 0.5-1G 154/27s 26/21s 0-680M tall spikes 100M consistent 200M-1G neat triangles javax.xml.xpath 59s 40-70% 10-20% 2G 0.5-1G 105/14s 23/17s 0-680M tall spikes 100M consistent 0.3-1.1G net.sf.saxon.xpath 39s 40-70% 10-20% 2G 0.5-1G 69/10s 18/8s 0-680M tall spikes 200M consistent 300-600M vtd.xml 11s 26% 0% 0.5-1.25G 0.5-1.25G 14/190ms 6/272ms 600M consistent 200M 1.3G no pattern JVisualVM Graphs javax.xpath CPU and Memory javax.xpath GC Saxon CPU and Memory Saxon GC VTD CPU and Memory VTD GC Scala XML Xpath CPU and Memory Scala XML GC Code javax.xpath import org.w3c.dom.Document; import java.io.IOException; import org.xml.sax.SAXException; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.parsers.ParserConfigurationException; import java.io.FileInputStream; import javax.xml.xpath.XPath; import javax.xml.xpath.XPathFactory; import java.util._; import javax.xml.xpath._ import org.w3c.dom.NodeList object Main extends App { try { val builderFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance(); val builder: DocumentBuilder = builderFactory.newDocumentBuilder(); val xPath: XPath = XPathFactory.newInstance().newXPath(); println((new Date()).toString) val compexp = xPath.compile(\"/mycompany/MyResourceSet/MyResource/MyResourceList/MyResource[@displayName=\u0027Dummy\u0027]\") def evalXml() = { val document: Document = builder.parse(new FileInputStream(\"sample.xml\")); val node = compexp.evaluate(document, XPathConstants.NODESET) node match { case n: NodeList =\u003e println(n + \" at \" + (new Date()).toString + \" len = \" + n.getLength()) case _ =\u003e println(\"typecast to NodeList failed\") } } val t1 = System.currentTimeMillis val i = 30 for(j e.printStackTrace(); } } Saxon import java.io._; import java.util._; import org.w3c.dom.NodeList; import javax.xml.parsers.DocumentBuilder; import javax.xml.parsers.DocumentBuilderFactory; import javax.xml.xpath.XPathFactory; import javax.xml.xpath.XPathExpression; import net.sf.saxon.xpath.XPathEvaluator; import net.sf.saxon.xpath.XPathFactoryImpl; import org.w3c.dom.Document; import javax.xml.xpath.XPathConstants; object SaxonEx extends App { val builderFactory: DocumentBuilderFactory = DocumentBuilderFactory.newInstance(); val builder: DocumentBuilder = builderFactory.newDocumentBuilder(); val factory = new XPathFactoryImpl(); val xc = factory.newXPath(); val xpathCompiler: XPathEvaluator = xc.asInstanceOf[XPathEvaluator]; val xstring = \"//mycompany/MyResourceSet/MyResource/MyResourceList/MyResource[@displayName=\u0027dummy\u0027]\" val expr: XPathExpression = xpathCompiler.compile(xstring); println(\"running SaxonEx:\" + (new Date()).toString) def evalXml() = { val document: Document = builder.parse(new FileInputStream(\"sample.xml\")); val node = expr.evaluate(document, XPathConstants.NODESET); node match { case n: NodeList =\u003e println(n + \" at \" + (new Date()).toString + \" len = \" + n.getLength()) case _ =\u003e println(\"typecast to NodeList failed\") } } val t1 = System.currentTimeMillis val i = 30 for(j VTD import com.ximpleware._; import com.ximpleware.xpath._; import java.util._; object vtd extends App { val vg: VTDGen = new VTDGen(); def loopvtd = { vg.parseFile(\"sample.xml\", false); val vn:VTDNav = vg.getNav(); val ap:AutoPilot = new AutoPilot(vn); ap.selectXPath(\"/mycompany/MyResourceSet/MyResource/MyResourceList/MyResource[@displayName=\u0027dummy\u0027]\"); val x = ap.evalXPath() if(x != -1) println(\"eval returned \" + x) else println(\"eval failed\") val value: Int = vn.getText(); if (value != -1) { val title:String = vn.toNormalizedString(value); println(title); } } val t1 = System.currentTimeMillis val i = 30 for(j Scala #!/bin/sh exec scala \"$0\" \"$@\" !# import scala.xml import scala.xml._ import java.util._ def findout(filename: String) = { val xf = xml.XML.loadFile(filename) val cec = (xf \\\\ \"MyResource\" filter ( _ \\\"@displayName\" contains Text(\"Dummy\"))) } println((new Date()).toString()) val t1 = System.currentTimeMillis val i = 30 for(j Epilogue VTD comes across as the fasted XPath of all. Saxon comes next. The standard library implementations of XPath by Java and Scala are much slower. The Scala implementation is not XPath at all and can just be called XPath like. The code is very simplistic to infer a lot from CPU/memory graphs. I have tweaked the code to get a little better inference and intuition. An interested programmer might do the same to get a better idea."
}, {
title: "Running Machine-Learning Assignments on my Laptop\u0027s Spark Cluster",
url: "https://bharath12345.github.io/posts/run-machine-learning-assignments-on-a-laptops-spark-cluster/",
date: "Mon 12 May 2014",
category: "posts",
content: "The latest offering of Coursera\u0027s popular course on Machine Learning by Andrew Ng started in the first week of March. The course requires Matlab\u0027s Octave to be used to solve the assignments. Apart from trying to solve the problems in Octave, I decided to solve the assignments in the programming language of my choice - Scala. This is a quick post on the why\u0027s and the how\u0027s. Why Scala, Spark and Distributed? As computing and web has grown, the size of data to process has grown much larger. To process these large volumes of data requires two things - (1) horizontal scalability/distribution (2) efficient usage of multicore compute. Languages like R and Octave are not built for either - that is, writing programs that run on a cluster and efficiently use all CPU/RAM is infeasible in these. They are good only for smaller datasets and POC (proof-of-concepts) on large production-like datasets. Large datasets and continuous data-streams requires software design and programming in compiled languages like C, JVM-based, Haskell etc. My preference is the JVM based languages. In the world of JVM, there are multiple open source frameworks that provide a platform to write statistical computing algorithms that run on a cluster, for example - Apache Spark Apache Mahout H20 I chose Spark. Spark is written in Scala. Its MLib implementation includes many of the popular/simpler ML algorithms. Spark makes use of Mesos or HDFS for distribution support. It started as research project at AMPLabs at UC Berkeley and is now incubated at Apache. Spinning up a Spark Cluster with Vagrant and Docker Running a cluster on laptop firstly requires it be computationally well powered. I use a 4-CPU \u0026 8GB-RAM Mac OSX machine. I would suggest that this is the minimum configuration. The second requirement is to have separation of the virtual machines that form the cluster from the system that runs it. I have found Vagrant to be a superb tool to run configurable virtual machines which can be shared with ease. Vagrant uses VirtualBox. I created a VM with Ubuntu 14 Trusty and allocated 2-CPU and 4GB-RAM for it exclusively on my laptop. The next idea is to run multiple VM\u0027s on this Ubuntu machine using Docker. Now why Docker? - If Vagrant provides heavyweight VM abstraction then Docker provides lightweight ones. The idea is to run multiple Docker based lightweight linux VM\u0027s on this Vagrant Ubuntu VM - this is because a Spark cluster needs multiple nodes like a Master, workers and Namenode (for HDFS). One can run Docker directly on the native machine using something like a TinyCore Linux OS. The steps to do so can be found on Docker\u0027s website. However it is better to avoid that and instead rely on Vagrant. There are couple of reasons for this - Tiny Core Linux\u0027s contents are not persisted across reboots. Since we would be coding on these VM\u0027s, a loss of contents is scary Allocating CPU/RAM to multiple nodes directly running in a laptop is unclean. Its not easy to achieve this CPU/RAM distribution in Docker too (along with shared folder support). Vagrant really comes handy to alleviate these shortcomings. Further, it is super easy to suspend a Vagrant VM and the whole cluster status will be persisted as-is... I can\u0027t think of anything more cool than that on the planet! The Steps My Vagrantfile is a small one. It uses Ubuntu 14.04 Trusty and allocates 4GB RAM and 2 CPU core exclusively - VAGRANTFILE_API_VERSION = \"2\" Vagrant.configure(VAGRANTFILE_API_VERSION) do |config| config.vm.box = \"shrink0r/ubuntu-trusty-server-x64\" config.vm.provider \"virtualbox\" do |v| v.memory = 4096 v.cpus = 2 end end SSH into the Vagrant box and clone this repository from AmpLabs to get going with the next Docker step Post cloning, run the command - sudo ./deploy/deploy.sh -i amplab/spark:0.9.0 -w 1 to bring up the cluster with 1 worker. We don\u0027t need more than 1 worker on our simulated cluster. And even with only one worker, there would be 4 nodes in this cluster (master, worker, namenode, domain name server). Expect this command to take quite some time to complete The next necessary step is to configure name resolution. The nameserver IP to put in /etc/resolv.conf would be shown at the end out console output of command run in step-3 Follow the steps on the Github page of AmpLabs docker-scripts repo to make sure that a Scala shell can be attached and the example run The next step is to download Hadoop and place it in the Vagrant system. Hadoop is required to interact with the HDFS (we need a client). I used Hadoop v1.2.1 The first assignment of Machine Learning course uses a txt file (ex1data1.txt) as data. The idea now is to place this on HDFS and run Spark linear regression on it. The HDFS in the AmpLabs cluster is created by a user called \u0027hdfs\u0027. So we need to mimic a user with the same name on the Vagrant client system (this is a hack). So create a new user... my interaction - $ sudo useradd -m hdfs $ sudo passwd hdfs Enter new UNIX password: Retype new UNIX password: passwd: password updated successfully $ su hdfs Password: $ whoami hdfs $ Next transfer the ex1data1.txt from the local filesystem to HDFS. Use the hadoop program from the downloaded Hadoop bundle (its in bin) to talk to the HDFS - hdfs@packer-virtualbox-iso:/vagrant$ hadoop-1.2.1/bin/hadoop fs -fs hdfs://master:9000 -mkdir /bharath hdfs@packer-virtualbox-iso:/vagrant$ hadoop-1.2.1/bin/hadoop fs -fs hdfs://master:9000 -put /vagrant/data/ex1data1.txt /bharath hdfs@packer-virtualbox-iso:/vagrant$ hadoop-1.2.1/bin/hadoop fs -fs hdfs://master:9000 -ls / Found 2 items drwxr-xr-x - hdfs supergroup 0 2014-05-17 17:07 /bharath drwxr-xr-x - hdfs supergroup 0 2014-05-17 12:06 /user hdfs@packer-virtualbox-iso:/vagrant$ hadoop-1.2.1/bin/hadoop fs -fs hdfs://master:9000 -ls /bharath Found 1 items -rw-r--r-- 3 hdfs supergroup 1359 2014-05-17 17:07 /bharath/ex1data1.txt So by now, we have a working Spark cluster and have placed our data on its HDFS. The next step is to write a Spark application. My application is called sparkling and its on github here. You may clone the repository onto your Vagrant Ubuntu box. You will need sbt to compile this project. The compilation could take some time. So far I have written just 2 programs in this project. There is one called \"Test.scala\" which does a simple line-count of the file placed on HDFS in the previous step. You may want to run this, it should print a count of 92 mixed in with a lot of java-logging output. If this program worked then you can run the other program \"LocalFileLinearRegression.scala\". The command to run these from the sbt prompt is, quite simply - ``` run-main in.bharathwrites.sparkling.LocalFileLinearRegression ``` 10. Instead of running the client program from the sbt prompt one can build a fat JAR using the assembly plugin. Doing so, one can run the client program from the command-line using the well known java -cp jar-name main-class PS: My project on github (sparkling) is my playground to learn spark. I will keep modifying the code in the coming days. So you may want to once read the code to check and see if it makes sense... as I have a tendency to check-in intermediary non-working code also! :)"
}, {
title: "Code Retreat",
url: "https://bharath12345.github.io/posts/code-retreat/",
date: "Mon 14 April 2014",
category: "posts",
content: "Last Saturday I happened to go to my first code retreat. Hosted by Multunus and conducted by the master programmer Venkat Subramaniam, it was a very learnful experience. Here\u0027s a sneak-peek of the event from my eyes. The Format Code retreats are day long coding fests. Programmers take a crack at solving Conway\u0027s Game of Life in multiple short sessions. Each session is for 45 minutes. Developers work in pairs of two (miniature extreme programming sessions). Every session starts with a clean slate, that is NO previous code at all. TDD is encouraged. After every 45 minute session is a 15 minute standup where everyone get to share the experience and things learnt. And at the end of the standup the group may decide to impose some constraints for the next session... like not-using-classes, no-global-state, no-returns-from-any-methods etc etc. The choice of programming language, frameworks used and design is all left to the pairing team. This meetup had close to 25 programmers who kept up the hacking whole day. Generally the 45 minutes time slot is insufficient to solve the puzzle along with writing tests. Especially if one is a newbie to the puzzle or language chosen or design idea. However, the the learning is not in solving the puzzle itself... but it lies in the experience of trying to do so within the setting. One can read more about the format and structure of Code Retreat at the community website - http://coderetreat.org My Experience In 5 sessions across the day, I, along with my partner, programmed in 4 languages - Scala, JavaScript, Python and Ruby! Here is a brief on each of my sessions... My first session was in Scala. My partner and I both had prior experience with the language. And thankfully my partner had some perspective and prior experience with the problem too. In a quick discussion we zeroed in on a subset of the problem we wanted to tackle (neighbour-finding in Game of Life) and agreed on a simple design. We leapt into coding starting with writing the tests. The test part straightaway exposed my poor knowledge of ScalaTest. Late while writing the code we got stuck in trying to write object equality in Scala - something I could do with my eyes closed in Java. That was the second expos\u00e9. We programmed on my partner\u0027s system and he used gradle - which was a first for me. JavaScript was common lingo with my next partner in the second session. Neither of us had used any testing frameworks in JS ever. That straightaway was an expos\u00e9! Since we did not have time to learn a new framework we decided to hand-code the tests. I don\u0027t remember the code now but we wrote some code that seemed to solve something. And along the way I re-discovered the right way to write inner functions in JavaScript. The next session had me pairing with a guy who wanted a sneak-peek into Scala (he had been a Java guy). This time we used my laptop to code. In order to give him a quick flavour of the language I decided to not use any IDE\u0027s and used a text-editor. I wanted to stay clear of ScalaTest and Scala\u0027s build system - so as to give my partner a good look at some mass of code and not distract him with externalities. We wrote code + tests in a single Scala script file. While coding I had this nagging realisation that I had not understood the problem well enough and wanted some thinking-time away from coding and just thinking about the puzzle. Lunch was approaching... so I decided to think hard and understand the problem better during lunchtime :-) Post-lunch I wanted to approach the problem differently from both design and language perspective. Was delighted to find a partner who could code in Python. This guy had put in some good think-time into the problem too and the approach we came up with was very refreshing for me! He coded away in Python which I found easy to follow and we wrote a lot of code. We were quite close to solving one of the situations in the game when the time ran out. Writing so much code to solve the problem meant that the test code was minimal. However I was beginning to appreciate the complexity and immense freedom of design that this simple looking problem posed at us. There were lot of Ruby developers in the group and I was very curious to get to see this language. So I chose a Ruby-man for the last session. This guy was much younger to me but surprised me by his thorough approach to TDD. He insisted on evolving the test code almost simultaneously with the main code and I must admit it initially frustrated me. I would have wanted to write a blob of test code and then write a blob of solution code and keep alternating giving good time to each. But my partner, who was the one coding in Ruby, would not have any of it. He requested that we keep shifting gears between the test-code and solution-code for every few lines of code. I knew he had a good point in what he was suggesting and asked if he could really program like this in his work time. His answer in affirmative was very convincing and I silently appreciated the young programmer\u0027s discipline. Epilogue Its not everyday that I force myself to think on good programming problems and this was an excellent one. That itself made me immensely happy. I was split on approaching the problem from a bottoms-up or top-down approach - which had me thinking about an aspect of design that I had not thought for some time. I don\u0027t remember the last time doing pair programming - its a fabulous thing and want to grab every opportunity of doing so in future. The code retreat certainly pushed me away from my comfort zone and thats a great way to learn. Having not solved the problem nags me. One of these days I will sit down to write code that solves Game_Of_Life to at least some extent from few approaches that are brewing in my head. Finally I recommend Code Retreat to all. Try to get yourself a booking the next time it is happening somewhere nearby. Last but not the least - heard some wonderful thoughts by Venkat on the endeavour of software development. And saw a wonderful startup in Multunus at its works. The day was delight - a Big Thank You to all my partners and all those who made the event possible."
}, {
title: "Play2 Application on Wildfly: Why and How",
url: "https://bharath12345.github.io/posts/play2-on-jboss-wildfly/",
date: "Thu 13 March 2014",
category: "posts",
content: "JavaEE (v5 and v6) has a commanding presence in both marketshare and (developer) mindshare in the enterprise software world. The specifications are well thought-out, battle-tested and highly relied upon. I started using JavaEE (v5) way back in 2007 with JBoss 4.x. The latest release, JavaEE-7, which was released close to a year ago brings with itself a lot of worthy changes to the specs and impl. To bring myself up to speed on it I went through few books and attended a conference (JUDCon, Bangalore). But I have also been coding and acquainting myself with Typesafe\u0027s Scala reactive stack. These two stacks are bound to compete with each more and more in the coming days. However I feel, they can be used in applications in complementary ways when carefully designed. The competition and challenge to JavaEE-7 stems from two tough requirements - Table of Contents Web Tier In JavaEE - The Loose Brick? Play Framework HOWTO - Play2 on Wildfly to interop with JavaEE Horizontal scalability Near real-time persist/process/view of ever increasing data volumes The JavaEE stack is broadly split into 3 tiers - web, business and persistence. JSF (broadly, including expression-lang, JSTL, JSP and Servlets) is the technology of choice (per the specs) in the web tier. And JSF, to me, seems most vulnerable of not being able to raise up to the above mentioned two challenges. JSF does feel like the loose brick in the JavaEE stack. And it feel ever more so after spending some time with the Play Framework! Web Tier In JavaEE - The Loose Brick? This was a recent tweet by Peter Thomas - Now to a quick primer on frameworks in JavaEE\u0027s web-tier. I like to group the Web Tier of JavaEE applications into 3 groups per a broad grouping of library\u0027s goals. A quick note on each of these... Component Frameworks: Component frameworks like JSF are suited for parts of the application with lot of forms and CRUD operations. JSF is an aggregate of multiple technical pieces which include facelets, expression language, jstl, converters, listeners, validators etc. JSF helps build composable UI components with server side validation in a scalable way It abstracts away a lot of state information in its stack which is not good for building UI components that serve a lot of read only and voluminous data. Tasks like JSON transformation within JSF are not efficient at scale Now, rarely do programmers get completely satisfied with the component library within JSF. So they use the richer component frameworks (above and beyond JSF) like Apache Wicket and Tapestry. And for dynamic pages with lot of AJAX there are frameworks like RichFaces and PrimeFaces which provide features atop JSF Action Frameworks: For read-only and voluminous data handling atop servlets action frameworks are preferred which explicitly tie to the HTTP request/response cycle. Action frameworks typically implement the famous MVC pattern for clear separation of concerns. So applications tend to use frameworks like Struts, SpringMVC etc Standalone, Proprietary Frameworks: These are the ones that are unbelievably beautiful for quick-small projects and unbelievably ugly for large ones. Technologies like JSP, GWT, Dart et al. These are just pure evil from enterprise products perspective Play Framework With me having done quite a bit of programming in JSP and JSF, I found Play to be fresh breath of air. Web application programmers must spend some time reading and understanding this blog by Guillaume Bort on reasons behind the decision to not write yet another framework atop Java\u0027s HttpServlet. My experience with Play has been by building a lookalike for my blog with it (hosted on Heroku here). I have built my blog on NodeJS and RubyRails as well - and honestly, it took much lesser time to build it with Play. But more important is the question of should enterprise web-tiers be programmed with Play? Is Play up to the mark for projects of development scale and complexity? My answer is a thumping YES!! Let me list the specific features that I found especially useful and important - Scala templating with compile time type safety - I have found UI composition to be very intuitive and much simpler than JSF (JSF\u0027s composability really feels like a mess when compared with Play!) Crisp way to program with Futures and Async to handle action endpoints Websockets - Futures, Async - much better API for streaming data than the WebSocket Spec in JavaEE Stateless. Easy to use with Akka Marshalling/unmarshalling of JSON data without reflection which provides huge performance improvement Explicit, clean server side routing methodology (what a mess this is in JavaEE where programmers often mix annotations, xml and sometimes also bring in client-side routing unnecessarily) No server side sessions at all! Sessions in Play are all made available through cookies and HTTP headers. So no server side context to worry about Built-in build-time JavaScript compilation WebJars Less verbose Scala code. The joy of composable functional programming Hot deployment during development Netty underneath - performance not an issue Cloud deployment ready (Heroku, Cloudbees support it) This one is a excellent presentation (and code) by Yevginy Brikman of LinkedIn (LinkedIn uses Play! for multiple web apps in its stack). The title is apt - building web apps that are composable and streamable. More and more, enterprise applications have UI requirements of the kind described here. And building these using JSF/Java would be too complex a web project and IMHO not worth the trouble! HOWTO - Play2 on Wildfly to interop with JavaEE For very good reasons enterprise applications are generally hosted on application containers. And application containers mostly come built with a servlet container for web frontend. Now since Play2 is not Servlet based does it mean using it in enterprise applications straightaway get vetoed? Not necessarily. If engineers have a little chutzpah, the gap can be bridged. Here is how I was able to host my Play2 application on JBoss-Wildfly - The plugin to study and use for the task is the Play2War. Play2War builds a WAR out of a Play2 application. I was then able to deploy this WAR of my Play app on Wildfly and get it to work After following the usage/configuration instructions from the Play2War\u0027s GitHub page, the first thing to do is to configure the excludes. A number of JARs that get packaged using Play2War clash with JBoss\u0027s modules and thus need to be excluded. Here is a quick list of such JAR\u0027s from my project - Artifact GroupID Version In Play v2.2.1 Version In Wildfly v8.0.0-Final Newer Google Guava com.google.guava 14.0.1 16.0.1 Wildfly Jackson Core, Annotations and Databind com.fasterxml.jackson.core.jackson* v2.2.2 v2.3.0 Wildfly H2 Database com.h2database.h2* v1.3.172 v1.3.173 Wildfly Apache Commons Codec org.apache.commons.codec v1.6 v1.9 Wildfly Apache Commons IO org.apache.commons.io v1.3.2 v2.4 Wildfly Netty io.netty v3.7.0 v4.0.15 Wildfly Hibernate Commons Annotations org.hibernate v4.0.2 v4.0.4 Wildfly Hibernate Core, Entity Manager org.hibernate v4.2.3 v4.3.1 Wildfly Hibernate Validator org.hibernate v5.0.1 v5.0.3 Wildfly Javaassist org.javaassist v3.18.0 v3.18.1 Wildfly JBoss Logging org.jboss.logging v3.1.1 v3.1.4 Wildfly JBoss Transaction javax.transaction.api v1.0.0 v1.0.1 Wildfly Yaml org.yaml.snakeyaml v1.12 v1.13 Wildfly Antlr org.antlr v2.7.7 v2.7.7 Same dom4j org.dom4j v1.6.1 v1.6.1 Same Postgres org.postgres v9.1-901 v9.1-901 Same javax.validation v1.1.0 v1.1.0 Same Joda time org.joda.time v2.2 v1.6.2 Play Apache Commons Lang org.apache.commons.lang v3.1 v2.6 Play HttpCore org.apache.httpcomponents. v4.3.1 v4.2.1 Play HttpClient org.apache.httpcomponents. v4.3.2 v4.2.1 Play Hibernate JPA javax.persistence.api v1.0.1 v1.0.0 Play Asm asm.asm v4.1 v3.3.1 Play jcl-over-slf4j org.slf4j v1.7.5 v1.7.2 Play jul-to-slf4j org.jboss.logging v1.7.5 v1.0.1 Play slf4j-api org.slf4j v1.7.5 v1.7.2 Play Xerces org.apache.xerces v2.11 v2.9.1 Play One can see from the above table that versions of many artefacts is newer in Wildfly. I decided to use the newer Wildfly versions and exclude these from the WAR generated by Play2War. So included this filtering statement in my project\u0027s build.sbt file - Play2WarKeys.filteredArtifacts ++= Seq( (\"com.google.guava\", \"guava\"), (\"com.google.code.findbugs\", \"findbugs\"), (\"com.fasterxml.jackson.core\",\"jackson-annotations\"), (\"com.fasterxml.jackson.core\",\"jackson-core\"), (\"com.fasterxml.jackson.core\",\"jackson-databind\"), (\"com.fasterxml\",\"classmate\"), (\"commons-codec\",\"commons-codec\"), (\"commons-io\",\"commons-io\"), (\"org.hibernate\",\"hibernate-commons-annotations\"), (\"org.hibernate\",\"hibernate-core\"), (\"org.hibernate\",\"hibernate-entitymanager\"), (\"org.hibernate\",\"hibernate-validator\"), (\"org.hibernate.common\",\"hibernate-commons-annotations\"), (\"org.hibernate.javax.persistence\",\"hibernate-jpa-2.0-api\"), (\"javax.validation\",\"validation-api\"), (\"javax.persistence\",\"persistence-api\"), (\"javax.transaction\",\"transaction-api\"), (\"org.jboss.spec.javax.transaction\",\"jboss-transaction-api_1.1_spec\"), (\"org.jboss.logging\",\"jboss-logging\"), (\"org.jboss.logmanager\", \"log4j-jboss-logmanager\"), (\"org.springframework\",\"spring-beans\"), (\"org.springframework\",\"spring-context\"), (\"org.springframework\",\"spring-core\"), (\"postgresql\",\"postgresql\"), (\"org.javassist\",\"javassist\"), (\"org.yaml\",\"snakeyaml\"), (\"antlr\",\"antlr\"), (\"com.h2database\",\"h2\"), (\"dom4j\",\"dom4j\"), (\"tyrex\",\"tyrex\") //(\"com.jolbox\", \"bonecp\"), //(\"io.netty\",\"netty\"), ) Next thing to do is to use a jboss-deployment-structure.xml where one can specify the newer Wildfly modules of these artefacts to be used for the deployment. This deployment descriptor should be created in the following path in the Play2 project - app/ conf/ project/ war/ |--WEB-INF |--jboss-deployment-structure.xml I used the following setting in this xml - \u003cjboss-deployment-structure\u003e \u003cdeployment\u003e \u003cdependencies\u003e \u003cmodule name=\"com.google.guava\"/\u003e \u003cmodule name=\"com.fasterxml.jackson.core.jackson-annotations\"/\u003e \u003cmodule name=\"com.fasterxml.jackson.core.jackson-core\"/\u003e \u003cmodule name=\"com.fasterxml.jackson.core.jackson-databind\"/\u003e \u003c!--module name=\"com.h2database.h2\"/--\u003e \u003cmodule name=\"org.apache.commons.codec\"/\u003e \u003cmodule name=\"org.apache.commons.io\"/\u003e \u003c!--module name=\"io.netty\"/--\u003e \u003cmodule name=\"org.hibernate.commons-annotations\"/\u003e \u003cmodule name=\"org.hibernate\"/\u003e \u003cmodule name=\"org.javassist\"/\u003e \u003cmodule name=\"org.jboss.logging\"/\u003e \u003cmodule name=\"org.yaml.snakeyaml\"/\u003e \u003cmodule name=\"org.antlr\"/\u003e \u003cmodule name=\"org.dom4j\"/\u003e \u003cmodule name=\"org.postgres\"/\u003e \u003cmodule name=\"javax.validation.api\"/\u003e \u003cmodule name=\"javax.persistence.api\"/\u003e \u003cmodule name=\"javax.transaction.api\"/\u003e \u003cmodule name=\"org.glassfish.javaeetutorial.helloservice-api\"/\u003e \u003cmodule name=\"org.jboss.log4j.logmanager\"/\u003e \u003c/dependencies\u003e \u003c/deployment\u003e \u003c/jboss-deployment-structure\u003e I wanted to use Hibernate in my Play project. And I wanted to interact with EJB\u0027s and ActiveMQ messaging service in Wildfly. Firstly, this is very much possible. To use hibernate, one has to create the persistence.xml in the following structure - war |--WEB-INF |--classes |--META-INF |--persistence.xml Since the WAR will be deployed in Wildfly, make sure to read the persistence related docs from its wiki. 5. Logging - Play2War\u0027s GitHub wiki has a separate section for configuring logging with Wildfly. Make sure to read that. It basically asks for including this dependency in the build.sbt - \"com.github.play2war.ext\" %% \"redirect-playlogger\" % \"1.0.1\" With this configuration the WAR built by Play2War for my Play2 project was around 50MB in size. One has to use JNDI lookup to access the EJB\u0027s in container. The looked up EJB\u0027s can be cached in a Scala Object to avoid repeats One shortcoming that I realised while doing this work was that websockets will not work in this setup. Play2 uses Netty as is HTTP server and Wildfly uses Undertow. The websocket implementation in Wildfly (per the Websocket 1.0 spec) could be closely tied to to Undertow - but I have not read Wildfly\u0027s code to say so with certainty. Or maybe if one can make Wildfly to use Netty instead of Undertow as the underlying HTTP server then the websocket communication as provided by Play2 should become naturally available. Anyway, this is one shortcoming one has to put up if one takes this route The Final Word - I really feel Play2 will be a really good fit even in the JavaEE stack a few years down the line when some more bridges will appear around it to make it easily compatible to JavaEE application servers. It can be done even now as I found out. However one should take this plunge very cautiously (at least when using Wildfly). But to me, this is the right way to go... and the right way is generally never easy!"
}, {
title: "My Scala Application: Real-time Twitter Volume Grapher For Indian Elections 2014",
url: "https://bharath12345.github.io/posts/my-scala-application---twitter-volume-grapher-for-indian-election-personalities/",
date: "Thu 27 February 2014",
category: "posts",
content: "The Indian general elections are around the corner. For software engineers, this time around, there is data to play with and try to predict the outcome. Among all, the data from the social media giants - Twitter and Facebook - is easily accessible for analysis. Though social media may not be the right barometer to judge voter sentiments in a country as big and diverse as India, it is nonetheless a very tempting datasource for anyone curious. So couple of days ago I decided to do a small project - to simply chart the volume of tweets with strings like \"modi\", \"rahul\", \"kejri\" and \"india\" in it. I thought just a graph of volumes by itself will be interesting to see. So here I present the v1.0 of my Indian-general-elections-social-media-tracker! Table of Contents The Application 3 Seconds Tweet Aggregate Grapher 30 Seconds Tweet Aggregate Grapher 5 Minutes Tweet Aggregate Grapher 30 Minutes Tweet Aggregate Grapher 3 Hours Tweet Aggregate Grapher Design, Code and Logic WebSocket Addendum Final Note The Application The application has 5 different dashboards with a URL for each. Each of these 5 dashboard\u0027s have 4 graphs - one for each string (india/modi/rahul/kejri). Here is a quick summary of each dashboard - 3 Seconds Tweet Aggregate Grapher URL: http://bharathplays.herokuapp.com/twitter/elections/0 Dashboard Sample Image: Details: In this graph, a new data-point get created every 3-seconds. It appears as a dot on the chart. A mouse over of the dot shows the exact time and value of the data-point The title of the four grid\u0027s shows the string the graph is for. For example, the title of the graph showing the line chart for string rahul has the title Twitter Trends Graph for rahul x-axis is time. y-axis is number-of-tweets 30 Seconds Tweet Aggregate Grapher URL: http://bharathplays.herokuapp.com/twitter/elections/1 Dashboard Sample Image: Details: In this graph a new data-point gets created on the chart every 30 seconds Refer to the details of 3-seconds chart (above) for other info 5 Minutes Tweet Aggregate Grapher URL: http://bharathplays.herokuapp.com/twitter/elections/2 Dashboard Sample Image: Details: In this graph a new data-point gets created on the chart every 5 minutes (300 seconds) Refer to the details of 3-seconds chart (first one above) for other info 30 Minutes Tweet Aggregate Grapher URL: http://bharathplays.herokuapp.com/twitter/elections/3 Dashboard Sample Image: Details: In this graph a new data-point gets created on the chart every 30 minutes (1800 seconds) Refer to the details of 3-seconds chart (first one above) for other info 3 Hours Tweet Aggregate Grapher URL: http://bharathplays.herokuapp.com/twitter/elections/4 Dashboard Sample Image: [TBD] Details: In this graph a new data-point gets created on the chart every 3 hours (10800 seconds) Refer to the details of 3-seconds chart (first one above) for other info Design, Code and Logic The code is on GitHub here It uses Play! Framework\u0027s capabilities for all UI work which includes templates, URL-routing and WebSockets communication To bind to Twitter\u0027s stream firehose it uses Spray.IO All the code is in Scala. It does not use threads and uses the actor-method of concurrency using Akka Play\u0027s capability to do async websocket concurrent broadcast is leveraged It connects to filter API among the many Twitter\u0027s streaming APIs The part of code which connects to Twitter\u0027s streaming APIs and retrieves individual tweets is an extension of this example by Jan Machacek This application does NOT do sentiment analysis but is a brute-force volume grapher. So a higher count for a candidate does not imply positive popularity but just that his name is trending more. A keyword string, say Rahul, might appear anywhere in the tweet. That is, it could be anywhere in the text, or may be a part of the hashtag, or may even be part of a user handle that appears in the tweet While trying to find a match, I convert the whole tweet to lowercase and then use Java\u0027s String contains() to find a match. So the matching is case insensitive - the count for string Rahul includes those for rahul, raHul, rahuL etc. The matching string could also be a substring. So modi will match modified, moditva, amodi, etc. However I took a dump of over 1000 tweets to see how many of those captured did NOT belong to Indian elections - only to find that almost 80% of all tweets captured did concern these candidates and hence elections (you have to take my word on that!) One of the main motivations behind the design is to keep it lightweight. Twitter data is voluminous as can be seen by the counts. So the challenge is to serve a huge number of web-clients along with processing the incoming data. By removing the HTTP request-response loop for each update of the graph, a potentially big saving is achieved. Further, data for all 4 graphs in the dashboard is multiplexed over a single WebSocket channel. So every browser client has a single WebSocket channel to the server. This again is a big saving, since, if AJAX were used, then to update each of the 4 graphs would have required one client-to-server call each - which is very very expensive as the number of browser-clients increase Actors are a beautiful message-passing abstraction which make erstwhile tasks like managing threads and pools redundant. Please refer to to Akka documentation to know about this paradigm of programming The whole application is hosted on Heroku. Heroku allows hosting of Play 2.0 applications and also provides WebSocket support. So the cost of running this application to me is free! :-) The design flow essence is captured in this image - WebSocket Addendum The graph may not appear if you are a behind a proxy which does not tunnel WebSockets (like behind some of the office networks). Also if a firewall blocks WebSockets. But in case you run into any of these issues, then you could use your mobile device to see the dashboard. Here is the screenshot from my Android Samsung S2 on my home Wifi. I also checked that the graphing works on my Airtel 2G network fairly well too (the dots in the image below are some mess-up by the mobile screenshot tool) Final Note These graphs are just volumetric. I plan do some simple sentiment analysis next. However, by looking at the graphs and tweets behind them, it is heartening to see the order of popularity of each string. India is most popular among the four but next comes Modi and it is generally not far behind. Rahul seems to appear more than Kejri but both these strings trail a long way behind Modi. With me being a diehard Sri Narendra Modi supporter, these graphs and numbers certainly make me happy and hopefully bode well for the good times to come for my country :-)"
}, {
title: "Computing Laws, Theorems and Aphorisms",
url: "https://bharath12345.github.io/posts/computing-laws-theorems-and-aphorisms/",
date: "Sun 23 February 2014",
category: "posts",
content: "There are plenty of laws, theorems and aphorisms out there, apart from Moore\u0027s and Murphy\u0027s, that computer people could use. They sometimes come handy in meetings and emails! Just using them could, at times, mean standing on the shoulders of giants. I plan to keep this as an ongoing post... shall keep adding till it gets too long! Laws and Theorems Amdahl\u0027s Law: used to find the maximum expected improvement to an overall system when only part of the system is improved Metcalfe\u0027s Law: the value of a network is proportional to the square of the number of connected users of the system (n2) Conway\u0027s Law: Any piece of software reflects the organizational structure that produced it Grosch\u0027s Law: Computer performance increases as the square of the cost. If computer A costs twice as much as computer B, you should expect computer A to be four times as fast as computer B Little\u0027s Law: The long-term average number of customers in a stable system L is equal to the long-term average effective arrival rate, \u03bb, multiplied by the (Palm\u2011)average time a customer spends in the system, W; or expressed algebraically: L = \u03bbW. Gustafson\u0027s Law: Any sufficiently large problem can be efficiently parallelized Gilders Law: bandwidth grows at least three times faster than computer power - See more at: http://www.netlingo.com/word/gilders-law.php#sthash.zGRjAR00.dpuf Mooers Law: An information retrieval system will not be used if it is more painful for the user to have information than not to have it Parkinsons Law: Programs expand to fill all available memory Millers Law: All discussions of incremental updates to Bugzilla will eventually trend towards proposals for large scale redesigns or feature additions or replacements for Bugzilla Wirths Law: Software gets slower faster than hardware gets faster Brooks Law: Adding manpower to a late software project makes it later Greenspuns Law: Any sufficiently complicated C or Fortran program contains an ad hoc, informally specified, bug-ridden, slow implementation of half of Common Lisp Hintjens Law of Concurrency: e = mc2, where, e = effort, m = mass of code, c = colliding threads Aphorisms and Quotes Carnegie Mellon\u0027s Professor Richard Pattis\u0027s collection Yale\u0027s tribute to Alan Perlis - Epigrams in Computing Edsger Dijkstra\u0027s, Quotes David Wiseman\u0027s, Laws of Computing Fred Brooks\u0027s, Quotes Don Knuths, Quotes Tony Hoare\u0027s, Quotes Quotes listed on Paul Graham\u0027s site Murphy\u0027s Computer Laws"
}, {
title: "Streaming Twitter On Play + Spray Scala App",
url: "https://bharath12345.github.io/posts/streaming-twitter-on-play--spray-scala-app/",
date: "Wed 29 January 2014",
category: "posts",
content: "Working on Scalog I decided to write a quick program couple of days ago to see the super trending twitter hashtag - #ArnabVsRahul. I initially tried to follow the hashtag on TweetDeck but found that the arrival rate of new tweets simply did not allow me to read. Wanted a way to read the tweets page-by-page with each page reloading when I refresh. So wrote a program to do so - A twitter stream listener! And yesterday pushed the code to GitHub and this is a quick post on it. The code itself can be found here. Am building Scalog using Play2! framework in Scala. The blog hosted on Heroku can be accessed by this link - http://bharathplays.herokuapp.com/. The blog itself is just a Scala replica of my Jekyll and NodeJS blogs. Nothing special in the blog part. To listen to the tweets I use Spray\u0027s HTTP actor listeners. The Spray HTTP client connects to Twitter\u0027s stream service URL and waits for the chunked responses on a persistent connection. Every new tweet arrives as a chunk. I simply push the tweets to a ByteArrayStream and read it later in Play\u0027s streaming to send it to a requesting browser. My twitter streamer can be accessed by using this link stub - http://bharathblogs.herokuapp.com/twitter/go/ followed by the term to search. For example to read tweets with ArnabVsRahul one will have to use the URL - http://bharathplays.herokuapp.com/twitter/go/ArnabVsRahul The URL needs to refreshed once for the streaming to actually begin. The first access returns no data just as a check. And not necessarily all search strings will produce results - so its better to search for permanently high trending strings like \"india\" in case you see no data. This blog came in handy while trying to understand how to stream twitter data using Spray\u0027s HTTP client\u0027s capabilities."
}, {
title: "My Scala Projects In The Making",
url: "https://bharath12345.github.io/posts/scala-projects-in-the-making/",
date: "Wed 25 December 2013",
category: "posts",
content: "Over the last year I often heard my friends say the era of MOOC was truly upon us. It was only on taking up couple of Coursera courses did I realise it fully. They have been eye-opening many times over (and extremely rigorous). Would particularly recommend these two to anyone wanting to understand programming for the multicore, realtime, big-data world - Table of Contents GBridge ScaLog WebFlow Functional Programming Principles In Scala Principles Of Reactive Programming I have been trudging on and off with Scala for the latter half of 2013. Written many small programs to understand the core concepts. But doing these two courses have put me on very firm footing. The courses had me working on 12 solid assignments. And not a single one of these took me less than couple of days. The assignments cover a lot of ground which includes - Composing non-trivial higher order functions Mixing object oriented with functional Usage of Scala collections along with language built-in\u0027s like pattern-matching Testing with ScalaTest and ScalaCheck Using RxJava and Observables on non-trivial data-set Using Akka for Actor based concurrency Inspired by these assignments, I have been working on few of my own ideas. Specifically, three projects (all of which are still in their infancy). However as I head out with family for a vacation to usher in the new year, I thought of writing this as a post-it on my web-wall. One of the new year resolutions is to invest more time and energy into these projects. GBridge Project Goal: Data bridge between Ganglia (gmond) and ZeroMQ The Why Why Ganglia? Because it is (probably) the worlds most popular open-source data collection tool for large data centres Why ZeroMQ? Because it is (probably) the worlds most popular open-source data-bus for high volumes, with API in most programming languages Specifics Ganglia\u0027s gmond agent responds with cluster wide metric health on TCP in XML. GBridge polls this data GBridge can collect data from multiple clusters and any or random host within the cluster GBridge is optimised for minimum polling of gmond Each metric is published only once (and as a separate message) per polling cycle on ZeroMQ Each metric is published as JSON Use actor based concurrency and futures for polling multiple gmond nodes, parsing response and publishing on ZeroMQ Completely in Scala Graceful degradation on load. Support distribution, automatic recovery on errors and failover Going ahead support Collectd on data ingress side. Support writing to OpenTSDB on the data egress side Code Status Coded the data collection, parsing and publish to ZeroMQ Tested only for small loads Very little unit test code Yet to design for distribution, recovery and failover ScaLog Project Goal: Jekyll or PoetJS like markdown based static blogger in Scala The Why Scala lends better for server side coding. Learn and implement a full-stack web application in Scala For larger blogs, features like full text search can be much faster in Scala than Ruby or JavaScript Apart from human-readable HTML interface, also provide a machine-readable RESTful interface Option to store the markdown in flat files on the server side or source it from a RDBMS (PostgreSQL) Specifics ScaLog uses Spray for HTTP Server side (for both RESTful interface and HTML pages) ScaLog uses pegdown for markdown processing ScaLog uses Slick to read and write to RDBMS from Scala (ORM like) Cloud platforms for applications like Heroku are the main target for deployment Code Status The code for CRUD (post/get/put/delete) operations for the blog with RESTful URLs is complete up to proof-of-concept The code for CRUD at the database layer also done pegdown parsing of markdown complete Work needed to easily extend the URLs, support UI templating and much more WebFlow Project Goal: NetFlow like UDP export of ingress-egress data at Web-Servers The Why Gone are the C10K problems. We are now in the world of C10M and beyond. With such high volume of connections, to account for all the request-responses hitting the web-servers, it is not sufficient to use polling based (JMX like) or log based mechanisms. Dictionary export mechanisms are valid contenders when the volumes are so large All the good reasons of why Netflow/Sflow are wonderful methods for volume accounting (at high volumes) at switch/router level Specifics Plug-in for Jetty/Netty/Spray/Servlet containers Completely Scala. Akka actor based Code Status Design done. Yet to start coding"
}, {
title: "Why Learn Scala?",
url: "https://bharath12345.github.io/posts/why-learn-scala/",
date: "Wed 11 December 2013",
category: "posts",
content: "It was a long time ago that I read this masterpiece by the software engineering guru, Peter Norvig - Teach Yourself Programming In Ten Years. Peter advises wannabe programmers to learn at least half a dozen programming languages. Taking stock of myself earlier this year I realised having terribly missed out. In my decade long career, I have worked deeply in only 4 languages - C++, Java, JavaScript and Perl. And none of them strongly functional per Peter\u0027s advise (functional JavaScript hasn\u0027t come to me yet). This led me to pose two questions to myself - Table of Contents Scala Is A Great Mix Haskell and Scala Erlang and Scala C#, Java and Scala Scala Ecosystem Unlearning and Relearning Programming Why do I need another language? If I have to pick one, then, which one? The answer to the first question came to me rather quickly. At that time I was exploring what was new with JVM-7. And what was expected of Java and JVM-8. JVM-7 with its invoke dynamic and Java-8 with lambdas were clearly pointing the finger in a certain direction. I realised the designers of the JVM had started to embrace polyglot and functional programming. Digging deeper, the reasons for this move were easy to realise. Java\u0027s issues with type-safety, lack of immutable collections (in the JDK), rampant usage of shared mutability etc., were beginning to weigh heavy. The distributed, multicore, big-data computing, realtime world were making Java a little too verbose, justifying the need to look for alternatives. Surprisingly, the second question turned out to be the tougher of the two. The choice essentially was between Groovy, Scala and Clojure. I chose Scala. My pre-learning decision has got richly rewarded by what I have learnt after taking the plunge to Scala. Even as I continue to make the (sometimes) steep climb, this is a small, humble attempt to articulate the amazing things I have learnt. This write-up is a little too theoretical. For show-me-the-code types, I will soon write about a not-so-small 3-tier (DB \u003c=\u003e Biz \u003c=\u003e UI) application I have built entirely with Scala. In this post I allude to three broad reasons to Why Learn Scala - Scala is a great mix. It imbibes some of the best features of other popular, successful languages Scala ecosystem of frameworks/libraries is big, mature, created by some great people from academia/industry and very well documented and supported Some features of Scala that have made me a more thoughtful, better programmer Scala Is A Great Mix Scala brings many new original ideas with it for a Java programmer. New ideas like the implementation of persistent data-structures on the JVM, mixing object-oriented with functional, deconstructing objects with pattern matching and many refreshing ideas to lessen code verbosity. But given its academic roots, these were expected. Whats nice is that Scala also brings with itself some of the best features from at least 4 other popular, well designed programming languages - Haskell Erlang C# Java Now to quickly dig into what it brings from each of these. Haskell and Scala These are two interesting Hammer Principle surveys - Learning This Language Improved My Ability As A Programmer Learning This Language Significantly Changed How I Use Other Languages Haskell tops both these lists. Scala, at its very core, incorporates a lot of Haskell\u0027s good features into itself. Here is a short quick list - Type Inference First Class Functions Currying Lazy Evaluation List Comprehensions Immutability Algebraic Data Types Higher Order Types Monads One question that begs an answer - If Haskell is so good, then why not use Haskell itself? Why go for Scala? If thats an option then I would definitely encourage the reader to go ahead. But to those like me who love and trust the JVM, want interoperability with Java for its ecosystem of libraries and have a overarching/indefinite need for platform independency, Scala is a welcome choice. Erlang and Scala WhatsApp gets more messages than Twitter. WhatsApp is built on Erlang. And thats for a reason. To handle as many messages as WhatsApp does, you need a massively concurrent application. To run a massively concurrent application, you need a lot of parallel execution. And Actor based method for concurrency brought by Erlang is built for such a usecase. It is backed by solid theory and research. However Erlang takes a thread backed method for its Actor model\u0027s concurrency. And the Erlang process is very lightweight. Erlang applications commonly have tens-of thousands of threads or more. Now thread\u0027s are a scarce resource on commodity hardware (Erlang does not always run on commodity hardware). And in a distributed, horizontally scaling setup the constraint on number of threads can be quite strict. The developers of Scala have thus provided two types of Actors: thread-based and event based. Thread based actors execute in heavyweight OS threads. They never block each other, but they don\u2019t scale to more than a few thousand actors per VM. Event-based actors are simple objects. They are very lightweight, and, like Erlang processes, you can spawn millions of them on a modern commodity machine. The difference with Erlang processes is that within each OS thread, event based actors execute sequentially without preemptive scheduling. This makes it possible for an event-based actor to block its OS thread for a long period of time (perhaps indefinitely). If one is looking to engineer a highly concurrent application on the JVM, then Scala\u0027s Actor model provides a compelling option for designing such a system. I encourage the readers to listen to the many videos/talks by the architects of Scala Actor model (like Jonas Boner and Roland Kuhn) to get a more thorough understanding of the Actor model. Scala\u0027s Akka library with its Actor model is a great effort to bring the best of Erlang\u0027s proven concurrency model to the JVM engineers. C#, Java and Scala Scala has taken a lot of good things from C# and Java, especially in the syntax area. The syntax seems to have been designed especially keeping the Java programmers in mind, all the while trying to reduce the verbosity. One of the very interesting features that seems to have been inspired by C# is Implicits. They provide a means to extend libraries, help in type conversion etc. Scala Ecosystem Scala has had its set of woes in this area. There has been quite some furore over backward compatibility of Scala\u0027s native libraries and other frameworks over the last few releases 2.7 \u003e 2.8 \u003e 2.9 \u003e 2.10 (present). The Actors model has been written multiple times over - once as native scala.actors, once as part of the Lift library, and finally as part of Akka. However, having started coding with Scala many months ago and having worked on the latest releases of many of these libraries I have felt them being no different than those that exist in the world of Java in documentation, community backup etc. One great joy is actually the existence of many options in every area of the language, which I illustrate in the list below. Concurrency, Event Management, ESB a. Akka - actor based concurrency model b. Eventsourced - persistence, recovery, redelivery of messages Data structures a. Scalaz - data Structures for functional programming b. RxJava - composing asynchronous and event-based programs using observable sequences (not written in Scala but, probably, better used with Scala than Java from code hygiene POV!) Build and Testing a. ScalaCheck - testing framework with probably no Java equivalent (at least that I know of) b. SBT - more concise than Maven. No XML crap - build instructions as Scala DSL Object Relational Mapping - a. Slick - Database access Distributed Big Data Tasks a. Finagle - Fault tolerant, protocol agnostic RPC system b. Scalding - MapReduce for Scala c. SummingBird - Streaming, continuous, real-time MapReduce on top of Scalding or Storm Web Development a. Lift b. Play! c. Spray.IO These are but just a few of the popular libraries in some of the more frequently programmed areas. There are many more options for a interested programmer in each area. For example, the number of web development frameworks in native Scala number more than 10. And then there are libraries in other areas like machine-learning etc. Unlearning and Relearning Programming For those coming from Java, with no functional programming background, Scala can be a steep learning curve. But it is well worth the effort. To me, apart from exposure to many concepts totally new, Scala has helped in getting more firmly grounded in the fundamentals of structure and interpretation of computer programs. It has helped me realise the many things I need to unlearn to become a better programmer! If a passing reader finds this claim interesting, here is a quick list of things I feel better programming at now... Immutability: tradeoffs in using the C/Java style innocuous looking for loop; Utility (and at times necessity) of immutable collections (which do not exist in Oracle\u0027s Java JDK) Type safety: strengths of Java/JVM style strict typing; Problems in Java\u0027s type safety offering Inheritance: a better understanding of covariance and contra-variance Rethinking code verbosity by composing higher order functions, partial functions etc (lesser code often translates to fewer bugs) A better way to alleviate null-checks using Options Dependency Injection without annotations or XMLs Things can be better than using static classes, methods, variables Closures and Mixin\u0027s possible on JVM too (until now, I had thought of these only from the JavaScript perspective) Using Map when I needed Tuple was not exactly a bright idea I can do so much more when I can write code that my build system understands... looking for Maven plugins need not be a way of life... ... and I can go on and on!!"
}, {
title: "Programming Is Hard To Manage",
url: "https://bharath12345.github.io/posts/programming-is-hard-to-manage/",
date: "Tue 26 November 2013",
category: "posts",
content: "Couple of recent incidents triggered me to write this one. Few weeks ago, I met an old friend. A fellow software industry man. But unlike me, a people manager. As we shared our experiences in software development, my friend picked on my recently acquired MBA. Give me something to read, my friend demanded. I promised him this blog. Table of Contents Understanding Software Development Group Level Individual level Why Study? A Quick Roundup... 1.The Mythical Man Month 2. Adrenaline Junkies and Template Zombies 3. The Inmates Are Running The Asylum 4. Hackers And Painters 5. Dreaming In Code 6. Beautiful Code 7. The Productive Programmer The second incident. A month ago I logged into my (almost) discarded yahoo-mail. And only valuable thing in that old mailbox is a folder with few musings from my early years as a software engineer. I opened the folder out of curiosity. One of the notes was titled Bewilderment. It was a list of processes, decisions and people\u0027s actions that were totally counter-intuitive to me. At the end of the piece I had advised myself to study psychology to understand things! Understanding Software Development Over the years I have searched, read and re-read books which could broaden my understanding of this wonderful enterprise that we call software development. I broadly categorise these books into two groups: Those explaining team behaviour, challenges Those that throw light on individual behaviour and advise improvement. And here is a short list of titles that I would highly recommend in these two categories - Group Level The Mythical Man Month by Dr. Fredrick Brooks Jr Adrenaline Junkies and Template Zombies by Tom DeMarco, Tim Lister et al The Inmates Are Running The Asylum by Alan Cooper Individual level Hackers and Painters by Paul Graham Dreaming In Code by Scott Rosenberg Beautiful Code articles by Brian Kerninghan, Charles Petzold, Douglas Crockford, Jeffrey Dean, Sanjay Ghemawat and many more super programmers The Productive Programmer by Neal Ford Why Study? When it comes to studying about project management, programmers and managers alike, do not necessarily get excited about reading books. Among the arguments I have heard include... Busy programmers and managers have enough on their hands to do... why add more? Don\u0027t we all learn by doing things? This can be learnt only by doing Technology has changed (whatever that means!) Software is an odd-ball industry - too new for theoretical dissection Software is too fast-paced and full of change for a scientific analysis Better to spend time on technical books from career perspective (whatever that means!) Deep down, I believe two things are absolutely critical for furthering of human cause - Books and Institutions. Neither should be undermined at the cost of other. And both are mutually dependent. A purposeful life would be one spent in either/both these endeavours. To the arguments against studying of these books, all I can do is to offer a humble suggestion - as to what I have essentially learnt from them. These books, most importantly, have helped me to articulate the difficult situations I have found myself in during software projects. Both to myself and to others. How many of us can really explain our office scenario at home? To our friends in 5 minutes? But it is the self-articulation that is probably far more important. As project-people we often sense a pattern when things are going wrong (or right). The gut feeling. But it is difficult to understand why our gut says what it does. Let me give an example from my career. My first two jobs had been at large companies with thousands of employees. Each day was a routine - an hour\u0027s drive to office, clear-cut tasks, well funded projects and large teams. My contributions often felt small and inconsequential. But it was not so also. My managers pointed this out often. So what was it that sometimes made me uneasy? The phrase that articulates that feeling most accurately is \u0027Template Zombie\u0027. A Quick Roundup... There is absolutely no way to give a quick and dirty summary of any of these books. I hold each one very highly and dearly. Worth reading multiple times. So all I do here is to share when you might want to read each one. 1.The Mythical Man Month This is a brilliant book for anyone aspiring for a lifelong career in software (like me!). Fred Brooks charts the complete territory - from programming languages to organization to design-think. And provides superbly constructed scientific arguments and thesis for all propositions. My bewilderments of workings in a big organisation were considerably answered in this book. Sample the Brooks law: Adding manpower to a late software project makes it later. 2. Adrenaline Junkies and Template Zombies Are you making a switch from a big company to a startup? Or vice versa? From a big team to a small one? Or vice versa? If so, reading this book is highly advised. 3. The Inmates Are Running The Asylum Design issues? Conflicts at workplace? Politics? You will find some delightful answers here. 4. Hackers And Painters Long long ago, in my first year at work, a close friend who was a excellent mentor and a superb programmer told me something that I will never forget. Mimic\u0027ing Amitabh Bachchan he said \u0027duniya mein sirf doh tarah ke log hote hain... ek jo programming kar paate hain... our dusre woh jo programming nahin kar paate hain...\u0027. This is a superb book if you feel like dwelling into that one! 5. Dreaming In Code This delightful book is every bit worth carrying to a vacation. 6. Beautiful Code An ideal present to a promising programmer. Such fascinating projects and such industrious engineers. Inspirational. 7. The Productive Programmer Programming fast is a real skill. Programming productively and fast is an even greater skill. This is a nice self help book for all programmers who aspire to do that."
}, {
title: "Concurrency on the JVM",
url: "https://bharath12345.github.io/posts/concurrency-on-the-jvm/",
date: "Thu 14 November 2013",
category: "posts",
content: "Over the last few months I amused myself with an interesting pursuit. I spoke to a large number of people on the aspect of concurrency. I spoke to ex-colleagues. I spoke to engineers, architects at hackathons/meetups. And I interviewed a large number of senior engineers for a job at my company. I spoke to them about building a highly-concurrent, high-volume, real-time data-aggregation engine. Gave examples of easy, textbookish projects to drive home the requirements. Like a stock trading platform with 1000s of users, 1000s of stocks and 100s of stock-exchanges. Or a IPL ticket bidding site with 1000s of users, many seating categories, many venues etc. And this small article is about my perspectives at the end of it. Programming Concurrency... Building Frustrations Globally The engineering challenge involved in building high volume concurrent applications should not be underestimated. Some veterans I spoke to suggested that such problems and multiple solution approaches have existed for ages. I shamelessly, often at the cost of personal repute (of embarrassing someone) counter-questioned to tell me the various approaches. Essentially, all, boiled down to just two - Handling all data in a single thread due to the fear of complexity: On hearing this, I would often say do u really intend to exercise just one core of your upcoming quad-core, 64-processor server? Hearing this, they would move on to the 2nd option, which is... Build the beast with thread-pools, locks and synchronisation blocks: Lets use java.util.concurrent and laugh our way to the ATM, some suggested... and I would say watch out... you could end up in a jail, a mental asylum or a bankruptcy proceeding before reaching that ATM! While interviewing candidates for the job I have been on the lookout for engineers who have few perspectives other than just these two. A small fraction, when pushed, uttered something to the extent of event-based, SEDA approaches. But when questioned further, these event based approaches often endup folding in the realm of one of the above two. I find this rather sad. Let me clarify, I do not think that the above two approaches are fundamentally wrong. But knowing just two is clearly insufficient. I also ran into an interesting few who had dabbled with NodeJS and were clearly smitten. Smitten with the action and enthusiasm of engineers in that world rather than with any hard technological breakthroughs. In this article I will not talk about NodeJS. I have briefly written about it earlier on this blog here. I dearly hope that those who suggested NodeJS did so out of their own naivet\u00e9 and my hard nudges... and do not truly believe in the NodeJS performance hyperbole! Not Java... But the JVM! Java engineers need to graduate to becoming JVM engineers. They need to internalise forever the fact that JVM has been a far bigger innovation than Java as a language. And when Java engineers will do that they will realise that the so-called competitors like NodeJS are non-starters. One book that I highly advise to those who wish to make this graduation is the super revealing \u0027Programming Concurrency on the JVM\u0027 by Dr. Venkat Subramaniam. I read this book sometime ago. Back then, I was just beginning to find my reasons to learn Scala/Clojure. Reading it filled me with the energy to know more about the JVM internals and the new world of concurrency programming. Nietzche once said \"He who has a why to live can bear almost any how\". As a programmer, my why has been concurrency, multi-core, big-data and high-performance. And Dr. Venkat gives a few how\u0027s! Broadly, the book is divided into three architectural approaches that one could take to build a concurrent application on the JVM, which are - java.util.concurrent with thread-pools, synchronization blocks, locks, fork-join etc Software Transactional Memory - being made popular by Clojure. This StackOverflow thread on real world adoption is instructive. And this paper gives the reader an excellent understanding from both Hardware and Software Transactional Memory perspectives Actor Based Concurrency - being made popular by Scala and Akka. One just needs to visit the Typesafe website to know about the rapid adoption of this model Programming Concurrency on the JVM - Dr. Venkat Subramaniam Dr. Venkat drives home the following points to those who wish to develop concurrent applications - The three options available to the designers Shared mutability... the pure evil option Isolated mutability Pure immutability Introduction to the world of persistent data structures. Here is a mind blowing thread on recent innovations in functional data structures. Many are persistent. A quick intro to the world of modern JDK concurrency mechanisms. It is in this part of the book that I found a certain treatment of the subject of concurrency that I was sorely missing. Applications has multiple \u0027needs\u0027 that drive the concurrency requirement. Broadly these needs can be divided into three parts - High Network I/O Intensity (large network I/O requirements lead to concurrency designs) High Disk I/O Intensity Large compute problems which can be broken down to smaller pieces... divide and conquer... which leads to concurrent designs The numerous code examples in the book showcase two things - Increasing complexity of code in certain approaches The time-to-compute or efficiency differential by comparing the different approaches I would love to quote a few sentences from the STM chapter of the book... (1) We\u2019ve been led down the path of the imperative style of programming with mutable state for so long that it\u2019s very hard to see alternatives to synchronization, but there are. (2) OOP didn\u2019t quite turn out to be what Alan Kay had in mind when he coined the term. His vision was primarily message passing, and he wanted to get rid of data. Somewhere along the way, OO languages started down the path of data hiding through Abstract Data Types (ADTs), binding data with procedure or combining state and behavior. (3) In the real world, the state does not change; the identity does But STM is not a silver bullet to all concurrency applications. The author clearly says - STM is suitable for concurrent reads and infrequent to reasonably frequent write collisions to the same data. Actors are a pure message passing model. Each actor has a built-in message queue. Actor library allows multiple actors to send messages concurrently. The senders are nonblocking by default. Although multiple actors may be active at any time, only one thread is active in an actor at any instance. The main drawback of this model, in my opinion, is that message passing systems with proper interleaving is not an easy art - it requires deep design thinking. Epilogue Programming concurrency is hard. Any which way. When confronted with such requirements and problems, the vocabulary used by the engineers and architects to make good design choice and find right hires is extremely critical. I was recently following some discussions on Y! Combinator on the suitability of Scala/Clojure to develop enterprise applications using such new ideas for concurrency and many other things. And I found this comment, though a little harshly worded, as food for thought... The only thing enterprise business managers want is a language that can dumb down the art of programming to a level a programmers can be managed like assembly line workers. And that is what Java does exactly, an IDE that can make a novice and expert work at the same levels of productivity, extremely verbose code that gives an illusion of people building something big(even if its down right trivial). And most importantly programming effort can be accounted like a couple of least important replaceable folks down the hierarchy doing some assembling reusable units of material. Change this scenario, a good technology with merit makes programmers very important and makes managers look like desk clerks. Enterprise Managers don\u0027t care a least about type systems, lambdas, or traits or whatever. Most managers don\u0027t have a remote clue what those things are. Can the technology enable them to manage herds of programmers dumbed down enough to be managed like sheep? That is all they care."
}, {
title: "Folding it the right way",
url: "https://bharath12345.github.io/posts/folding-it-the-right-way/",
date: "Thu 31 October 2013",
category: "posts",
content: "I have been dabbling with Scala for a few months now. And one of the things that strikes me about functional programming is the beauty of the finished code. It sometimes gives me a feeling of being just the right mix of art and science! Gone are the dirty null/empty checking if statements. Gone are the dumb variety for/while loops. I haven\u0027t progressed far enough to be using actors but the very thought that variables in my program are not getting mutated while being thrashed across many cores and caches is enough to sometimes give me a high! But this blog is about something else. I just wanted to write about a small piece of code as an example of beauty, expressiveness and correctness of the functional style. I ran into this problem as part of my Scala Coursera course. First of all neither is the problem nor the solution mine. After writing a lot of imperative style ugly code to solve the problem I got fed up with myself and searched for a better way to do it. A more functional way. Here I just explain the problem and the solution. Firstly, the problem - Write a method to compute all the subsets of list of tuples. For example, given this tuple list List((\u0027a\u0027, 2), (\u0027b\u0027, 2)) the list of all subsets is: List( List(), List((\u0027a\u0027, 1)), List((\u0027a\u0027, 2)), List((\u0027b\u0027, 1)), List((\u0027a\u0027, 1), (\u0027b\u0027, 1)), List((\u0027a\u0027, 2), (\u0027b\u0027, 1)), List((\u0027b\u0027, 2)), List((\u0027a\u0027, 1), (\u0027b\u0027, 2)), List((\u0027a\u0027, 2), (\u0027b\u0027, 2)) ) Now I request you to please try solving this. It really is not very tough. Crack up your IDE and try in the imperative style of programming. Use whatever data-structures and algorithms. Yes, you will be able to crack it, after maybe some pain. But after you are done, give that code you wrote a hard stare. And a hard stare to the functional equivalent below. It is inevitable that you will realise, how fat our coding has grown on the unhealthy monotonous diet of pure imperative thinking all the time... 1. def combinations(occurrences: List[(Char, Int)]): List[List[(Char, Int)]] = 2. (occurrences foldRight List[List[(Char, Int)]](Nil)) 3. { case ((ch,tm), acc) =\u003e 4. { 5. acc ++ ( for { 6. comb So, there you have it. About 10 lines of thin code in all its glory. Now let me get under the skin of it to show what really is happening here... First of all, Scala has the concept of tuples that helps in having cleaner data structures for problems like these. Secondly, this code (foldRight) uses currying. If you don\u0027t know about currying, that is okay. It just means that all items in a data-structure are applied on a passed function. The function passed in this case is the one that starts with the curly brace on line#3. Thirdly, this piece of code uses multiple anonymous functions. Let me describe the execution flow step-by-step - The Scala foldRight method applies the passed method on data items in the reverse. So, on passing the list List((\u0027a\u0027, 2), (\u0027b\u0027, 2)), the first data item to be used for processing is (\u0027b\u0027, 2) foldRight takes an initial accumulator. In this case it is the Nil passed in line#2 So the initial value of parameters on line#3 are: ch = \u0027b\u0027 tm = 2 acc = Nil The for expression yields two tuples on being executed. The two tuples are (\u0027b\u0027, 1) and (\u0027b\u0027, 2). These two are appended to the Nil list and we have the result after the first pass of data structure as List(List(), List((b,1)), List((b,2))) In the second pass, the data-item from our occurrences list being processed is (\u0027a\u0027, 2). So the value of parameters this time on line#3 are: ch = \u0027a\u0027 tm = 2 acc = List(List(), List((b,1)), List((b,2))) Its in this second pass, that things really get interesting. The for statement yields all 4 remaining subsets, which are List(List((a,1)), List((a,2)), List((a,1), (b,1)), List((a,2), (b,1)), List((a,1), (b,2)), List((a,2), (b,2))) in this single pass! It will take a little bit of mind bending to understand how this happens... but its definitely worth the effort... just reading it made my day! Now coming to the great thing about this program - performance! Compare the number of passes on the data-structures that this piece of code has taken to the imperative code. The first thing to really digest is that this not some algorithm trickery. Now that you know the algorithm in functional programming style, try doing it the imperative style. Firstly, the code will not look this concise. Secondly, most of us will simply not be able to do it right. But the best part - the input data structure is immutable and so are all intermediate ones. The benefit? This piece of code will not fail if some other thread of execution changes the input variable when this piece of coding is executing! (that is List[(Char, Int)] occurrences data structure). Why? Because it is impossible to change the input data structure! It is born immutable. It will live immutable. And it will die immutable. Nothing ever can come in its way! Unfortunately, this algorithm implementation is such that output from the first iteration gets fed in the second iteration. So two parallel cores cannot be running it simultaneously. However, with all intermediate data structures being 100% immutable, it is not difficult to imagine other problems/algorithms which do not have this constraint thus using up more cores at once and built for distribution and performance! I hope you share my wow(!) about this piece of code and functional programming in this case."
}, {
title: "Algorithms Course-1 With Prof Sidgewick on Coursera",
url: "https://bharath12345.github.io/posts/algorithms-course-i-with-prof-sidgewick-on-coursera/",
date: "Tue 08 October 2013",
category: "posts",
content: "I did my engineering in electronics and communication systems. But my very first job was in software development. Having not studied theory of computing, databases, compilers and even algorithms/data-strutures as part of my graduation I went on to self-study these. However, deep down, have felt the need for more structured education. I don\u0027t remember when I first heard of Coursera. But my early tryst with online education had been dismal (at my previous employer they would make me go through online training\u0027s mandatorily\u2026 and those used to absolutely suck). So even as I kept track of the courses offered on Coursera since early this year, I did not enroll. A couple of months ago I decided to give it a serious try\u2026 and I enrolled myself for the first course on Algorithms by Professor Robert Sidgewick. I finished my final exam on the course yesterday. And it feels great to be done with all tests and programming assignments. The course was structured in the undergraduate training way\u2026 which is exactly what I wanted. The learning has been enormous. Anyone who has spent a decade in software development like me would know MergeSort and QuickSort anyway\u2026 but the scientific treatment of the subject both in the videos and the textbook gives me a sense of closure. And by the way, I think algorithms and data-structures is a field which a practicing engineer has to seriously brush-up, once in every few years, just to keep up\u2026 Table of Contents General questions on the study of Algorithms Sorting Searching Like the few book reviews that I have done before on my blog, this is a quick refresher for myself on all that I have studied. Its not complete or thorough. And I hope there are no factual errors. So if a passing reader finds anything here useful, it makes me glad\u2026 General questions on the study of Algorithms Why study Algorithms and Data Structures? Why are they important? Computers, no matter how powerful, have space and time constraints. Poorly thought through implementations for computing problems can take years to compute even when computing resources are massive. For example - Why learn, re-learn algorithms? The primary reason to do so is that we are faced, all too often, with completely new computing environments (hardware and software) with new features that old implementations may not use to best advantage As a professional, it is a crime to use tools without their thorough understanding. So as Java programmers, to use HashMap and TreeSet without the knowledge of the underlying resource utilisation and performance impact is\u2026 Intellectually satisfying How do you measure how long will your program take to run? Repeated runs in thousands to find the mean and standard-deviation Run it for different quantum\u0027s of input data \u0027N\u0027 - find mean and std-dev for different N after thousands of runs Find a relationship between N and time-taken by plotting on a graph - is the graph linear? hyperbolic? logarithmic? Why measure how long programs take to run? Knowing the order of growth of the running time of an algorithm provides precisely the information that you need to understand limitations on the size of the problems that you can solve. Developing such understanding is the most important reason to study performance. What are big-O and big-Omega notations? Why are they needed? big-O is for the upper bound. big-Omega is for the lower bound. (there is also a big-Theta that is a little more involved idea). The running times of a great many programs depend only on a small subset of their instructions - so when running time of algorithms are proportional to squares(N2) or cubes(N3) or exponentials(2N) of input data counts (N), we know that these algorithms will not scale for large inputs (N). Only when running times of algorithms are proportional to linear(N), linearithmic(NlogN) or logarithmic(logN) or constant can they be expected to scale for large inputs. Then why is big-O not useful for predicting performance or for comparing algorithms? The primary reason is that it describes only an upper bound on the running time. Actual performance might be much better. The running time of an algorithm might be both O(N2) and ~ a N log N. As a result, it cannot be used to justify tests like our doubling ratio test (see Proposition C on page 193). What is the base of log when we are talking about complexities of algorithms? Why? Base-2. In terms of Big-O, the base doesn\u0027t matter because the change of base formula implies that it\u0027s only a constant factor difference. That is logarithms from base 10 or base 2 or base e can be exchanged (transformed) to any other base with the addition of a constant. The critical thing to understand is that logarithms (of any base) increase slowly with the increase of N. However, observe this table of log values\u2026 (with respect to complexity of algorithms, the value of N can never be fractional or negative anyway...) What does Java Arrays.sort() implement? Mergesort till Java6. TimSort from Java7 onwards... Order of growth graph? Here is the log-log plot (both size(N) x-axis and time(T) y-axis are in logarithms) Example of each - constant time - assignment statement logarithmic - binary search linear - find the maximum value linearithmic - merge sort quadratic - double for/while loop cubic - triple for/while loop exponential - brute force search Why develop faster algorithms? Faster algorithms help us to address larger problems Why study memory utilisation of Java programs? If you have 1GB of memory on your computer (1 billion bytes), you cannot fit more than about 32 mil- lion int values or 16 million double values in memory at any one time. How many bytes in memory are required to store a reference to a Java Object? 4 bytes on a 32 bit system. 8 bytes on a 64 bit system Sorting In Java what do you have to do to be able to sort an array of a custom object type? The class of the object should implement Comparable Performance of selection sort N2/2 compares and N exchanges About selection sort it takes about as long to run selection sort for an array that is already in order or for an array with all keys equal as it does for a randomly-ordered array! Data movement is minimal Performance of insertion sort Insertion sort uses N2/4 compares and N2/4 exchanges to sort a randomly ordered array of length N with distinct keys, on the average. The worst case is N2/2 compares and N2/2 exchanges and the best case is N-1 compares and 0 exchanges. Performance of merge sort Top-down and bottom-up mergesort uses between 1\u20442NlogN and NlogN compares to sort any array of length N. Top-down mergesort uses at most 6NlogN array accesses to sort an array of length N. The primary drawback of mergesort is that it requires extra space proportional to N Upper limits to compare based sorting algorithms Compare-based algorithms that make their decisions about items only on the basis of comparing keys. A compare-based algorithm can do an arbitrary amount of computation between compares, but cannot get any information about a key except by comparing it with another one. No compare-based sorting algorithm can guarantee to sort N items with fewer than log(N!) ~ NlogN compares. Performance of quick sort The quicksort algorithm\u2019s desirable features are that it is in-place (uses only a small auxiliary stack) and uses ~ 2NlogN compares and one-sixth that many ex- changes on the average to sort an array of length N with distinct keys. Whats the problem statement for priority queues? insert and remove the maximum have to be fast. Provide fast insert and access to a subset of data points among potentially infinite number of data points. Binary heaps provide the data structure to implement logarithmic time insert and remove-max. (Java natively provides a PriorityQueue implementation as part of collections) What is a binary heap? In a binary heap, the keys are stored in an array such that each key is guaranteed to be larger than (or equal to) the keys at two other specific positions. In turn, each of those keys must be larger than (or equal to) two additional keys, and so forth. The largest key in a heap-ordered binary tree is found at the root. Generally binary heaps are stored sequentially within an array by putting the nodes in level order, with the root at position 1, its children at positions 2 and 3, their children in positions 4, 5, 6, and 7, and so on. Performance of Priorty queues with binary heaps? In an N-key priority queue, the heap algorithms require no more than 1 + log N compares for insert and no more than 2logN compares for remove the maximum. Performance of heap sort Heapsort is significant because it is the only method that is optimal (within a constant factor) in its use of both time and space\u2014it is guaranteed to use ~2NlogN compares and constant extra space in the worst case. When space is very tight (for example, in an embedded system or on a low-cost mobile device) it is popular because it can be implemented with just a few dozen lines (even in machine code) while still providing optimal performance. However, it is rarely used in typical applications on modern systems because it has poor cache (processor cache) performance: array entries are rarely compared with nearby array entries, so the number of cache misses is far higher than for quicksort, mergesort, and even shellsort, where most compares are with nearby entries. Application of PriorityQueue TopN by some particular order of prioritization. If you are looking for the top ten entries among a billion items, do you really want to sort a billion-entry array? With a priority queue, you can do it with a ten-entry priority queue. When to use Java Comparable and when the Comparator? Implementing Comparable means implementing the compareTo method which is supposed to show the natural ordering in a set of objects of a certain type. There are many applications where we want to use differ- ent orders for the objects that we are sorting, depending on the situation. The Java Comparator interface allows us to build multiple orders within a single class. It has a single public method compare() that compares two objects. If we have a data type that implements this interface, we can pass a Comparator to sort(). In typical applications,items have multiple instance variables that might need to serve as sort keys. The Comparator mechanism is precisely what we need to allow this flexibility. Can comparators be used with PriorityQueues as well? Yes. See http://stackoverflow.com/questions/683041/java-how-do-i-use-a-priorityqueue When is a sorting method stable? If it preserves the relative order of equal keys in the array. Read the beautiful example on page 341 Which sorting algorithms are stable? Only insertion sort and merge sort are stable Searching Popular data-structures to hold symbol tables? Binary search trees, Red black trees and hash tables Performance of brute force sequential search (unordered arrays or linked lists) Search misses and insertions in an (unordered) linked-list symbol table having N key-value pairs both require N compares, and search hits N compares in the worst case. In particular, inserting N distinct keys into an initially empty linked-list symbol table uses ~N2/2 compares. One useful measure is to compute the total cost of searching for all of the keys in the table, divided by N - for sequential search this is N/2 Performance of binary search for symbol tables Binary search in an ordered array with N keys uses no more than logN + 1 compares for a search (successful or unsuccessful). But inserting a new key into an or- dered array of size N uses ~ 2N array accesses in the worst case, so inserting N keys into an initially empty table uses ~ N2 array accesses in the worst case. Performance of BST (binary search trees) for symbol tables Search hits in a BST built from N random keys require ~ 1.39logN compares, on the average. Insertions and search misses in a BST built from N random keys require ~ 1.39logN compares, on the average. Shortcoming of BST The running times of algorithms on binary search trees depend on the shapes of the trees, which, in turn, CS depend on the order in which keys are inserted. In the best case, a tree with N nodes could be perfectly balanced, with ~ logN nodes between the root and each null link. In the worst case there could be N nodes on the search path. So to optimise, keys are inserted in random by purpose to tilt towards the average case search performance Performance of 2-3 Search trees Search and insert operations in a 2-3 tree with N keys are guaran- teed to visit at most logN nodes. Performance of Red-Black BST Why use hashing? To be able to handle more complicated keys (custom objects, strings) What are the two popular ways to hash collision resolution? Separate chaining - bag of items for each hash key Linear probing - also known as Open addressing In Java, which is faster - HashSet or TreeSet? What is the usecase for each? HashSet Almost constant time performance due to the usage hash functions for the basic operations (add, remove, contains and size) does not guarantee that the order of elements will remain constant over time TreeSet guarantees log(n) time cost for the basic operations (add, remove and contains) guarantees that elements of set will be sorted (ascending, natural, or the one specified by you via it\u0027s constructor) offers a few handy methods to deal with the ordered set like first(), last(), headSet(), and tailSet() etc Internally uses a implementation close to Red-Black Trees Common features Being sets, both offer duplicate-free collection of elements It is generally faster to add elements to the HashSet and then convert the collection to a TreeSet for a duplicate-free sorted traversal None of these implementation are synchronized Java also has a LinkedHashSet - look it up to know about it more In Java, which is faster - HashMap or TreeMap? What is the usecase for each? On similar lines as HashSet vs. TreeSet. HashMap implements a hash function (uses hashCode and equals) on the keys. TreeMap uses Red-Black trees internally. HashMap is more time efficient. TreeMap is more space efficient. TreeMap has an internal ordering of keys which can also be specified using a construction time comparator. HashMap\u0027s have no internal ordering. One should use HashMap for fast lookup and TreeMap for sorted iteration. HashMap allows null keys and values. HashMap doesn\u0027t allow duplicate entries. HashMap iteration performance depends on initial capacity and load factor that can be passed during construction - TreeMap offers no such iteration performance tunables. Why is order not maintained in Hash* collection implementations? The whole point of hashing is to uniformly disperse the keys, so any order in the keys is lost when hashing. In Java, what is the rule with implementing hashCode? If hashCode\u0027s are equal then objects may or may not be equal If hashCode\u0027s are not-equal the objects are not equal In Java, what kind of collision resolution scheme is implemented for HashMap and Hashtable? Both use separate chaining. Google guava libraries have some implementations for linear probing Space usage of BST vs. separate chaining vs. linear probing? Performance of hashing vis-a-vis trees? What would be a good data-structure to use for counting all people within a income range (say 10k to 20k) in an age group (say 25 to 35 years) among a million people? Kd-trees because of the easy 2-dimensional split (at least one should say some kind of tree). Though Kd-trees can be used for n-dimensional searches very well too"
}, {
title: "Weekend well spent with JSFoo \u0026 NodeJS",
url: "https://bharath12345.github.io/posts/weekend-well-spent-with-jsfoo-nodejs/",
date: "Tue 24 September 2013",
category: "posts",
content: "Had been to the wonderful JavaScript conference JSFoo last week. The tremendous enthusiasm in the web development community for server side JavaScript was all at display. Personally, I have spent a lot of time coding visualizations with JavaScript. However only recently did I write some tidbits of code with NodeJS. And I hadn\u0027t spent any time properly studying it. The conference has spurred me to do better. I started reading NodeJS and working on a small project to create my first non-trivial NodeJS application (which I shall share in this blog). But those details are for a little later\u2026 let me start with JSFoo\u2026 Table of Contents JSFoo The making of two blogs\u2026 bharathwrites.in bharathblogs.in JSFoo Me being a little bit of computer science fundamentalist have looked upon those doing web development with a little skepticism. Do they really get the data-structures? Complexity of algorithms? Well, those those questions were put to rest by the many speakers at the conference forever. NodeJS might not be in production in a big way as of now. But there is no doubting the quality of people behind it. The design and frameworks are still in the makes\u2026 but quality people from the developer community are lapping it up. And the industry is not to be left behind\u2026 Of all, Microsoft and Adobe were among those sponsoring the event - Microsoft (with its .Net dream fading slowly) was busy showing off IE10 while Adobe seems to be on its way to burying flash and flex with investments to build open-source JavaScript frameworks\u2026 For me the conference started on a beautiful note. It had to do with the Mozilla foundation. It was awesome of Mozilla to have brought in such a wonderful contingent for the meet. My first love has always been Firefox. I do all my development on Firefox. Coming to know that Chris Heilmann was among those in the hall made me smile within myself. Now coming to the few talks that will stay with me\u2026 Robert Nyman was among the first speakers. He spoke on the upcoming FirefoxOS for mobile devices. I had my first brush with FirefoxOS at the Wikipedia hackathon and have spent some time with it. Android definitely needs another open-source competitor. With a JavaScript API platform, one hopes, FirefoxOS will catch on with the larger community of web developers. The next step for smartphones is to be able to support 1000s of lightweight apps. I hope that race gets kickstarted with FirefoxOS (there is already a nice \u0027search app\u0027 facility in FirefoxOS which tells me they have their marker in the right direction!) I was really looking forward to listening to Christian Heilmann. And his choice of topic did not let me down. In his talk he urged the web developers to study HTML5. Developers continue to use shims and jQuery plugin\u0027s unnecessarily - the features they look for have made their way into the specs and should be available by default (full screen API as a case for point). Browsers are claiming HTML5 support without fully implementing the specs - and in this situation it becomes the job of the developers to pound on the doors of the browser developers (file bugs) if any part of the spec is unimplemented or glossed over. Personally, let me admit - I have never read a book on HTML5 (for that matter I don\u0027t remember if I have ever read any book on HTML at all). If someone had suggested reading a book on HTML5 before this talk I would have responded by saying that I find the W3C resources on the web quite sufficient. But now, after listening to Chris, I know why my thinking is wrong. One of the talks that blew my mind was that by Nilesh Trivedi on Interactive Physics Simulations. I would have to watch the video of Nilesh\u0027s talk many times over to grasp all that he said. And to build the application that he has without using any pre-built frameworks is absolutely astounding! If you are a C/Java programmer with a liking for theory of computing like me, then, there you have it - there are people like Nilesh in the JavaScript world! The two workshops that I went to were both superbly conducted. Bharani Muthukumarswamy introduced me to MeteorJS (and made me promise to myself to try it soon). And in the other workshop Pankaj Bhageria made me construct the server side of a JavaScript app step-by-step. Both made me code. And I enjoyed it thoroughly. Other good talks included the one by Om Shankar on WebRTC, Offline Apps by Manan Bharara, Persona based authentication system (newly being brought by Mozilla) by Francois Marier and the preview of developer tools for the upcoming IE10 by Rajasekharan Vengalil. Before I end my note on JSFoo I must express my Thanks to HasGeek. I had been to the Fifth Elephant few months ago and now JSFoo. I must congratulate them for filling what was a definite need among the developer community. Yes we now have www.meetup.com and other hackathons happening ever more regularly. But the Indian software community and developers in particular need more interaction. I have come to learn about so many wonderful small companies and people through these two conferences that I have lost count. Thank you HasGeek! The making of two blogs\u2026 A decade ago I had a blogspot blog. I used it for a couple of years. Then got tired of it and created a new one on WordPress. But never felt like writing anything of substance on it. With my hacker like attitude (even back then), I always detested the way these blogs looked, the URL itself and many such things. Buying web-server space for hosting a blog felt plain wrong. So a year ago when I came to know of GitHub Pages I decided to try it as soon as I could. GitHub is the ideal platform for engineers to blog. Jekyll is super easy to learn. And for those with version control in their bloodstream and daydreams, Git feels so nice. So I bought my domain name (for less than Rs. 200!) and got started a couple of months ago. And though the blog is not close to what I want it to look like, it still feels so much better than blogspot\u2026 But then that was till last week. One thing that I did not like with GitHub Pages was Ruby. I don\u0027t know Ruby. And I have no inclination to learn it. So when I had to understand Gems and Rake it did not feel good. When I got a couple of error emails from GitHub saying that the blog build had failed, it felt worse (though the problems itself were trivial to fix)\u2026 I knew I could use Heroku and host the blog as a NodeJS application while simultaneously putting it in on GitHub. That would give me server side control. And a SQL database! And a NoSql database!! So last weekend, while attending the conference, I let the urge to take me over. I started chipping away with my first NodeJS blog app\u2026 it is in fairly good shape now\u2026 and so\u2026 I am happy to present - http://bharathblogs.in!! So what sense does it make to have two blogs? None. So what I am going to do? Keep both! Well, the domain name costs nothing. (And I like to build backup plans with my applications!)\u2026 The thing is, I have built both and the code is almost identical. So why dismantle either anyway...? Now here is a quick primer to how and what of building both these... bharathwrites.in The components - GitHub pages for hosting Jekyll for static blog generation Grunt for JS minify (see the Gruntfile.js for complete list of tasks) Twitter Bootstrap, FontAwesome for the blog\u0027s look and feel Posts use various JavaScript frameworks like Dojo, jQuery, Angular, D3, Stack etc bharathblogs.in The components - Heroku for hosting GitHub for version control NodeJS as the server side platform Poet as the blogging framework Grunt for JS minify (see the Gruntfile.js for complete list of tasks) Twitter Bootstrap, FontAwesome for the blog\u0027s look and feel (etc) Posts use various JavaScript frameworks like Dojo, jQuery, Angular, D3, Stack etc Now I plan to slowly add more functionality to the server-side of my NodeJS blog app. Probably scrape a few RESTful data sources on the web that is of interest to me and hopefully that of my visitors. Start using the Heroku provided MongoDB. And so on\u2026 one sad thing is Heroku does not support WebSockets\u2026 Else I had couple of interesting ideas for that one. (And one of these days I will probably swap bharathwrites.in to be hosted from Heroku and bharathblogs.in from GitHub pages\u2026 want to hack on my NodeJS blog a lot and I like the bharathwrites.in url better)"
}, {
title: "The Bleeding Edge Of A Web Application...",
url: "https://bharath12345.github.io/posts/the-bleeding-edge-of-an-application/",
date: "Wed 11 September 2013",
category: "posts",
content: "Most web applications have the well-known 3-tiered structure - WebTier \u003e ApplicationTier \u003e DataTier. Both WebTier and ApplicationTier have the web-layer to parse the incoming HTTP requests. Its in the WebTier that one deploy\u0027s load-balancing L4-routers like Apache/Nginx or Netscaler like appliances. HTTP requests are forwarded by the WebTier to the ApplicationTier which is generally served by a much bigger farm of servers. Web-layer in the ApplicationTier is the focus of this blog. Its a challenging area of software development for the following reasons and more - Table of Contents 1. Quantifying the \u0027Bleeding Edge\u0027 2. Why Is It Hard? 3. Software Development Of Web Applications 4. JVM Based Web Apps (i) Servlet Specification Frameworks (ii) MVC Frameworks (iii) Asynchronous Event-Driven Frameworks 5. NodeJS - JavaScript on the server side 6. Ruby and PHP 7. What to use for my project? Huge volume of requests, with read requests generally surpassing write by an order of magnitude or so Change. Website content and web-service APIs both change very often Variety of consumers. People read/write to the web. And so do other software applications Being a Java and JavaScript developer, my interest has been in the emergent software stacks in these two languages. To understand their raison d\u0027\u00eatre. For that, I start by taking a look at the numbers (HTTP requests) at some of the popular websites. Then move on to some of the core technical problems. And compare some of the competing software stacks. But before discussing on the web-layer in the ApplicationTier it is instructive to look at the pure WebTier itself. Its instructive to read Netcraft\u0027s September 2013 Web Server Survey. All the top web-servers are C/C++ based. For those unfamiliar with actual web application deployments, these web-servers are not used to host the applications themselves. They serve static pages, act as L4-routers, firewalls and load-balancers. They are placed at the very gate of modern web-shops and all requests go through them. These tasks are well defined, so, it makes sense to develop them in native languages for brute speed. 1. Quantifying the \u0027Bleeding Edge\u0027 Here are the numbers from recently published articles on Twitter, WhatsApp and Facebook. There are others who cannot not be far behind like Google, Wikipedia, Amazon, Skype etc. Twitter: 300K requests per second (RPS) for reading and 6000 RPS for writing - source1, source2 WhatsApp: 10 billion requests sent and received in one day - source Facebook: 12 million HTTP requests per second - source (All these articles are quite recent) 2. Why Is It Hard? Two good resources to start understanding why these scales are hard on software development in ApplicationTier are - C10K problem by Kegel and Felix von Leitner C10M problem by Robert Graham. And this video by him is very instructive But let me state the problem(s) simply. The reasons why it is hard to handle HTTP requests are - Forking a process: is too expensive a compute operation to perform everytime a request arrives Forking a thread: is less expensive on compute. But writing multi-threaded applications for multi-core systems is very tough (and actually forking a new thread is not inexpensive at all) Use thread pools: It just shifts the bottleneck. Once you have a thread-pool, each thread has to do a select() or poll() to find the next nonblocking socket ready for IO. But doing a select() or poll() on a huge array of open socket descriptors is extremely inefficient at the kernel level (checkout the deep analysis to C10K problem in the above mentioned links) The Event driven model: requires a paradigm shift in thinking and designing applications from bottoms-up. The best way to start grasping the idea is to read The Reactive Manifesto. This model is not very different from the SEDA architecture. Reactive applications is a very fine idea and one of the reasons why I dwelled into this subject in the first place\u2026 3. Software Development Of Web Applications My current views are that, broadly, there are 3 different language families to develop web applications on server and client sides - Server: JVM based, compiled and statically typed; Client: JavaScript Server: Ruby/PHP, interpreted and dynamically typed; Client: JavaScript Server: NodeJS, interpreted and dynamically typed; Client: JavaScript Thus, on the server side, the choice is between JVM (polyglot), Ruby/PHP and NodeJS. 4. JVM Based Web Apps The web-layer in JVM world is filled with 3 types of frameworks - Frameworks that support the servlet specification (latest one is 3.0) MVC frameworks Asynchronous event-driven frameworks based on Netty (i) Servlet Specification Frameworks These include Tomcat and Jetty. What is the main motivator for the servlet spec? It is to manage state information that does not exist in the stateless HTTP protocol. HttpServletRequest provides an API getSession() where the HttpSession object is a container to hold attributes for a single transaction spread across multiple HTTP request/responses. Apart from this central feature of sessions, the servlet API also defines the interfaces that the servlet container has to adhere-to to provide concurrent request processing in a sandbox environment. So is there a drawback in this idea? Yes, there is. The very idea of state brings down the performance of these containers. That is the reason why developing highly performant RESTful APIs using servlet containers is a bad idea. In RESTful, the client is expected to maintain the state, if required. Servlet containers can be tuned for statelessness, but then, that goes against one of the fundamental ideas of the spec. And my readings tell me that these frameworks don\u0027t become highly performant on turning off the statefullness. (ii) MVC Frameworks These include Spring MVC, Struts, Tapestry, Wicket etc. I have used two of these - Struts2 and Wicket in building applications that have seen deployment. The fundamental motivation for these frameworks is ease-of-development (annotations etc), clean separation of concerns (MVC design pattern), lot of goodies (like templating etc) and integration with other JavaEE stacks (Struts2-Spring integration etc). (iii) Asynchronous Event-Driven Frameworks And now I come to the most interesting area of Java web application development. Netty based frameworks like Play! and Vert.x. These frameworks do not comply to the servlet specification. They use Netty underneath for asynchronous event based handling of HTTP requests (I cover what-the-hell-is-asynchronous-event-driven in the NodeJS section below). Netty is stateless making the server side fast and efficient. The frameworks on top are built to match the ease-of-dev and richness offered by the likes of Struts and Tapestry. They also offer APIs for client-side statefulness. So these frameworks are an effort to mix high performance with ease-of-dev. But moving to event-based and asynchronous thinking is not straightforward. It needs a mind shift akin to the transition to Object-oriented-programming. However the promise they hold is to be able to build web applications that defy Amdahl\u0027s law. If you are a new shop with bright Java engineers wanting to build a highly scalable web-application, then, these are the frameworks you should start exploring first... 5. NodeJS - JavaScript on the server side The list of companies and websites powered by NodeJS is growing long by the day. However NodeJS is still a newbie. Why would somebody want to use NodeJS? NodeJS makes two very interesting promises - * End-to-end JavaScript shop for you web application * High performance through event-based asynchronous model The first promise is easy to understand. Any good web application requires a team of good designers and client-side programmers. If the programming language on both client-side and server-side are the same, then it reduces the risk of investment in diverse technologies and brings down the barriers between teams and moving people. The second promise of performance is more interesting. Is NodeJS as fast as the Java based async frameworks? This blog presents an excellent comparison. It goes to show that NodeJS is no match to the JVM based frameworks. Its difficult to beat the JVM! But moving ahead of comparisons, let me dwell a little more on the aspect of performance promised by the event-driven asynchronous frameworks in general. The hype around such frameworks is increasing day-by-day and is grounded on firm theoretical foundations. So how exactly does async and event-driven help? NodeJS provides a good base to explore since one cannot do anything but asynchronous event-based HTTP processing with NodeJS! Let us study this code fragment for a while - (this comes from this excellent book on NodeJS by O\u0027reilly) // load http module var http = require(\u0027http\u0027); var fs = require(\u0027fs\u0027); // create http server http.createServer(function (req, res) { // open and read in a file fs.readFile(\u0027textfile.txt\u0027, \u0027utf8\u0027, function(err, data) { res.writeHead(200, {\u0027Content-Type\u0027: \u0027text/plain\u0027}); if (err) { res.write(\u0027Could not find or open file for reading\\n\u0027); } else { // if no error, write file to client res.write(data); } res.end(); }); }).listen(8124, function() { console.log(\u0027bound to port 8124\u0027); }); console.log(\u0027Server running on 8124/\u0027); Following aspects need to be understood - The two instances of asynchronous behaviour - one for http I/O and the other for file I/O This program never blocks. NEVER. Multiple types of events are emitted - when a request arrives, when file I/O request completes - and these events are consumed in a single giant event loop with the NodeJS framework Large (N-squared) like compute algorithms should not be synchronously attempted - they take away all the processing core\u0027s bandwidth bringing the whole system to a halt. So, such event-based asynchronous processing is most suited for applications that can be broken down into multiple stages like a SEDA architecture The application itself acts as one giant event-producing and event-consuming engine which should be seen as single-threaded and binding to a single-core. To make use of multiple-cores multiple-instances of NodeJS can be run on the same system NodeJS has found tremendous traction with developer community. Am heading to JSFoo in Bangalore next week, and one look at the funnel would tell you that every second session has something to do with NodeJS. And NodeJS has a plethora of MVC frameworks which are maturing fast. Sample - Express, Geddy, FlatironJS, EmberJS - these are definitely poised to give MVC frameworks in Ruby and PHP a run for their money in simplicity, performance and features. 6. Ruby and PHP With JVM based frameworks occupying one end of the spectrum offering high-performance + Maintainability and NodeJS based frameworks at the other end with simplicity + low-cost + ease-of-dev, how much of middle-ground is left for PHP and Ruby? I am not an expert in either of these two, so I will stay away from making predictions. One thing that is in favour of PHP/Ruby is that both are proven in large production applications while reactive Java frameworks and NodeJS are still not. How long will this status last? Will NodeJS and Java reactive frameworks take away a chunk of web applications that would otherwise have been Ruby/PHP\u0027s? Or will the web applications playing field get expanded with the entry of these new players creating room for all? 7. What to use for my project? I roundoff my blog with a guidance, albeit reluctantly. Apart from the usual suspects of time-to-market, capex-opex investment, engineering-skill and requirements-complexity that make project delivery complex, I propose that few more criterions come into play when it comes to web applications - Is the web application part of a packaged-product (bundled in a CDROM) or delivered as part of the general web or Saas? Is the web application intra-enterprise or for open-internet usage? Is the web application majorly for human consumption or accessed by other software services? I leave those aspects to the good judgement of the readers. I would definitely like to get feedback on those who disagree from my guidance below. The idea of writing this guidance is to paint broad strokes\u2026 Exceptions among project/people always exist! JVM Based Performance: JVM is worlds best VM in performance. Ruby/PHP/NodeJS are interpreted and don\u0027t come close in performance (doing anything in JRuby per me is simply a bad idea). Facebook created HipHop for PHP to make it scale - this counts as an exception. Twitter, LinkedIn shifted from Ruby to Scala (which is JVM based) and achieved higher performance numbers. One can find umpteen examples like this\u2026 Development Time: Java and other JVM languages are slower to develop compared to Ruby/PHP/NodeJS. And thats the reason why frameworks like Play! are trying hard to sell themselves as suited for fast development Cost: Java developers are more expensive Suited for: Large web applications. Enterprise products. Mission critical applications Ruby, PHP Performance: Definitely not bad Development Time: Fast Cost: Medium Suited for: Medium sized projects NodeJS Performance: The jury is still out. Does the Google V8 engine challenge and beat PHP/Ruby? It will never be able to match the JVM though. Development Time: Fast Cost: Low, since the whole application is built on a single language stack the server-side developers and client-side developers can co-work Suited for: Smaller chatty applications"
}, {
title: "Application Topology Graphs - Usecase, Different Product Offerings, Prototype Using D3 and jsPlumb",
url: "https://bharath12345.github.io/posts/topology-graphs-with-d3-and-jsplumb/",
date: "Sun 01 September 2013",
category: "posts",
content: "Graph depictions are common for problems like computer networks, social networks etc. Sometime ago, I came across the use-case of graphs for software application topologies. This post covers the few things I discovered on the topic of application topologies and their graphical representation. Table of Contents Usecase Prototype About these graphs Comparison APM Products APM Product Screenshots Concluding Thoughts Usecase One aspect that makes application topologies a challenge is that they are logical and not physical. That is, the boundaries of a distributed application are difficult to define. In web companies/banks it is usual to find one application-owner just responsible for the database system while many other for applications that make use of the database. The database thus becomes a shared application with multiple owners/users. From the point of view of graphical representation of applications in such an enterprise, the representation of running application thus becomes \u0027logical\u0027 - one user might want to see his application topology include the database while another may not. The database-system application owner might want to see graphs that show just the clustered databases with their external interfaces and/or graphs which include the in-enterprise applications. Thus depending upon the organization hierarchies different application components may be required to be grouped differently (both hardware resources components like servers and software components like application servers ). Inter and intra application views are required. And different users and user groups may require different layer-transitions starting with a view of their application of ownership - both drilling-in and drilling-out - through the maze of applications and its constituents. Many Application Performance Management (APM) products claim to provide such graphical views. I take a look at their offerings in a later section in this blog where I look at the application graph views of popular APM vendors like AppDynamics, OpTier etc Prototype Before getting too deep into thinking about application graphs I decided to develop a prototype for such graph representations. Unsurprisingly, just after a few hours of looking through the world of JavaScript discovered multiple libraries capable of good graph rendering. This StackOverflow thread is useful. One can buy good commercial graph rendering libraries like yFiles or use open-source freewares like GraphDracula, mxGraph etc. But the libraries that I was most impressed with were D3 and jsPlumb. I have played with D3 for over a year now and it is the most exciting JavaScript library for me on this planet! The very paradigm of data-modeling and programming for D3 is enlightening and it provides for extremely vivid and smooth graphical representations of all kind. And coming to jsPlumb, just a visit to the website is good enough to excite any programmer of its potential. So I got cracking with D3 and jsPlumb. Below are the two graph prototypes I came up with (it did not take much of an effort to code these using help from existing code available on web). The code for these prototypes are available on my GitHub repository too. I have used Dojo for modularising the code (AMD way) and draw up the containers. Note: The interactive D3 and jsPlumb graph prototypes that were originally embedded here require JavaScript libraries that are not yet available in this version of the site. The descriptions and comparisons below provide details about these prototypes. \u00a0 About these graphs D3 Graph jsPlumb Graph This is more like a inter-applications and application-group view The two different types of icons stand for single-applications (orange text) and application-groups (blue text) The graph can be dragged and zoomed. To drag/pan click on the graph and drag it. To zoom use the mouse scroller The graph actually represents a set of interconnected applications and application-groups With D3 it is not very difficult to add hover effect on nodes and links atop such a graph. It is not difficult to add color effects to application nodes and edges to signify status The icons, text and links are all SVG - so they scale beautifully on zooming Every refresh of the page leads to a re-rendering of the graph in a different way. This is so because the graph is rendered using [D3 Force directed graph](http://bl.ocks.org/mbostock/4062045) layout. The position of nodes and edges is not fixed but computed each time the page is rendered by the algorithm for the specified gravity, distance and charge configurations (this prototype is not a thorough job of getting the nitty-gritty of a force layout with D3 right for the best possible rendering within the coordinates of a box. Thoughtful tuning of parameters should make the graph good for all form factors and far better than I show here) This is more like a intra-application view This depicts a typical web-application with its 3-tiers: web-layer, app-layer and datasource (database, external etc) jsPlumb provides many different types of connectors and endpoints. After playing with the options for a while I have the left the connections to look like \u0027Z\u0027 simply because it looked nice to me! (the more appropriate links would probably be straight lines, but this is just a playful prototype!). Have chosen the source endpoint of the connections to have a blue dot. The connections have an arrow on top (there are many choices for such settings) Mouse-over the links to see the color change from yellow to blue - this is just using a simple css setting To differentiate the 3-layers, I have internally used Dojo Titlepane\u0027s. I have a liking for their neat rendering The icons are SVG. Did not try to implement zoom, pan or node/link movement. They are very much doable though non-trivial Comparison D3 jsPlumb Scalability D3 is built for scalability of visual components. Hundreds and thousands of nodes and edges can be quickly created/updated/removed and the visualizations render and transition really fast (I did a quick scale test of close to 5000 nodes and few hundred thousand edges - one has to really see to believe how fast the rendering is) jsPlumb is much slower than D3 in rendering. However that does not mean jsPlumb is slow - D3 is simply too fast! Layouts: Force etc D3 has pre-built [force layout](https://github.com/mbostock/d3/wiki/Force-Layout) visualization with many options. A force directed graph works beautifully when the real-estate available for rendering is dynamic along with a (probable) huge number of nodes and edges. The graph layout optimizes itself (per gravity/distance/charge settings) to provide the best possible view jsPlumb provides endpoints and connectors. One can use the facilities to build a force directed graph but such rendering algorithms are not provided OOTB (coding a force layout algorithm is not trivial). However if the number of edges and nodes is known, is not very huge and falls into a clean pattern (like the 3-layers in the above graph), jsPlumb can be used to create very neat layouts Visual Beauty Requires programming. One can search and lookup upteen amazing D3 visualizations including many that are graphs. One can use SVG for scalable zooming. However, building a beautiful graph framework for a product with D3 will require some work Even the default setting can produce excellent looking graphs. Building better looking graphs (with fewer elements) should be considerably easier with jsPlumb Development Simplicity D3 takes some learning. The paradigm of create/update/destroy of elements along with modelling of json data for a particular library function can be complex. But once the mind gets used to the paradigm one realizes its power and simplicity. Compared to all the JS visualization frameworks that I have used (Dojo, jQuery, Raphael, mootools, YUI, Google toolkit, FusionCharts etc) D3 is in a class of its own. Once you get hooked to creating charts/visuals the D3 way, I bet you wont go near anything else! jsPlumb is truly simple. As a well thought out, well written and well documented library, one can start building working graphs in less than a day (which would be quite a challenge for D3 newbie to accomplish) Rendering Speed No other JS framework in that I have come across comes even in the vicinity of D3 in speed and performance. D3 is a class act. Definitely not slow Layer transitions, Panning, Zoom D3 is built for zoom, pan like functionality from bottoms-up. The transitions are smooth, fast and just work Requires some doing Project Liveness, Community, Future roadmap Super active. Its one of the most cloned projects in the JS world on GitHub. There is a large community of users and questions are quickly answered on StackOverflow, Google groups etc. With such strong foundations, I dont see the momentum behind D3 slowing down in near future Not as hot as D3 but nevertheless very popular. Enjoys a fairly large community of users and in the tradition of jQuery plugin\u0027s one can easily see, understand, tweak the library\u0027s code which seems straightforward to understand for good developers on a demanding projects APM Products Along with trying to understand application topologies and design this prototype I had a look at the offerings of some of the APM vendors. All top vendors advertise topology graphs but their offerings seem very limited - lot of constraints to both configuration and usage. One can see the screenshots from these products below. After looking at the existing offerings and my study, here is a dump of features that would be required for anyone attempting the challenge of application topology views - The nodes in the topology are representative of the hardware/software components. The links in the topology are representative of transactions. Corresponding status by colouring is much required Multi-level Application Groups are needed Heterogeneous groups of applications and application-groups with customizable drill-throughs makes a lot of sense Generally, in actual deployments n:n mapping between Application and Application-group is \u0027soft\u0027 or \u0027tag-like\u0027. Application ownership and deployment structure often keeps changing. Users thus want to easily create new application-groups and add/remove applications from existing groups (all the time). This calls for a very flexible model the kind of which is not to be seen in existing product offerings Different users would want to see application topology\u0027s with different applications and groups in them. Since the whole idea of Application Topology is logical and per a particular user\u0027s world-view (and not something physical) - a user would have multiple topology views with some applications and groups present in many. Example, a user could define - Topology Layer \u0027A\u0027 with 2 applications - \u0027CRM\u0027, \u0027Core\u0027 - and 2 application groups - \u0027InternalBusinessApps\u0027, \u0027InternalOperationsApps\u0027 Topology Layer \u0027B\u0027 with 1 application - \u0027Core\u0027 and 4 application groups - \u0027InternalBusinessApps\u0027, \u0027InternalOperationsApps\u0027, \u0027CustomerFacingApp\u0027, \u0027CriticalInterfacingApps\u0027 So now, between Layer \u0027A\u0027 and Layer \u0027B\u0027 there is one overlapping application and 2 overlapping application-groups \u0027Transactions, both intra and inter application, are typically HTTP(s), TCP, web-services, RMI/RPC etc Users may require links in different layers to have a configurable set of transactions mapped on them. Going back to the earlier example of layers \u0027A\u0027 and \u0027B\u0027 - the link between CRM and InternalBusinessApps in layer-A can be configured to show the status per a configured set of Transactions, say TxA and TxB. While the link between the same CRM and InternalBusinessApps in layer-B can be configured to show the status per TxB and TxC Users may require nodes in different layers to have a configurable set of hardware/software components mapped on them. Going back to the earlier example of layers \u0027A\u0027 and \u0027B\u0027 - the node for CRM in layer-A can be configured to show the status per a configured set of Components, say ServerA and DatabaseB. While the same CRM in layer-B can be configured to show the status per DatabaseB and AppServerC Once a user defines multiple layers of topology he needs to stitch the transition. This transition stitching is a very complex requirement. Apart from it being a configurable option, this action requires a default which will show a topology layer of all individual application constituents of a Application Group APM Product Screenshots AppDynamics OpTier CorreIssue IBM ExtraHop \u00a0 Concluding Thoughts Application topology is a wonderful emerging playground for those interested in graph representations. Its very fluid with many usecases and user expectations. Applications in enterprises are getting more and more distributed with more moving parts and complexity (while probably, computer networks in enterprises is progressively getting simplified thanks to bigger routers and switches!) thus making the problem of graphing them very exciting and challenging Open-source, liberally licensed JavaScript graphing toolkits like D3 and jsPlumb have really come of age to be used deep and wide in software products. Sufficiently interested and skilled programmers can do as good a job with these libraries as what is possible by using commercial packages like yFiles \u00a0"
}, {
title: "Effective Java",
url: "https://bharath12345.github.io/posts/effective-java/",
date: "Thu 22 August 2013",
category: "posts",
content: "I read this beautifully written article a few days ago - \"I will not do your tech interview\". I can\u0027t agree more with the author. Every single time I have had to give/take a technical interview, more than the sense of being inadequately prepared I feel like carrying an inexplicable psychological burden. And I have met no one who does not fear what Ellis beautifully calls as - \"bear-trap of a stupid brainteaser\" :-). Table of Contents Creating and Destorying Objects The Java methods common to all objects Classes and Interfaces Generics Enums and Annotations Methods General Programming Exceptions Concurrency Serialization End Node In the years to come, internet is definitely going to give more and more relief to competent engineers. Having a GitHub repository with a dump of one\u0027s pet technology prototypes, having a StackOverflow point score, well articulated tweets and maybe even well-written technology blog (read this by Nathan Marz) will pay dividends to engineers continuously at work to sharpen their axe\u2026 But then, my current reality is a reality. And I have to take technical interviews as part of my job. And hiring the right people is so much more important for a small company - many times it is the only differentiator between success and failure of the company itself. So with the job\u0027s being dished out being so important, technical interviews are not supposed to be easy. Both for the interviewee and the interviewer. Pressed into the interviewing job, I felt the need to brush-up my fundamentals. This post is from my re-read of Joshua Bloch\u0027s classic - \"Effective Java\" - from a interviewer\u0027s perspective\u2026 trying to quickly refresh the elementary concepts to myself. It aint coherent or complete\u2026 will keep adding stuff to this post over time as I realise what questions really make the cut. There are plenty of interview-questions blogs and books out there - but I felt, instead of quizzing a candidate on some corner case of the JVM or language (which many times the interviewer himself might have realised just hours before the interview), it would be more honest/ethical on my part to quiz in what are well-known and real-world areas of programming for an aspiring engineer - and \u0027Effective Java\u0027 is precisely the guide for such a setting\u2026 Now planning to write few more blogs like these in the days to come\u2026 one surely on Design Patterns by GoF. Maybe one on JavaScript\u0027s good parts per Doughlas Crockford. And time permitting, few more\u2026 Creating and Destorying Objects Consider static factory methods instead of constructors Similar to flyweight. valueof/of/getInstance/newInstance/getType/newType Consider a builder when faced with many constructor parameters Telescoping constructors are hard to read and write. Inconsistent state partway through the construction Enforce the singleton property with a private constructor or an enum type All instance fields should be transient. Provide a readResolve() method else serialization/deserialization can lead to new objects Enforce non-instantiability with a private constructor Avoid creating unnecessary objects A statement like this in a for loop can lead to huge number of unnecessary objects getting created - String s = new String(\"stringette\"); The improved version is simply the following: String s = \"stringette\"; This version uses a single String instance, rather than creating a new one each time it is executed. Furthermore, it is guaranteed that the object will be reused by any other code running in the same virtual machine that happens to con- tain the same string literal The static factory method Boolean.valueOf(String) is almost always preferable to the constructor Boolean(String) Eliminate obsolete object references Spot the memory leak in this program? public class Stack { private Object[] elements; private int size = 0; private static final int DEFAULT_INITIAL_CAPACITY = 16; public Stack() { elements = new Object[DEFAULT_INITIAL_CAPACITY]; } public void push(Object e) { ensureCapacity(); elements[size++] = e; } public Object pop() { if (size == 0) throw new EmptyStackException(); return elements[--size]; } /** * Ensure space for at least one more element, roughly * doubling the capacity each time the array needs to grow. */ private void ensureCapacity() { if (elements.length == size) elements = Arrays.copyOf(elements, 2 * size + 1); } } Avoid finalizers What is a finalizer? Is it always called by the GC? Is there a performance penalty to using finalizer? Why? The Java methods common to all objects Obey the general contract when overriding equals() 1. When do you override equals()? When a class has a notion of logical equality that differs from mere object identity, and a superclass has not already overridden equals to implement the desired behavior. 2. What are the main rules that you would follow to implement equals()? Use == to check for same reference Use instanceof to check if the agrument is of the correct type Match all significant fields of the two objects Symmetric? Transivitve? Consistent? override hashCode() Always override hashCode() when you override equals() 1. If two objects are equal according to the equals (Object) method, then calling the hashCode method on each of the two objects must produce the same integer result. 2. It is not required that if two objects are unequal according to the equals (Object) method, then calling the hashCode method on each of the two objects must produce distinct integer results. However, the programmer should be aware that producing distinct integer results for unequal objects may improve the performance of hash tables. 3. How will you compute the hashCode()? Do not be tempted to exclude significant parts of an object from the hash code computation to improve performance Always override toString() Override clone() judiciously 1. Does Cloneable interface have a clone() method? Why not? Because the Java Object\u0027s clone() method (which is protected) is supposed to be used 2. How does Java Object\u0027s clone() method work? If a class implements Cloneable, Object\u2019s clone method returns a field-by-field copy of the object; otherwise it throws CloneNotSupportedException 3. What are the 3 rules for implementing Cloneable? a. x.clone() != x b. x.clone().getClass() == x.getClass() c. x.clone().equals(x) 4. How to clone properly? All classes that implement Cloneable should override clone with a public method whose return type is the class itself. This method should first call super.clone and then fix any fields that need to be fixed. Typically, this means copying any mutable objects that comprise the internal \u201cdeep structure\u201d of the object being cloned, and replacing the clone\u2019s references to these objects with ref- erences to the copies. While these internal copies can generally be made by call- ing clone recursively, this is not always the best approach. If the class contains only primitive fields or references to immutable objects, then it is probably the case that no fields need to be fixed. 5. How come interfaces like Cloneable and Serializable have no methods? Why do they exist at all then? How does JVM use them? The UID and custom readers/writers are accessed via reflection. Serializable serves as a marker to the JRE/JVM, which may take action(s) based on its presence. Refer to http://en.wikipedia.org/wiki/Marker_interface_pattern. An example of the application of marker interfaces from the Java programming language is the Serializable interface. A class implements this interface to indicate that its non-transient data members can be written to an ObjectOutputStream. The ObjectOutputStream private method writeObject() contains a series of instanceof tests to determine writeability, one of which looks for the Serializable interface. If any of these tests fails, the method throws a NotSerializableException. Consider implementing Comparable 1. What is the use of the Comparable interface? Helps in sorting when there is a natural order among the objects 2. Whats the difference between interfaces like Comparable and those like Cloneable/Serializable? Classes and Interfaces Minimize the accessibility of classes and members What is package-private? How do you implement? The member is accessible from any class in the package where it is declared. Technically known as default access, this is the access lev- el you get if no access modifier is specified. In public classes, use public classes not public fields Minimize mutability 1. Is it good or bad to minimize mutability? why? If objects are immutable they are automatically thread-safe and no synchronization or locking is required 2. How would you make an object immutable? No mutators - no setters Class cant be extended - class should be marked final Make all fields final Make all fields private Ensure exclusive access to any mutable components getters should return a new instance of the object Favor composition over inheritance Design and document for inheritance or else prohibit it Prefer interfaces to abstract classes Why are interfaces better than abstract classes? Existing classes can be easily retrofitted to implement a new interface Interfaces are ideal for defining mixins Interfaces allow the construction of nonhierarchical type frameworks. Interfaces enable safe, powerful functionality enhancements combine the virtues of interfaces and abstract classes by providing an abstract skeletal implementation class to go with each nontrivial interface that you export Use interfaces only to define types Is \u0027constants\u0027 in an interface a good programming pattern? No. Prefer class hierarchies to tagged classes Use function objects to represent strategies Favor static member classes over nonstatic 1. What are the 4 kinds of nested classes? a. static member classes b. nonstatic member classes c. anonymous classes d. local classes 2. When will you make a nested class static? If an instance of a nested class can exist in isolation from an instance of its enclosing class, then the nested class must be a static member class: it is impossible to create an instance of a nonstatic member class without an enclosing instance. If you declare a member class that does not require access to an enclosing instance, always put the static modifier in its declaration 3. Why would one prefer static classes? The association between a nonstatic member class instance and its enclosing instance is established when the former is created; it cannot be modified thereafter. Storing this reference costs time and space, and can result in the enclosing instance being retained when it would otherwise be eligible for garbage collection Generics Dont use raw types in new code 1. What is the problem with doing private final Collection stamps = ... ; Loss of compile time type safety 2. Is List.class legal? What will it give me? It is not legal Eliminate unchecked warnings How do you eliminate a unchecked warning? Suppress the warning with an @SuppressWarnings(\"unchecked\") annotation. Always use the Suppress- Warnings annotation on the smallest scope possible. Prefer lists to arrays 1. If Sub is a subtype of Super, then is the array Sub[] a subtype of Super[]? Yes. Arrays are covariant. Lists are invariant 2. So which one is better? And why? Lists are better. Arrays are reified. This means that arrays know and enforce their element types at runtime. Generics, by contrast, are implemented by erasure. This means that they enforce their type constraints only at compile time and discard (or erase) their element type information at runtime. 3. Test question - This code fragment is legal but fails at runtime! - Object[] objectArray = new Long[1]; objectArray[0] = \"I don\u0027t fit in\"; // Throws ArrayStoreException But this one wont compile at all! - List\u003cObject\u003e ol = new ArrayList\u003cLong\u003e(); // Incompatible types ol.add(\"I don\u0027t fit in\"); 4. Are these legal? new List\u003cE\u003e[] new List\u003cString\u003e[] new E[] No. It is illegal to create an array of a generic type, a parameterized type, or a type parameter. Types such as E, List\u003cE\u003e, and List are technically known as non-reifiable types. Intuitively speaking, a non-reifiable type is one whose runtime representation contains less information than its compile-time representation. Favor generic types Which of these is better and why? public class Stack { private Object[] elements; public void push(Object e) { } public Object pop() { } } or public class Stack { private E[] elements; public void push(E e) { } public E pop() { } } Favor generic methods Which of these is better and why? public static Set union(Set s1, Set s2) or public static \u003cE\u003e Set\u003cE\u003e union(Set\u003cE\u003e s1, Set\u003cE\u003e s2) Use bounded wildcards to increase API flexibility What is the PECS rule or Get-and-Put principle? Bounded wildcards can be of two types - X\u003c? extends E\u003e or Y\u003c? super E\u003e PECS stands for producer-extends, consumer-super. In other words, if a parameterized type represents a T producer, use ; if it represents a T consumer, use . Consider typesafe heterogenous containers Enums and Annotations Use enums instead of int constants 1. Does enum extend Java Object? They provide high-quality implementations of all the Object methods 2. Which interfaces do enum implement? they implement Comparable and Serializable, and their serialized form is designed to withstand most changes to the enum type. 3. How would you associate data with enums? To associate data with enum constants, declare instance fields and write a constructor that takes the data and stores it in the fields. Enums are by their nature immutable, so all fields should be final 4. How would you associate a different behavior with every enum constant? using apply() Use instance fields instead of ordinals Is using ordinals a bad idea? If so, what is the option? Use instance fields Use EnumSet instead of bit fields Whats the usecase for EnumSets? Instead of bit fields which look ugly like this text.applyStyles(STYLE_BOLD | STYLE_ITALIC); one can do this - text.applyStyles(EnumSet.of(Style.BOLD, Style.ITALIC)); Use EnumMap instead of ordinal indexing It is rarely appropriate to use ordinals to index arrays: use EnumMap instead Emulate extensible enums with interfaces Prefer annotations to naming patterns 1. Any usecase you can think of for custom annotations? JUnit testing framework originally required its users to designate test methods by beginning their names with the characters test 2. Which annotation do you use most? @Override, @Deprecated, @SuppressWarnings Consistently use the Override annotation What @Override for? it indicates that the annotated method declaration overrides a declaration in a supertype Use marker interfaces to define types Methods Check parameters for validity Make defensive copies when needed Design method signatures carefully Is Map as a method parameter better or HashMap - why? Map is. This is super basic. Use overloading judiciously Use varargs judiciously Return empty arrays or collections, not nulls What is better - returning null or empty collections? Empty Collections Write doc comments for all exposed API comments General Programming Minimize the scope of local variables Prefer foreach loops to traditional for loops Know and use the libraries Avoid float and double if exact answers are required Prefer primitives to boxed primitives What makes the performance of this program bad? public static void main(String[] args) { Long sum = 0L; for (long i = 0; i Avoid strings when other types are more appropriate Beware the performance of string concatenation 1. Before 1.5, for string concatenation StringBuffer was preferred - what is it now? StringBuilder 2. What is the difference between StringBuilder and StringBuffer? StringBuider is unsynchronized - this makes it much faster. But should be used with care in concurrent programs Refer to objects by their interfaces Which one is better and why? List subscribers = new ArrayList(); ArrayList subscribers = new ArrayList(); Prefer interfaces to reflection Reflection allows one class to use another, even if the latter class did not exist when the former was compiled. So what are the problems using it? You lose all the benefits of compile-time type checking, including exception checking The code required to perform reflective access is clumsy and verbose Performance suffers Use native methods judiciously Optimize judiciously Adhere to generally accepted naming conventions Exceptions Use exceptions only for exceptional conditions Use checked exceptions for recoverable conditions and runtime exceptions for programming errors 1. What are the different types of exceptions? * Checked exceptions * Unchecked exceptions - runtime exceptions and errors 2. When would you code for checked exceptions? when the caller is can reasonably expected to recover 3. When would you throw a runtime exception? When the program is as good as dead 4. When would you throw a error? there is a strong convention that errors are reserved for use by the JVM to indicate resource defi- ciencies, invariant failures, or other conditions that make it impossible to continue execution. Given the almost universal acceptance of this convention, it\u2019s best not to implement any new Error subclasses. Therefore, all of the unchecked throw- ables you implement should subclass RuntimeException Avoid unnecessary use of checked exceptions Tell me the exceptions you know and when you would use them IllegalArgumentException - argument aint right IllegalStateException - calling a method on an object before it is properly initialized NullPointerException - someone invokes a method on a null object ConcurrentModificationException - if a object designed to be used by a single thread is being concurrently modified IndexOutOfBoundException - accessing array beyond its data length UnsupportedOperationException - object does not support a method Favor the use of standard exceptions Throw exceptions appropriate to the abstraction Document all exceptions thrown by each method Include failure-capture information in detail messages Strive for failure atomicity Dont ignore exceptions Concurrency Synchronize access to shared mutable data 1. Is writing of all primitive data types atomic in Java? reading or writing a variable is atomic unless the variable is of type long or double 2. How long would you expect this program to run? public class StopThread { private static boolean stopRequested; public static void main(String[] args) throws InterruptedException { Thread backgroundThread = new Thread(new Runnable() { public void run() { int i = 0; while (!stopRequested) i++; } }); backgroundThread.start(); TimeUnit.SECONDS.sleep(1); stopRequested = true; } } Probably permanently. The VM might do what is called hoisting, the virtual machine might transform this code: while (!done) i++; into this code: if (!done) while (true) i++; How would you correct his? 3. Is this program thread safe? Can generateSerialNumber() be called from multiple threads safely? private static volatile int nextSerialNumber = 0; public static int generateSerialNumber() { return nextSerialNumber++; } 4. What are the 4 factors that need trade-off when writing multi-threaded concurrent programs? Safety, Liveness, Efficiency, Reusability 5. Whats the tradeoff between Safety and Liveness? safety: nothing bad happens liveness: something good eventually happens 6. What is reentracy? Is Java reentrant? Yes 7. Whats the difference between ArrayList and CopyOnWriteArrayList? It is a variant of ArrayList in which all write operations are implemented by making a fresh copy of the entire underlying array. Because the internal array is never modified, iteration requires no locking and is very fast. For most uses, the performance of CopyOnWriteArrayList would be atrocious, but it\u2019s perfect for observer lists, which are rarely modified and often traversed. Avoid excessive synchronization Prefer executors and tasks to threads 1. In the post Java 1.5 world, the use of \u0027Thread\u0027 is probably not a good idea due to the availability of new functionality in java.util.concurrent - what are they? Executors and tasks 2. There are some data structures designed in Java collections specifically for concurrent usage - what are they and how do they work? ConcurrentHashMap etc 3. Why is it a bad idea to rely on Thread.yield or Java\u0027s thread priorities API? Not portable Prefer concurrency utilities to wait and notify Document thread safety Use lazy initialization judiciously Dont depend on the thread scheduler Avoid thread groups Serialization Implement serializable judiciously 1. What is serialVersionUID? Every serializable class has a unique identification number associated with it. If you do not specify this number explicitly by declaring a static final long field named serialVersionUID, the system automatically generates it at runtime by applying a complex procedure to the class. The automatically generated value is affected by the class\u2019s name, the names of the interfaces it implements, and all of its public and protected members. If you change any of these things in any way, for example, by adding a trivial convenience method, the automatically generated serial version UID changes. If you fail to declare an explicit serial version UID, compatibility will be broken, resulting in an InvalidClassException at runtime. If no serial version UID is provided, an expensive computation is required to generate one at runtime. If you ever want to make a new version of a class that is incompatible with existing versions, merely change the value in the serial version UID declaration. 2. Why should a class be made to implement Serilizable with caution? A major cost of implementing Serializable is that it decreases the flexibility to change a class\u2019s implementation once it has been released. Consider using a custom serialized form How good is Java\u0027s ObjectStream based Serialization? When would you implement your own custom serialized form? The default serialized form of an object is a reasonably efficient encoding. The default serialized form is likely to be appropriate if an object\u2019s phys- ical representation is identical to its logical content. Drawbacks - can be excessive in space consumption, not very fast, it permanently ties the exported API to the current internal representation Write readObject methods defensively For instance control prefer enum types than readResolve Consider serialization proxies instead of serialized instances End Node"
}, {
title: "System and Application health - Is there a data collection challenge at the DC?",
url: "https://bharath12345.github.io/posts/is-there-a-collection-challenge-at-the-data-center/",
date: "Tue 06 August 2013",
category: "posts",
content: "My champion-hacker friend Sumanth and I spent a little time few weeks ago digging to know if there was a data collection challenge for system and application health metrics at a typical small data center. Here is the little that we discovered... Table of Contents Usecase IT Resources Quantum of data to collect The Developer\u0027s View Is there a Data Collection \u0027Challenge\u0027? With little more scale With Ganglia But the self questioning was preceded by a phase where I came to know about open source monitoring tools like Ganglia and discovered that they were widely deployed. Ganglia in particular is very active as a development project and uses Multicast for data transmission. It caches data at all nodes within a cluster. The collection station has to communicate to only one (any) node within a cluster. I asked myself what could be the rationale behind this design. And as I looked more deeply into the world of Ganglia I discovered the amazing attention to detail - to optimise data and time without compromising accuracy or extensibility. For those wishing to understand Ganglia, this book is a must read. The chapter on case-studies in this book makes for a truly fascinating read. This paper also provides a very good intro. And then there is SNMP. SNMP was built for monitoring. One small deficiency is that not all metrics are available through SNMP. In this blog I decided to keep SNMP aside and do the analysis. I present the numbers first and my inferences later. Usecase Let me take the example of a data center for a small eCommerce startup, say, called \"WebTraveller\". Now, a little detail about this company - WebTraveller is a travel portal on the lines of Expedia but with a target market of few small-and-medium-sized enterprises in its region WebTraveller decided to build its web application using Ruby (which is going strong as No.2 on GitHub top languages - https://github.com/languages) For its many static pages and load-balancing, the IT folk at WebTraveller decided to do the time-tested thingy - use Apache HTTPD or Nginix WebTraveller has to interface with travel data providers (airlines, bus company\u0027s et al), payment gateways, advertisers etc. Let us assume a simple idealistic world where all this data comes through superbly designed RESTful interface. So, the IT folk at WebTraveller decided to publish/subscribe to this RESTful external data interface through a Java application WebTraveller uses MySql or Postgres as its database to store user info etc Analytics is important for WebTraveller to run promotions, tune resources per demand/supply and present forecasts/state-of-business to investors - so, as a policy 10% of all IT resources are ear-marked for \u0027analytics\u0027 And as a policy, no more than 5% of IT resources should be consumed by monitoring and management tools - these are overheads and should be kept to a minimum after all IT Resources Now, how much IT resources will WebTraveller require? Since I am the de facto CIO of WebTraveller and it is my first CIO job, I decide to start with a nice whole number - say 100 servers (okay, I hear you, all on cloud). Now, here is the split up of what this 100 servers are going to do\u2026 1 Servers running WebTraveller\u0027s Ruby based dynamic Web-Application 25 2 Servers sourcing WebTraveller\u0027s static page and load-balancer (httpd/nginix) 15 3 Servers running WebTraveller Java interface with its data providers 25 4 WebTraveller database servers 20 5 IT analytics 10 6 IT management/monitoring 5 - Total Servers 100 (How much am I off the mark in these assumptions? If its horrific, then please let me know and I promise to re-do this blog) Quantum of data to collect Being the CIO, I want to understand how my IT is coping. So I need data. Data on server\u0027s utilisation, database metrics, web-server metrics etc. Industry calls these various metrics as KPI - Key Performance Indicators. So KPI it will be. How many KPIs do I need to collect for each type of IT resource? # KPI Type Approximate Number of KPIs per instance Number of Instances (from the above table) Total KPIs to collect 1 Operating system level KPIs - CPU, RAM, open sockets, HDD usage, network card stats etc 5 100 500 2 Ruby web-app KPIs 10 25 250 3 Ruby web-app runs on the Rails server. KPIs that speak Rails health 10 25 250 4 Java web-app KPIs 10 25 250 5 Java web-app\u0027s use a JVM and app-server (JBoss/Glassfish/Tomcat). KPIs that speak Java platform health 10 25 250 6 HTTPD or NGINIX KPIs 10 15 150 7 Database Server KPIs 20 20 400 8 KPIs from the Analytics system (say running Hadoop) 10 10 100 - Total - - 2150 So, the approximate total number of KPIs to collect is 2150. Which is an average of about 21 KPIs to be collected from the 100 servers of WebTraveller. Now, how frequently do we want to collect this data? I as the CIO of WebTraveller want my IT to be really AGILE - which means I don\u0027t want to miss any data (especially in its initial days!). And I also want to keep it SIMPLE. So I ask my monitoring team to collect all these KPIs every minute. The Developer\u0027s View \u0027Mr. Bean\u0027 is a developer in WebTraveller\u0027s IT team. Mr. Bean\u0027s task is cut out - he has to develop the monitoring app that collects 2150 metrics every minute by polling. Being a seasoned developer, he knows for sure that to collect so many KPIs he needs to code a \u0027multi-threaded\u0027 application. So Bean decides to do some estimation. How many threads will his application need to capture 2150 KPIs every minute? First of all, what are the different methods that exist to capture these KPIs from a remote server? Here are the necessary few - JMX to collect from Java applications JDBC to collect from the databases itself RPC/RMI or SSH based log-monitoring mechanism to retrieve data from the Ruby part RPC/RMI or SSH based log-monitoring to retrieve data from HTTPD/NGINX Server level stats through remote SSH Mr. Bean calculates the response-time for various collection methods - # Collection Method Observation Mean time to collect a set of KPIs from one instance Num of servers that can be covered in 1 minute in a single thread 1 SSH SSH involves two types of time - (1) time taken for connection establishment and teardown (2) multiple commands need to be run on the remote shell, data collated and retrieved 15 seconds 60/15 =\u003e 4 servers 2 JMX Multiple JMX attributes can retrieved at once. But here again there are the 2 phases of connection and retrieval 15 seconds 60/15 =\u003e 4 servers 3 JDBC Single JDBC session can get a lot of metrics Assume 20 seconds to retrieve all 20 database server KPIs of one instance 60/20 =\u003e 3 servers 4 RPC or RMI Am not sure if multiple variable can be retrieved in a single session. Assuming its possible... 15 seconds 60/15 =\u003e 4 servers With this understanding, Mr. Bean decides on which collection technology to use for each class of KPIs (in table below). Also, Mr. Bean wants to know the number of threads his application may have to run. Mr. Bean knows that ideally, he would want to do Asynchronous collection for each of these - that is, start a request in Thread-A and retrieve the data from Thread-B when it arrives - there are many libraries that provide such Asynchronous capabilities for each of SSH, RPC, RMI, JMX, JDBC etc. However, Asynchronous communication does not lead to conservative number of threads - a thread gets forked whenever data arrives. For most conservative number of threads, a select-and-poll based method is most appropriate. The big deficiency of select-and-poll approach however is that data collection with time boundaries becomes tougher. There is no guarantee that the above mentioned mean times will always hold good. And also, the data that arrives is distributed wildly on the temporal scale. So, Mr. Bean calculates the number of threads that his application will end-up with if he takes either of the approaches - # KPI Type Data Collection Technology Number of Instances (from the above table) Number of threads for select-and-poll approach Number of threads for asynchronous approach 1 Operating system level KPIs - CPU, RAM, open sockets, HDD usage, network card stats etc SSH 100 (100 servers/4 servers per min per thread) =\u003e 25 threads 100 2 Ruby web-app KPIs RPC or RMI 25 (25/4) =\u003e 6.25 (assuming fractions in num threads is possible!) 25 3 Ruby web-app runs on the Rails server. KPIs that speak Rails health RPC or RMI 25 (25/4) =\u003e 6.25 25 4 Java web-app KPIs JMX 25 (25/4) =\u003e 6.25 25 5 Java web-app\u0027s use a JVM and app-server (JBoss/Glassfish/Tomcat). KPIs that speak Java platform health JMX 25 (25/4) =\u003e 6.25 25 6 HTTPD or NGINIX KPIs SSH 15 (15/4) =\u003e 4 15 7 Database Server KPIs JDBC 20 (20/3) =\u003e 7 20 8 KPIs from the Analytics system (say running Hadoop) SSH 10 (10/4) =\u003e 3 10 Total - - 64 threads 245 threads So the realm of number of threads to gather information from the 100 server deployment at WebTraveller is approximately between 60 to 250 threads. The following factors are pertinent - Usage of async libraries would provide a better temporal distribution and fault-safety With a select-and-poll approach, the 64 threads will be active all the time. With Asynchronous approach 245 threads are forked every minute and they end much before the minute boundary (hopefully) The number of socket descriptors required will have one-to-one correspondence with number of threads (in this case). So 64 sockets will be open at any point of time by the polling approach, while up to 245 sockets could be open at any point of time by the asynchronous approach One can always mix-and-match between polling and asynchronous data collection for different types. For example JMX can be collected in an asynchronous way while SSH can be collected by the polling methods Is there a Data Collection \u0027Challenge\u0027? The numbers say that on average 1 to 2 threads/sockets are required to collect data from each instance. This does not sound much in WebTraveller\u0027s case but one needs to pay attention to the following details - I have assumed that all data points are to be collected per minute. It could very well be that data is required at a much more granular level for certain metrics - say every 5 seconds. In which case, the number of threads and sockets would simply go up 12 times! The average number of KPI per server, at 21, is a super conservative estimate. In most production environments, this number will at least double and generally, much much higher I have not considered the challenge (if there is one) on the persistence side of things - how easy is to to store all this data in a RDBMS (or NoSql!) and design queries for real-time? And these numbers need to be coupled with the natural challenges of data collection, which are - Horizontal scalability Fault tolerance More accurate temporal distribution Tired architecture induces delay in real-time collection and storage With little more scale The situation changes considerably if we consider a data-center with 3000 servers. Even with a linear extrapolation, it would involve collection of about 65,000 data-points every minute. And in excess of 10,000 threads and sockets. With Ganglia In WebTraveller\u0027s case, Mr. Bean could potentially do one other thing. He could use Ganglia to collect the data. Each of the 5 functional groups (from the first table above) in WebTraveller\u0027s data-center could be configured as separate Ganglia clusters. This leads to - The data collector having to communicate with only 5 servers instead of 100 - because Ganglia stores the data collected in each cluster at all the nodes In each cluster, Ganglia collects data for the constituent supporting collection technology (JMX/JDBC etc). Ganglia can use UDP Multicast, thereby only 5 threads are required to collect ALL the data. Well, now, thats one huge optimisation, isn\u0027t it? On the aside however, the Ganglia clients will have to be extended to collect from diverse sources and pre-installed on all servers. Yet, the huge saving in monitoring cost is visible in a straightforward way... Total linear scalability - even if the cluster sizes go up ten times, the load on the management server does not increase at all. The payload from each cluster might go up - but that is not much cost in collection Low granularity polling - with Ganglia, very low granularity polling within each cluster does not increase the load on the monitoring server. The monitoring server can continue to receive data on minute boundaries after all The positive effects of local storage and fault tolerance that a Ganglia based monitoring can provide"
}, {
title: "Real Time Dashboard with Camel, ActiveMQ \u0026 Dojo... On JBoss7 and using JMS \u0026 WebSocket",
url: "https://bharath12345.github.io/posts/real-time-dashboard-with-camel-activemq-dojo-on-jboss-using-jms--websocket/",
date: "Thu 01 August 2013",
category: "posts",
content: "I have built real-time \u0027stock-ticker\u0027 like dashboards. There are many ways to build them. Few months ago I had the opportunity to design one freshly again for an enterprise product. I did a quick sweep at the different technology stacks that can be used to build a highly scalable (design/code and performance scalability) real-time dashboard. There are many technologies for real-time in the browser (like BlazeDS) that are either outdated or on their way out. I came across this very interesting presentation, code and blog by Charles Moulliard which I found to be a very exciting design. So I sat down to extend what Charles had done to suit my usecase. I would recommend this nice book by Apress as a good introduction to the subject of WebSockets. But before getting to the real usecase and seeing why use Camel or ActiveMQ, here is a quick primer to the different techniques one could use to build a real-time dashboard. Table of Contents Primer of Different Techniques 1. Polling Based 2. Stateful and RESTful 3. Comet 4. WebSocket Usecase Design The Why\u0027s? 1. Why Apache Camel? 2. Why ActiveMQ and not Camel\u0027s native JMS implementation? 3. Why WebSocket? 4. Why JBoss7? How to use and results My Conclusion!! Primer of Different Techniques 1. Polling Based Ajax requires a client side request to get data to the browser. So the simplest solution is to buld a client side timer based poller. Maybe use JavaScript timers like setInterval or setTimeout (or wrappers from libraries). Pro Con Simplicity If the data being polled is increasing or is large, continuous degradation in performance is natural as data is fetched and rendered each time. If the \u0027real-time\u0027 SLAs call for changes to be shown quickly ( 2. Stateful and RESTful Maintain \u0027states\u0027 at either server or client to reduce what is queried and transmit size. Actually there are two options, Client side stateful Server side stateful But REST mandates the following 2 constrains - Stateless The client\u2013server communication is further constrained by no client context being stored on the server between requests. Each request from any client contains all of the information necessary to service the request, and any session state is held in the client. Cacheable As on the World Wide Web, clients can cache responses. Responses must therefore, implicitly or explicitly, define themselves as cacheable, or not, to prevent clients reusing stale or inappropriate data in response to further requests. Well-managed caching partially or completely eliminates some client\u2013server interactions, further improving scalability and performance. Am no expert in RESTful design. But I know for sure that many implementations (especially those which have streaming in their name) relax the stateless at server constraint. So, statefulness can go thus - Client-side stateful: Client asks for only the incremental. For example a timestamp based method could be adopted by the client to get the incrementals (by doing so the timestamp becomes the \u0027state\u0027). There are some wonderful JavaScript frameworks that make state maintenance possible. One can use BackboneJS or Dojo\u0027s Observable pattern to build a store in the browser and update the UI only on the incremental changes. Combined with RESTful HTTP APIs on the server-side, one can build robust applications Server-side stateful: Server can respond with only the incremental when a request from the same client arrives. Server side HTTP API\u0027s publish incremental data of different types and filtering. A session handshake or client-subscription is required before the start (server has to maintain state for each client). Pro Con Since only the incremental \u0027delta\u0027 is in transit and re-rendered on the UI, these methods scale in performance. They are well suited for web applications where 3rd party developers could be using your data feed to build user interface or other real-timer services. Maintaining state can quickly become very complex. Multiple types of data, with different incrementals can lead to \u0027cache-mess\u0027. It leads to many many caches and really big caches. User actions like filtering add considerable complexity to the underlying infra. And despite only incrementals being in transit, it is still a request-response system, making tight SLA\u0027s ( 3. Comet Comet, Reverse-Ajax et al. are hacks and not solutions. The idea is that the browser makes an Ajax request to the server, which is kept open until the server has new data to send to the browser. Once the server has the event it wants to send, it sends it on this already open channel. And soon after getting a response the browser initiates a new long polling request in order to obtain subsequent events. Multiple frameworks exist to accomplish the job from both server and client side. But the technology is riddled with bugs, browser incompatibilities and is a total mess. 4. WebSocket Websockets are a new protocol. The protocol specifies for setting up of a full duplex communication channel between client and server on top of HTTP(S). The HTTP header from client side has a \"upgrade\" field set to websocket and \"connection\" field set to upgrade. All modern browsers support this by the new JavaScript API WebSocket(). So the question boils down to - whats the best way to handle these upgrade requests on the server side? There are upcoming frameworks like Atmosphere which interoperate with popular existing server and client frameworks promising easy adoption. Usecase A real time alerts dashboard. In any monitoring/management/analytics system, events go through multiple stages before getting transformed into an alert needing to be displayed for concerned users. Event pipelines come in many types and JMS is not uncommon. The usecase here is of such a system where event processors pick events to evaluate and filter. The SLAs for critical alerts can be very small time-periods depending on the domain. Design The image below shows the 5 components of the implementation of my usecase. The code is posted on GitHub here. 1. AsyncHttpClient: This is just a data feed. In most data-center scenario\u0027s the data-feed to IT management/analytics/monitoring services is separated by a firewall. I use Ning HTTP client - it is based on the superb Jetty NIO2 implementation and works well with JBoss. For the prototype\u0027s sake, I have taken the data itself to be just the HTTP headers. It could be anything from the payload also. And it could be from other type of sources like SNMP etc 2. AsyncHttpServer: Camel provides a Jetty NIO2 based Async Server implementation. I use that to receive the client connections and pick the data (http headers in my case). 3. JMS Broker: I use ActiveMQ. JBoss packages HornetQ natively. But ActiveMQ is by far the most popular JMS broker on planet earth. 4. Multiple JMS Topics: The data receiver can publish the received data into a chosen JMS topic (depending on the data received). The first publish is of Serializable Java POJO. The receiver on this JMS topic picks the POJO, transforms it to JSON and publishes to a different set of JMS topic\u0027s just for JSON (this is not shown in the image below but can be seen in the code). 5. Camel JMS to WebSocket Route: Camel route is used to pick data from the JSON JMS topic and post it to both - WebSocket \u0026 log file - together. A final JSON level transformation can be applied in this stage if need be. 6. JavaScript UI: A JavaScript WebSocket() connects and waits for JSON messages to appear. Received messages are shown in a grid (Dojo\u0027s GridX actually) The Why\u0027s? 1. Why Apache Camel? (1) I wanted to learn Camel (2) Apache Camel is brilliant for plumbing purposes between modules/services within an enterprise product. The number of supported components is dizzying. Despite the heavy sounding ESB word being thrown around with it I have found it quite easy to grasp and it just works like a charm! 2. Why ActiveMQ and not Camel\u0027s native JMS implementation? One of my dear friends, Sumanth, pointed this rather subtle mention on performance aspect in Camel\u0027s JMS page. http://camel.apache.org/jms.html The JMS component reuses Spring 2\u0027s JmsTemplate for sending messages. This is not ideal for use in a non-J2EE container and typically requires some caching in the JMS provider to avoid poor performance.If you intend to use Apache ActiveMQ as your Message Broker - which is a good choice as ActiveMQ rocks\u2026 Further to this, I am slowly developing an aversion to everything Spring. I opine that it is better to avoid Spring in any new development project of scale. And Camel JMS is based on Spring. So better to use ActiveMQ directly. 3. Why WebSocket? Experts in RESTful design like Bill Burke denounce WebSockets sharply. There are others who welcome it anyway. Personally, I like the idea of a full duplex channel on top of HTTP. I dont think WebSockets maybe a good idea for companies and applications to expose there data and services - which exactly is the usecase for RESTful. WebSockets quite beautifully fit within enterprise products/applications where services are consumed internally between modules/known-applications and are deployed in a distributed setup where they cross multiple DMZ. Along with the upcoming draft of HTTP 2.0 which will hopefully support - Binary Connections remain open so long as user stays on the page Multiple open streams Priorities having WebSockets will make HTTP a dependable channel for realtime! 4. Why JBoss7? In the world of open source Java, JBoss is simply the best application container around. I used the Wildfly 8.0 Alpha3 for this prototype How to use and results A \"mvn clean install\" would build the EAR which should be deployed in JBoss 7+ From the JBoss JMX Console, use the firePostRequests() operation to send HTTP client side requests (com.bharath.http.client) The snapshot of the dashboard UI - My Conclusion!! Asynchronous processing by pushing to multiple JMS topic\u0027s when combined with Apache Camel\u0027s routing and WebSocket capabilities can provide for building a truely fast and efficient events/alerts pipeline for a realtime alerts dashboard"
}, {
title: "Java and JVM 7: Slides from a quick talk",
url: "https://bharath12345.github.io/posts/java-and-jvm-7-slides-from-a-quick-talk/",
date: "Wed 31 July 2013",
category: "posts",
content: "Doing Java early in the morning makes for a good day. Got up early today to put together few slides for a talk to developer folk. Not comprehensive. May not be very accurate even. And too much opinionated. If you dont mind my ego you may click here."
}, {
title: "Build Dojo 1.7/1.8/1.9 with Maven",
url: "https://bharath12345.github.io/posts/build-dojo-1819-with-maven/",
date: "Thu 18 July 2013",
category: "posts",
content: "I have been a Dojo user for many years now. Also use many JavaScript libraries (jQuery, backbone, bootstrap, D3, highsoft) all the while but Dojo is what I really love. I would not embark on any \"professional\" development work without being armed with Dojo. But I rest my opinions and comparisons of different JS libraries for a different blog. Here the context is to \"build\" Dojo. After all every professional project should do a build of their JS - compilers like Google Closure can find bugs, obfuscate and eventually make execution faster. Table of Contents Task 1: Installing Dojo in Maven Repository and Unpack Task Task 2: Move Dojo sources Task 3: Build Dojo Task 4: The Dojo Profile Task 5: Clean the Uncompressed JavaScript Task 6: Copy Other JavaScript libraries Task 7: A fast build profile I still am mainly a Java programmer (the enterprise products I have built are predominantly in Java\u2026 time split between Java/JavaScript may be 70/30). So am used to Maven as my primary build tool. And Maven I shall use to build Dojo. Folks who have not tried to build Dojo should probably start-off by reading these two articles - Creating Builds from Dojo documentation Creating custom Dojo builds in Maven The last article is very good but slightly dated. And here is what I propose to add to it - Use dojo v1.9 (v1.8 and v1.7 with AMD should also work perfectly) I use WebStorm as my JavaScript IDE. It has excellent contextual support including that for Dojo. However, it requires Dojo to be at a constant referencable path from where it could index. Once the indexes are built, typing a \".\" after an object should show up the list of methods and variables belonging to that object. This is extremely useful for fast development Dojo builds are slow. A typical build from source download to unzip to compile to build WAR can take anywhere between 5 to 15 minutes. This can be painful and needs to be made faster Now, here is the how\u2026 Task 1: Installing Dojo in Maven Repository and Unpack Task This is no different from the Step 1 \u0026 2 in Mahieu blog. The unzipped sources are placed in src/main/js of my maven hierarchy. I dont do any renaming of this directory. Task 2: Move Dojo sources The unpack task unzips the dojo sources in \"src/main/js/dojo-release-${dojo.version}-src\" directory. This is okay but not good for repeated builds. I would like a structure like shown in the picture below - all my JS libraries under src/main/js. This structure helps in one major way - it helps my WebStorm IDE to index the JS. The Dojo JS are always in \"src/main/js\" alongwith other libraries and WebStorm understands this very well! I use antrun for its ability to run parallel copy tasks - parallelism helps in making the build much faster. And I delete the original unzipped directory at the end. \u003cplugin\u003e \u003cartifactId\u003emaven-antrun-plugin\u003c/artifactId\u003e \u003cexecutions\u003e \u003cexecution\u003e \u003cid\u003eCopy Dojo\u003c/id\u003e \u003cconfiguration\u003e \u003ctasks\u003e \u003cparallel\u003e \u003ccopy todir=\"${js-dir}/\" failonerror=\"false\"\u003e \u003cfileset dir=\"${dojoSrc}\"\u003e \u003cinclude name=\"dijit/\"/\u003e \u003c/fileset\u003e \u003c/copy\u003e \u003ccopy todir=\"${js-dir}/\" failonerror=\"false\"\u003e \u003cfileset dir=\"${dojoSrc}\"\u003e \u003cinclude name=\"dojox/\"/\u003e \u003c/fileset\u003e \u003c/copy\u003e \u003ccopy todir=\"${js-dir}/\" failonerror=\"false\"\u003e \u003cfileset dir=\"${dojoSrc}\"\u003e \u003cinclude name=\"dojo/\"/\u003e \u003c/fileset\u003e \u003c/copy\u003e \u003ccopy todir=\"${js-dir}/\" failonerror=\"false\"\u003e \u003cfileset dir=\"${dojoSrc}\"\u003e \u003cinclude name=\"util/\"/\u003e \u003c/fileset\u003e \u003c/copy\u003e \u003c/parallel\u003e \u003cdelete dir=\"${dojoSrc}\" quiet=\"true\"/\u003e \u003c/tasks\u003e \u003c/configuration\u003e \u003cphase\u003eprocess-sources\u003c/phase\u003e \u003cgoals\u003e \u003cgoal\u003erun\u003c/goal\u003e \u003c/goals\u003e \u003c/execution\u003e \u003c/executions\u003e \u003c/plugin\u003e Task 3: Build Dojo For this again, I use the antrun plugin. This build leads to creation of dojo/dijit/dojox directories under src/main/js. \u003cplugin\u003e \u003cartifactId\u003emaven-antrun-plugin\u003c/artifactId\u003e \u003cexecutions\u003e \u003cexecution\u003e \u003cid\u003eAppsOne dojo ${dojo.version} Custom Build\u003c/id\u003e \u003cphase\u003ecompile\u003c/phase\u003e \u003cconfiguration\u003e \u003ctasks\u003e \u003cparallel\u003e \u003cjava classname=\"org.mozilla.javascript.tools.shell.Main\" fork=\"true\" maxmemory=\"512m\" failonerror=\"false\" classpath=\"${shrinksafe-dir}/js.jar${path.separator}${closure-dir}/compiler.jar${path.separator}${shrinksafe-dir}/shrinksafe.jar\"\u003e \u003carg value=\"${js-dir}/dojo/dojo.js\"/\u003e \u003carg value=\"baseUrl=${js-dir}/dojo\"/\u003e \u003carg value=\"load=build\"/\u003e \u003carg line=\"--profile ${basedir}/dashboard.profile.js\"/\u003e \u003carg value=\"--release\"/\u003e \u003c/java\u003e \u003c/parallel\u003e \u003c/tasks\u003e \u003c/configuration\u003e \u003cgoals\u003e \u003cgoal\u003erun\u003c/goal\u003e \u003c/goals\u003e \u003c/execution\u003e \u003c/executions\u003e \u003c/plugin\u003e Task 4: The Dojo Profile This is the link to the profile script I use. It has a lot of comments for the reader to understand. One can find a lot of options to tune the Dojo build by specifying options in the profile. The profile specifies thus - I name my JS project as \"Dashboard\" - so I want the built artifacts to be in the target/dashboard directory Use the closure compiler I use both dgrid and gridx in my project along with its dependencies (xstyle, dbind, put-selector) - so those have to be included Including my project\u0027s JS - which are present in the \"dashboard\" directory and are AMD complying JS Finally I want to see less verbose prints on my console - so I set the logging level to SEVERE Task 5: Clean the Uncompressed JavaScript Dojo build generates minimized JS. And in the process of doing so it retains the originial JS but renames them to have \"uncompressed\" in their filenames. This is useful for debugging purposes. But surely, we dont want these uncompressed JS to be part of the built WAR. It increases the size of the WAR (at least doubles it - taking it well above 50MB!). So, a task to remove these uncompressed JS from target directory is required. This maven stub does just that - \u003cplugin\u003e \u003cartifactId\u003emaven-clean-plugin\u003c/artifactId\u003e \u003cversion\u003e2.5\u003c/version\u003e \u003cexecutions\u003e \u003cexecution\u003e \u003cid\u003eclean-js\u003c/id\u003e \u003cphase\u003eprepare-package\u003c/phase\u003e \u003cgoals\u003e \u003cgoal\u003eclean\u003c/goal\u003e \u003c/goals\u003e \u003cconfiguration\u003e \u003cfilesets\u003e \u003cfileset\u003e \u003cdirectory\u003e${release-dir}/dojo\u003c/directory\u003e \u003cincludes\u003e \u003cinclude\u003e**/*uncompressed.js\u003c/include\u003e \u003c/includes\u003e \u003cfollowSymlinks\u003etrue\u003c/followSymlinks\u003e \u003c/fileset\u003e \u003c/configuration\u003e \u003c/execution\u003e \u003c/executions\u003e \u003c/plugin\u003e Task 6: Copy Other JavaScript libraries By now, the \"target/dashboard/js\" has all the dojo sources along with project specific built in it. The next task is to copy other JS library dependencies. In my project, I typically use D3, jQuery and jsPlumb. So here is I copy them into this directory into maven\u0027s target by stub\u0027s like these - \u003cplugin\u003e \u003cartifactId\u003emaven-resources-plugin\u003c/artifactId\u003e \u003cversion\u003e2.6\u003c/version\u003e \u003cexecutions\u003e \u003cexecution\u003e \u003cid\u003ecopy-d3\u003c/id\u003e \u003cphase\u003eprocess-resources\u003c/phase\u003e \u003cgoals\u003e \u003cgoal\u003ecopy-resources\u003c/goal\u003e \u003c/goals\u003e \u003cconfiguration\u003e \u003coutputDirectory\u003e${gui.target.gui.location}/js/d3\u003c/outputDirectory\u003e \u003cresources\u003e \u003cresource\u003e \u003cdirectory\u003e${js-dir}/d3\u003c/directory\u003e \u003c/resource\u003e \u003c/resources\u003e \u003c/configuration\u003e \u003c/execution\u003e \u003c/executions\u003e \u003c/plugin\u003e Task 7: A fast build profile Dojo ZIP is upwards of 35MB in size with thousands of files. Downloading, unarchiving and moving it around makes it a heavy duty operation which is painfully slow. This makes a maven profile for faster build absolutely necessary. This profile does the following - Assumes the presence of unarchived dojo bundle in the source tree under \"src/main/js\" It thus does none of the unarchiving or file movements and starts off directly with a closure build Does not delete the dojo/dijit/dojox directories from under src/main/js after the build is complete Readers can refer to this pom.xml from one of my projects on GitHub. It has all that I have described above. Ping me if you run into any issues using my code, understanding my blog or anything else. Thanks for reading!"
}, {
title: "Few days with Apache Cassandra",
url: "https://bharath12345.github.io/posts/few-days-with-apache-cassandra/",
date: "Thu 11 July 2013",
category: "posts",
content: "Few years ago I was a product developer at a big software (but non-database) company. We were writing the v2 of a new product after a fairly successful development round of v1. For everything OLTP, we used the wonderful open-source database - Postgres. But by v2, we had new, hight-volume data like NetFlow coming in. This would have intensely tested Postgres\u0027s scalability and read/write performance. And we had some datawarehousing and OLAP requirements too. A hard look at our queries told us that column-stores would be a great-fit. Looking back, the options for a new product to store and query on massive data volumes boiled down to these few options - Table of Contents A Simple Usecase Data Volumes Fine-grained Data Coarse-grained Data Adding it all up! Before we start data modeling... Data Access methods in Cassandra SuperColumns Denormalization and Data Modeling by Queries Code Itself Data Modeling Keyspace Configuration For JVM Method metrics For JVM wide statistics Column Families in JvmMethodMetrics KEYSPACE Raw Trend Query Tables Trend Query Roll-up Tables TopN Query Tables Column Families in JvmMetricsRaw KEYSPACE Query Code Read/Write Performance Conclusion Reading Recommendations Throw more hardware: Tell the needy customer to invest more in hardware. But no one really knew how much more hardware was really going to nail it Tune, Shard, Rebuild, Redeploy: Invest in tuning our software and database for specific queries. Shard, re-model and/or do whatever that could be done by the development and implementation teams around what we had Use Oracle This did not make good business sense for a big product company - tying itself deep into Oracle CTO and architects did not think Oracle could nail the data volumes anyway (actually none of the engineers who understood the problem thought Oracle would nail it anyway!) Use column-stores like Sybase, Vertica The fact was, there were no open-source, reliable, horizontally scalable column-stores or parallel DBMS to consider. Times have improved. We now have Cassandra, HBase, Hypertable etc (MongoDB, CouchDB etc are document stores with less of modeling - here the context is of schema-full data with rich data-type support). So, I decided to try and understand Cassandra. Wanted to answer the simple question - if I were to re-live the product development scenario described above, would I choose Cassandra? So in this article I talk about my experiment with Cassandra. Here, I choose a very specific use-case to illustrate what I found - Monitoring JVM metrics in a small data center. A Simple Usecase A web company running 50 JVMs. The JVMs could be Apache-Tomcat servlet containers hosting the application Each Tomcat instance hosts 50 URLs and thereby, say, 50 front-ending servlet classes each extending HttpServlet Method metrics are collected on these servlets (through logs or bytecode instrumentation or aspect-driven). Specifically, the metrics collected - number of invocations and time-spent - just 2 method level metrics! Idea is to analyze the metrics to get insights into - how to deploy the servets servers? Are there any hotspots and, if so, where - which URL (object) is being accessed most/least? at what times? trends? and so on\u2026 Along with monitoring these specific servlet method\u0027s also keep a tab on overall application health. The number of active-threads in all JVM\u0027s. Various JVM memory parameters. A few MBean stat\u0027s. Etc\u2026 Minimum data view granularity requirements - Last 30 days - per-minute, per-hour, per-day, per-week, per-month Last 60 days - per-hour, per-day, per-week, per-month Last 180 days - per-day, per-week, per-month Last 360 days - per-week (52 weeks), per-month Last 720 days - per-month (24 months) User primarily requires \u0027trend\u0027 and \u0027topN\u0027 charts. Examples - Chart of Top-10 most invoked servlets in last 2 months at per-hour granularity Trend of three specific servlet\u0027s response-times {max, min, avg, 1st and 3rd quartile} over last 6 months plotted per day User also wants JVM wide statistics like - active threads, memory stats and datasource stats - all following the same granularities as above. Lets suppose that these combine to 6 separate metrics in all. From the querying perspective, lets say we have only 2 users in our IT Operations team who will be actively querying this data. Data Volumes Fine-grained Data JVM Method data: 50 JVMs * 50 Methods * 24 Hours in a day * 60 minutes per hour * 2 metric-types = 7.2 Million data-points per day. 7.2 Million * 30 = 216 Million data points per month JVM-wide stats: 50 JVMs * 24 Hours * 60 minutes * 6 metric-types = 432K data points per day 432K * 30 = 12.96 Million per month Coarse-grained Data This corresponds to roll-ups. Hourly, Daily, Weekly and Monthly. Hourly rollup for last 60 days JVM method data: 50 JVMs * 50 Methods * 24 Hours * 60 days * 2 metric-types = 7.2 Million data points over last 60 days. Or, 120K data points per day JVM-wide stats: 50 JVMs * 24 Hours * 60 days * 6 metric-types = 432K data points over last 60 days. Or, 7.2K data points per day Daily rollup for last 180 days JVM Method data: 50 JVMs * 50 Methods * 180 days * 2 metric-types = 900K data points in 180 days. Or, 5K data points per day JVM-wide stats: 50 JVMs * 180 days * 6 metric-types = 54K data points in 180 days. Or, 300 data points per day Weekly rollup for last 52 weeks JVM Method data: 50 JVMs * 50 Methods * 52 weeks * 2 metric-types = 260K data points over last 52 weeks. Or, 5K data points per week. Or, 700 data points per day JVM-wide stats: 50 JVMs * 52 weeks * 6 metric-types = 15.6K data points over last 52 weeks. Or, 300 data points per week. Or, 40 data points per day Monthly rollup for last 24 months JVM Method data: 50 JVMs * 50 Methods * 24 months * 2 metric-types = 120K data points for last 24 months. Or, 5K data points per month. Or, 170 data points per day JVM-wide stats: 50 JVMs * 30 days * 6 metric-types = 9000 data points per month. Or, 300 data points per month. Or 10 data points per day Adding it all up! Number of data points collected PER DAY - JVM Method data: Fine grained minute data points = 7.2 Million Hourly rollup = 120K Daily rollup = 5K Weekly rollup = 700 Monthly rollup = 170 Total (approx) = 7.32 Million JVM-wide stats: Fine grained minute data points = 432K Hourly rollup = 7.2K Daily rollup = 300 Weekly rollup = 40 Monthly rollup = 10 Total (approx) = 440K Total of totals = 7.76 Million data points per day. Or, 320K data points per hour. Or, 5500 data points per minute. Or 90 data-points per second There are couple of VERY IMPORTANT things to realize before going further - In the DBMS world, multiple data points can fit into a single row. So, 90 data-points per second translates to fewer than 90 row inserts per second. But how fewer depends on the data modeling The temporal distribution of inserts is not even. The hourly roll-up kicks in at the end of each hour. Daily roll-up at the end-of-day and so on (not considering the timezone adjustments required for roll-ups) Small-data problem? Its just a prototype!! Before we start data modeling... Data Access methods in Cassandra Predominantly, there are three ways to interact with Cassandra - Hector, Astyanax and CQL. Cassandra supports Thrift by providing an API. Hector and Astyanax use the Thrift API to talk to the DBMS. CQL3 proposes a new SQL like API. This slidedeck has CQL3 performance vis-a-vis Thrift-API by the main committer of this piece - Eric Evans. Take your pick! In this prototype, I use CQL3. SuperColumns Recent articles and blogs suggest that supercolumns are a bad design and will go away in future releases of Cassandra. So I use composite keys and not supercolumns to model the data Denormalization and Data Modeling by Queries One of the central ideas in column-stores is to model data per the queries expected. Also denormalize, that is, store multiple replicas of data if required. Both these ideas have strong theoratical backing. Let me state just two - DB schema per query requirements - One of the gurus of database design, Professor Stonebraker has suggested that in enterprise applications OLTP queries are well known in advance, few in number, and do not change often. Refer to this paper. Denormalization - RDBMS belongs to the era when storage was expensive. Its not so anymore. CPUs are far more expensive (in both ways - CapEx and OpEx ). And DB queries take CPU cycles. And a waiting user could have tangible/intangile revenue implications of web companies. All put together, model database sparsely and denormalized. Store multiple versions and replicas of data. Do anything to make queries faster! Code Itself The JBoss7 based implementation of this prototype can be found in my github repository. You will find a couple of MBean\u0027s - JvmMethodMetricsDAO and JvmMethodIdNameDAO which have the persist() and find() methods. The procedure to use this is - Build the artifact using maven - \u0027mvn clean install\u0027 at the top level directory Deploy the jim-ear.ear in JBoss\u0027s standalone/deployments Start JBoss\u0027s jconsole and you should be able to see these MBean\u0027s in the jconsole\u0027s UI Data Modeling Here are few of the broad guidelines I set and followed - One Keyspace each for both types of data (JVM methods and JVM-wide stats). Each keyspace holds raw (fine grained) and roll-up data As few strings as possible in the stores Keep row-key and columm-key string names small Many data items like JVM_ID will need a mapping table to map JVM-Name to a UUID Row Key - For fine grained, minutely data, row key is a combination of JVM_ID and date (20130628 for 28th June 2013) All roll-up tables have JVM_ID as the row key Columns for roll-up data Hourly Roll-up: 60 days, 2 months =\u003e 24 * 60 = 1440 columns Daily Roll-up: 180 days, 6 months =\u003e 180 columns Weekly Roll-up: 350 days, 50 weeks =\u003e 50 columns Monthly Roll-up: 720 days, 24 months =\u003e 24 columns Cassandra has this superb concept of tombstones and data cleanup. This can be triggered by setting a TTL field during inserts. TTL is set in seconds and I used the following setting in this prototype - Raw: 30 days =\u003e 30 * 24 * 60 * 60 =\u003e 2,592,000 Hourly Roll-up: 60 days =\u003e 2 * 2,592,000 =\u003e 5,184,000 Daily Roll-up: 180 days =\u003e 3 * 5,184,000 =\u003e 15,552,000 Weekly Roll-up: 350 days =\u003e 350 * 24 * 60 * 60 =\u003e 30,240,000 Monthly Roll-up: 720 days =\u003e 4 * 15,552,000 =\u003e 62,208,000 Keyspace Configuration For JVM Method metrics CREATE KEYSPACE JvmMethodMetrics WITH replication = {\u0027class\u0027: \u0027SimpleStrategy\u0027, \u0027replication_factor\u0027 : 1}; For JVM wide statistics CREATE KEYSPACE JvmMetrics WITH replication = {\u0027class\u0027: \u0027SimpleStrategy\u0027, \u0027replication_factor\u0027 : 1}; Column Families in JvmMethodMetrics KEYSPACE Raw Trend Query Tables CREATE TABLE JvmMethodIdNameMap ( jvm_id int, method_id int, method_name varchar, PRIMARY KEY (jvm_id) ); CREATE INDEX jvm_method_name ON JvmMethodIdNameMap (method_name); CREATE TABLE JvmMethodMetricsRaw ( jvm_id int, date varchar, day_time int, method_id int, invocations bigint, response_time float, PRIMARY KEY (jvm_id, date) ); CREATE INDEX jvm_method_id ON JvmMethodMetricsRaw (method_id); Trend Query Roll-up Tables CREATE TABLE JvmMethodMetricsHourly ( jvm_id int, hour int, method_id bigint, invocations bigint, response_time float, PRIMARY KEY (jvm_id) ); CREATE TABLE JvmMethodMetricsDaily ( jvm_id int, day int, method_id bigint, invocations bigint, response_time float, PRIMARY KEY (jvm_id) ); CREATE TABLE JvmMethodMetricsWeekly ( jvm_id int, week int, method_id bigint, invocations bigint, response_time float, PRIMARY KEY (jvm_id) ); CREATE TABLE JvmMethodMetricsMonthly ( jvm_id int, month int, method_id bigint, invocations bigint, response_time float, PRIMARY KEY (jvm_id) ); TopN Query Tables Data in these tables is kept sorted by maximum (response-time/invocations) to minimum CREATE TABLE JvmMethodTopNHourly ( jvm_id int, hour int, method_id_type varchar, // Example: 100_RT =\u003e for method 100 response-time, 103_INV =\u003e for method 103 invocation count response_time_map map\u003ctext, float\u003e, invocation_count_map map\u003ctext, long\u003e, PRIMARY KEY (jvm_id, hour) ); CREATE TABLE JvmMethodTopNDaily ( jvm_id int, day int, method_id_type varchar, response_time_map map\u003ctext, float\u003e, invocation_count_map map\u003ctext, long\u003e, PRIMARY KEY (jvm_id, hour) ); CREATE TABLE JvmMethodTopNWeekly ( jvm_id int, week int, method_id_type varchar, response_time_map map\u003ctext, float\u003e, invocation_count_map map\u003ctext, long\u003e, PRIMARY KEY (jvm_id, hour) ); CREATE TABLE JvmMethodTopNMonthly ( jvm_id int, month int, method_id_type varchar, response_time_map map\u003ctext, float\u003e, invocation_count_map map\u003ctext, long\u003e, PRIMARY KEY (jvm_id, hour) ); Column Families in JvmMetricsRaw KEYSPACE CREATE TABLE JvmMetricsRaw ( jvm_id int, date varchar, day_time int, total_live_threads int, mem_heap set\u003cbigint\u003e, // 3 data points - commited, max, used mem_nonheap set\u003cbigint\u003e, ds_freepool map\u003cint, bigint\u003e, // key is datasource_id, free pool of ds_usetime map\u003cint, bigint\u003e // threads, avg query time over 1 min PRIMARY KEY (jvm_id, date) ); Query Code CQL3 packs a QueryBuilder utility that offers some basic features. Refer to the QueryBuild JavaDocs for more info. I was able to build simple queries for \u0027select\u0027 using different \u0027where\u0027 clauses for time and ID\u0027s without much effort. I would recommend users to extend Cassandra\u0027s QueryBuilder in their DAO layer to provide model specific functionality and catch errors. The prototype offers a Entity/DAO model which can be easily understood by those familiar with JPA/Hibernate. (However I am not a fan of the many ORM frameworks that are coming up for Cassandra - the knowledge of \u0027entity\u0027 modeling is critical for performance problems which Cassandra proposes to handle. Using a Cassandra ORM framework would mean lesser knowlege of data model and consequently less performant queries. Stay away from them!) Read/Write Performance Post modeling and unit testing I ran the application on my laptop (MacBookPro 2.9GHz/8GB RAM). Since my laptop is not an ideal performance test environment (I have multiple applications running, no tuning of cassandra or JBoss) I see no point in publishing the numbers or charts. However, I was able to \u0027write\u0027 literally millions of records per minute and read them back. Since I run MySql as well on my laptop, one thing I can vouch for is that Cassandra\u0027s write performance is definitely far ahead of what I would have expected from my OOTB MySql. Conclusion Cassandra has come a long way from the 0.8 days. I did not come across any bugs working on my prototype. CQL3 and data modeling was a breeze. And there are a plethora of resources on this topic on the web. I would certainly recommend Cassandra for those looking to get a quick hang of NoSql and Column stores. If you are planning to use Cassandra as part of your application and have done the due deligence on the performance side, then, let me assure you - programming with Cassandra should not take any more time than using a ORM framework like JPA/Hibernate. And if you are like me, wanting to write a prototype then you should be able to wrap it all up from zero to running in a single working week. Ping me if you run into any issues using my code, understanding my blog or anything else. Thanks for reading! Reading Recommendations Good introduction on the subject - O\u0027Reilly\u0027s Cassandra Definitive Guide, Data Modeling - this wonderful blog by Jay Patel from Ebay Performance comparisons - this article really nails it (pay attention to the chart!)"
}, {
title: "The Visual Display of Quantitative Information",
url: "https://bharath12345.github.io/posts/the-visual-display-of-quantitative-information/",
date: "Wed 10 July 2013",
category: "posts",
content: "For the last couple of years I have been in search of theories in Data Visualization. Educate myself on the fundamentals. My search has taken me to many books and blogs. But none as remarkable as Edward Tufte book seminal work on the subject. This is a short refresher of the core concepts. Even as I write for myself, it may be of some use to a passing busy programmer. Graphical Excellence: that which gives a viewer maximum ideas in shortest time with least ink in the smallest space Graphical excellence is nearly always multivariate. Charts depicting behavior of two variables with respect to each other are always more insightful than simple time-series or progression graphs \u0027Graphical Integrity\u0027 reigns supreme. Beware of distortions. Thre representation of numbers as physically measured on the surface of the graphic itself should be directly proportional to the numerical quantities represented (as an aside, this book might be a good read on distortions!) The number of information carrying (variable) dimensions depicted should not exceed the number of dimensions in the data. Beware of area charts depicting single variable variations Maximize the data-ink ratio, within reason. Erase non-data-ink, within reason. Revise. Rethink. Moire vibrations in statistical charts are chartjunk. Gridlines, often, are chartjunk. 3D, often, is chartjunk. More than 3 colors are, often, chartjunk. Piecharts are always chartjunk. Easy graphing software is leading to more chartjunk and more amazing chartjunk Awesome examples of clarity by revision - redesigning boxplots, barcharts and my personal favorite - the super intuitive dot-dash plot combining marginal distribution with a bivariate distribution! Use coordinates and axes with thought - maximize data-ink Organize and order the flow of graphical information presented to the eye - charts should intelligently use what are known facts on cognitive abilities of human brain Balance and optimize data-density = (number of data entries)/(area of the graphic). Try to maximize it. Else shrink the graphic Pay attention to line weights Curious case of the Golden Rectangle And, finally, it was John Tukey who once said - there is no data that can be displayed in a pie chart, that cannot be displayed BETTER in some other type of chart."
}];
// Simple search function
function performSearch(query) {
query = query.toLowerCase().trim();
if (query.length < 2) {
document.getElementById('search-results').innerHTML = '<p class="text-muted">Please enter at least 2 characters to search.</p>';
return;
}
var results = [];
for (var i = 0; i < searchIndex.length; i++) {
var article = searchIndex[i];
var titleMatch = article.title.toLowerCase().indexOf(query) !== -1;
var contentMatch = article.content.toLowerCase().indexOf(query) !== -1;
if (titleMatch || contentMatch) {
// Extract snippet around the match
var snippet = '';
if (contentMatch) {
var contentLower = article.content.toLowerCase();
var matchIndex = contentLower.indexOf(query);
var start = Math.max(0, matchIndex - 100);
var end = Math.min(article.content.length, matchIndex + query.length + 100);
snippet = '...' + article.content.substring(start, end) + '...';
} else {
snippet = article.content.substring(0, 200) + '...';
}
results.push({
article: article,
snippet: snippet,
titleMatch: titleMatch
});
}
}
// Sort results (title matches first)
results.sort(function(a, b) {
if (a.titleMatch && !b.titleMatch) return -1;
if (!a.titleMatch && b.titleMatch) return 1;
return 0;
});
// Display results
var resultsHtml = '';
if (results.length === 0) {
resultsHtml = '<p class="text-muted">No results found for "' + query + '".</p>';
} else {
resultsHtml = '<p class="mb-3"><strong>' + results.length + ' result' + (results.length > 1 ? 's' : '') + ' found</strong></p>';
resultsHtml += '<ul class="listing">';
for (var i = 0; i < results.length; i++) {
var result = results[i];
resultsHtml += '<li>';
resultsHtml += '<h3><a href="' + result.article.url + '">' + result.article.title + '</a></h3>';
resultsHtml += '<section class="byline">' + result.article.date + ' | ' + result.article.category + '</section>';
resultsHtml += '<p>' + result.snippet + '</p>';
resultsHtml += '<hr>';
resultsHtml += '</li>';
}
resultsHtml += '</ul>';
}
document.getElementById('search-results').innerHTML = resultsHtml;
}
// Set up search input handler
document.addEventListener('DOMContentLoaded', function() {
var searchInput = document.getElementById('search-input');
var searchTimeout = null;
searchInput.addEventListener('input', function() {
clearTimeout(searchTimeout);
searchTimeout = setTimeout(function() {
performSearch(searchInput.value);
}, 300);
});
// Check if there's a query parameter
var urlParams = new URLSearchParams(window.location.search);
var query = urlParams.get('q');
if (query) {
searchInput.value = query;
performSearch(query);
}
});
</script>
</body>
</html>